bio-workflows-scrnaseq-pipeline
SKILL.md
Single-Cell RNA-seq Pipeline
Complete workflow from 10X Genomics Cell Ranger output to annotated cell types.
Workflow Overview
10X data (filtered_feature_bc_matrix)
|
v
[1. Load Data] ---------> Read10X / read_10x_h5
|
v
[2. QC Filtering] ------> nFeature, percent.mt, doublets
|
v
[3. Normalization] -----> SCTransform or LogNormalize
|
v
[4. HVG Selection] -----> FindVariableFeatures
|
v
[5. Dim Reduction] -----> PCA → UMAP
|
v
[6. Clustering] --------> FindNeighbors → FindClusters
|
v
[7. Markers] -----------> FindAllMarkers
|
v
[8. Annotation] --------> Manual or automated
|
v
Annotated Seurat/AnnData object
Primary Path: Seurat (R)
Step 1: Load 10X Data
library(Seurat)
library(ggplot2)
library(dplyr)
# Load from Cell Ranger output
data_dir <- 'cellranger_output/filtered_feature_bc_matrix'
counts <- Read10X(data.dir = data_dir)
# Create Seurat object
seurat_obj <- CreateSeuratObject(counts = counts, project = 'my_project',
min.cells = 3, min.features = 200)
Step 2: Quality Control
# Calculate QC metrics
seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-')
seurat_obj[['percent.ribo']] <- PercentageFeatureSet(seurat_obj, pattern = '^RP[SL]')
# Visualize QC metrics
VlnPlot(seurat_obj, features = c('nFeature_RNA', 'nCount_RNA', 'percent.mt'), ncol = 3)
# Filter cells
seurat_obj <- subset(seurat_obj,
nFeature_RNA > 200 &
nFeature_RNA < 5000 &
percent.mt < 20 &
nCount_RNA > 500)
cat('Cells after QC:', ncol(seurat_obj), '\n')
QC Checkpoint 1: Review QC plots
- Remove cells with very low/high gene counts
- Remove cells with high mitochondrial content (dying cells)
Step 3: Doublet Detection
library(scDblFinder)
# Convert to SCE for scDblFinder
sce <- as.SingleCellExperiment(seurat_obj)
sce <- scDblFinder(sce)
# Add back to Seurat
seurat_obj$doublet_class <- sce$scDblFinder.class
seurat_obj$doublet_score <- sce$scDblFinder.score
# Remove doublets
seurat_obj <- subset(seurat_obj, doublet_class == 'singlet')
cat('Cells after doublet removal:', ncol(seurat_obj), '\n')
Step 4: Normalization with SCTransform
# SCTransform (recommended for most analyses)
seurat_obj <- SCTransform(seurat_obj, vars.to.regress = 'percent.mt', verbose = FALSE)
Alternative: Standard normalization
seurat_obj <- NormalizeData(seurat_obj)
seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = 'vst', nfeatures = 2000)
seurat_obj <- ScaleData(seurat_obj, vars.to.regress = 'percent.mt')
Step 5: Dimensionality Reduction
# PCA
seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = FALSE)
# Determine optimal PCs
ElbowPlot(seurat_obj, ndims = 50)
# UMAP
n_pcs <- 30 # Choose based on elbow plot
seurat_obj <- RunUMAP(seurat_obj, dims = 1:n_pcs, verbose = FALSE)
Step 6: Clustering
# Find neighbors
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:n_pcs, verbose = FALSE)
# Find clusters (try multiple resolutions)
seurat_obj <- FindClusters(seurat_obj, resolution = c(0.2, 0.4, 0.6, 0.8, 1.0), verbose = FALSE)
# Visualize
DimPlot(seurat_obj, reduction = 'umap', group.by = 'SCT_snn_res.0.4', label = TRUE)
QC Checkpoint 2: Assess clustering
- Clusters should be visually separable on UMAP
- Resolution 0.4-0.8 is often appropriate
Step 7: Find Marker Genes
# Set identity to chosen resolution
Idents(seurat_obj) <- 'SCT_snn_res.0.4'
# Find markers for all clusters
markers <- FindAllMarkers(seurat_obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
# Top markers per cluster
top_markers <- markers %>%
group_by(cluster) %>%
slice_max(n = 10, order_by = avg_log2FC)
# Visualize top markers
DoHeatmap(seurat_obj, features = top_markers$gene) + NoLegend()
Step 8: Cell Type Annotation
# Manual annotation based on known markers
# Example for PBMC data:
cluster_annotations <- c(
'0' = 'CD4 T cells',
'1' = 'CD14 Monocytes',
'2' = 'B cells',
'3' = 'CD8 T cells',
'4' = 'NK cells',
'5' = 'CD16 Monocytes',
'6' = 'Dendritic cells'
)
seurat_obj$cell_type <- cluster_annotations[as.character(Idents(seurat_obj))]
# Final UMAP
DimPlot(seurat_obj, reduction = 'umap', group.by = 'cell_type', label = TRUE)
# Save object
saveRDS(seurat_obj, 'seurat_annotated.rds')
Alternative Path: Scanpy (Python)
import scanpy as sc
import numpy as np
# Load 10X data
adata = sc.read_10x_h5('filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()
# QC metrics
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# Filter
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata = adata[adata.obs.n_genes_by_counts < 5000, :]
adata = adata[adata.obs.pct_counts_mt < 20, :]
# Doublet detection
sc.pp.scrublet(adata)
adata = adata[~adata.obs['predicted_doublet'], :]
# Normalize and HVGs
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
# PCA, neighbors, UMAP
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata)
# Clustering
sc.tl.leiden(adata, resolution=0.5)
# Markers
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=10, sharey=False)
# Save
adata.write('scanpy_annotated.h5ad')
Parameter Recommendations
| Step | Parameter | Recommendation |
|---|---|---|
| QC | min.features | 200-500 |
| QC | max.features | 2500-5000 (depends on data) |
| QC | percent.mt | <10-20% |
| SCTransform | vars.to.regress | percent.mt |
| PCA | npcs | 30-50 |
| UMAP | dims | 15-30 (check elbow plot) |
| Clustering | resolution | 0.4-0.8 (start with 0.5) |
Troubleshooting
| Issue | Likely Cause | Solution |
|---|---|---|
| All cells filtered | QC too strict | Relax thresholds |
| Poor UMAP separation | Too few HVGs or PCs | Increase nfeatures, check n_pcs |
| Too many/few clusters | Wrong resolution | Adjust resolution parameter |
| Unknown cell types | Missing markers | Check known marker genes manually |
Complete R Workflow
library(Seurat)
library(scDblFinder)
library(ggplot2)
library(dplyr)
# Configuration
data_dir <- 'filtered_feature_bc_matrix'
output_dir <- 'results'
dir.create(output_dir, showWarnings = FALSE)
# Load
counts <- Read10X(data.dir = data_dir)
seurat_obj <- CreateSeuratObject(counts = counts, min.cells = 3, min.features = 200)
cat('Initial cells:', ncol(seurat_obj), '\n')
# QC
seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-')
seurat_obj <- subset(seurat_obj, nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20)
cat('After QC:', ncol(seurat_obj), '\n')
# Doublets
sce <- as.SingleCellExperiment(seurat_obj)
sce <- scDblFinder(sce)
seurat_obj$doublet <- sce$scDblFinder.class
seurat_obj <- subset(seurat_obj, doublet == 'singlet')
cat('After doublet removal:', ncol(seurat_obj), '\n')
# Normalize
seurat_obj <- SCTransform(seurat_obj, vars.to.regress = 'percent.mt', verbose = FALSE)
# Dimension reduction
seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = FALSE)
seurat_obj <- RunUMAP(seurat_obj, dims = 1:30, verbose = FALSE)
# Cluster
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:30, verbose = FALSE)
seurat_obj <- FindClusters(seurat_obj, resolution = 0.5, verbose = FALSE)
# Markers
markers <- FindAllMarkers(seurat_obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
write.csv(markers, file.path(output_dir, 'markers.csv'))
# Save
saveRDS(seurat_obj, file.path(output_dir, 'seurat_object.rds'))
# Plots
pdf(file.path(output_dir, 'umap.pdf'), width = 10, height = 8)
DimPlot(seurat_obj, reduction = 'umap', label = TRUE)
dev.off()
cat('Pipeline complete. Object saved to:', output_dir, '\n')
Related Skills
- single-cell/data-io - Loading different formats
- single-cell/preprocessing - QC details
- single-cell/doublet-detection - Doublet methods comparison
- single-cell/clustering - Clustering parameters
- single-cell/markers-annotation - Annotation strategies
- single-cell/multimodal-integration - CITE-seq, multiome
Weekly Installs
3
Repository
gptomics/bioskillsInstalled on
windsurf2
trae2
opencode2
codex2
claude-code2
antigravity2