skills/gptomics/bioskills/bio-workflows-scrnaseq-pipeline

bio-workflows-scrnaseq-pipeline

SKILL.md

Single-Cell RNA-seq Pipeline

Complete workflow from 10X Genomics Cell Ranger output to annotated cell types.

Workflow Overview

10X data (filtered_feature_bc_matrix)
    |
    v
[1. Load Data] ---------> Read10X / read_10x_h5
    |
    v
[2. QC Filtering] ------> nFeature, percent.mt, doublets
    |
    v
[3. Normalization] -----> SCTransform or LogNormalize
    |
    v
[4. HVG Selection] -----> FindVariableFeatures
    |
    v
[5. Dim Reduction] -----> PCA → UMAP
    |
    v
[6. Clustering] --------> FindNeighbors → FindClusters
    |
    v
[7. Markers] -----------> FindAllMarkers
    |
    v
[8. Annotation] --------> Manual or automated
    |
    v
Annotated Seurat/AnnData object

Primary Path: Seurat (R)

Step 1: Load 10X Data

library(Seurat)
library(ggplot2)
library(dplyr)

# Load from Cell Ranger output
data_dir <- 'cellranger_output/filtered_feature_bc_matrix'
counts <- Read10X(data.dir = data_dir)

# Create Seurat object
seurat_obj <- CreateSeuratObject(counts = counts, project = 'my_project',
                                  min.cells = 3, min.features = 200)

Step 2: Quality Control

# Calculate QC metrics
seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-')
seurat_obj[['percent.ribo']] <- PercentageFeatureSet(seurat_obj, pattern = '^RP[SL]')

# Visualize QC metrics
VlnPlot(seurat_obj, features = c('nFeature_RNA', 'nCount_RNA', 'percent.mt'), ncol = 3)

# Filter cells
seurat_obj <- subset(seurat_obj,
                     nFeature_RNA > 200 &
                     nFeature_RNA < 5000 &
                     percent.mt < 20 &
                     nCount_RNA > 500)

cat('Cells after QC:', ncol(seurat_obj), '\n')

QC Checkpoint 1: Review QC plots

  • Remove cells with very low/high gene counts
  • Remove cells with high mitochondrial content (dying cells)

Step 3: Doublet Detection

library(scDblFinder)

# Convert to SCE for scDblFinder
sce <- as.SingleCellExperiment(seurat_obj)
sce <- scDblFinder(sce)

# Add back to Seurat
seurat_obj$doublet_class <- sce$scDblFinder.class
seurat_obj$doublet_score <- sce$scDblFinder.score

# Remove doublets
seurat_obj <- subset(seurat_obj, doublet_class == 'singlet')
cat('Cells after doublet removal:', ncol(seurat_obj), '\n')

Step 4: Normalization with SCTransform

# SCTransform (recommended for most analyses)
seurat_obj <- SCTransform(seurat_obj, vars.to.regress = 'percent.mt', verbose = FALSE)

Alternative: Standard normalization

seurat_obj <- NormalizeData(seurat_obj)
seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = 'vst', nfeatures = 2000)
seurat_obj <- ScaleData(seurat_obj, vars.to.regress = 'percent.mt')

Step 5: Dimensionality Reduction

# PCA
seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = FALSE)

# Determine optimal PCs
ElbowPlot(seurat_obj, ndims = 50)

# UMAP
n_pcs <- 30  # Choose based on elbow plot
seurat_obj <- RunUMAP(seurat_obj, dims = 1:n_pcs, verbose = FALSE)

Step 6: Clustering

# Find neighbors
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:n_pcs, verbose = FALSE)

# Find clusters (try multiple resolutions)
seurat_obj <- FindClusters(seurat_obj, resolution = c(0.2, 0.4, 0.6, 0.8, 1.0), verbose = FALSE)

# Visualize
DimPlot(seurat_obj, reduction = 'umap', group.by = 'SCT_snn_res.0.4', label = TRUE)

QC Checkpoint 2: Assess clustering

  • Clusters should be visually separable on UMAP
  • Resolution 0.4-0.8 is often appropriate

Step 7: Find Marker Genes

# Set identity to chosen resolution
Idents(seurat_obj) <- 'SCT_snn_res.0.4'

# Find markers for all clusters
markers <- FindAllMarkers(seurat_obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

# Top markers per cluster
top_markers <- markers %>%
    group_by(cluster) %>%
    slice_max(n = 10, order_by = avg_log2FC)

# Visualize top markers
DoHeatmap(seurat_obj, features = top_markers$gene) + NoLegend()

Step 8: Cell Type Annotation

# Manual annotation based on known markers
# Example for PBMC data:
cluster_annotations <- c(
    '0' = 'CD4 T cells',
    '1' = 'CD14 Monocytes',
    '2' = 'B cells',
    '3' = 'CD8 T cells',
    '4' = 'NK cells',
    '5' = 'CD16 Monocytes',
    '6' = 'Dendritic cells'
)

seurat_obj$cell_type <- cluster_annotations[as.character(Idents(seurat_obj))]

# Final UMAP
DimPlot(seurat_obj, reduction = 'umap', group.by = 'cell_type', label = TRUE)

# Save object
saveRDS(seurat_obj, 'seurat_annotated.rds')

Alternative Path: Scanpy (Python)

import scanpy as sc
import numpy as np

# Load 10X data
adata = sc.read_10x_h5('filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()

# QC metrics
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Filter
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata = adata[adata.obs.n_genes_by_counts < 5000, :]
adata = adata[adata.obs.pct_counts_mt < 20, :]

# Doublet detection
sc.pp.scrublet(adata)
adata = adata[~adata.obs['predicted_doublet'], :]

# Normalize and HVGs
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# PCA, neighbors, UMAP
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata)

# Clustering
sc.tl.leiden(adata, resolution=0.5)

# Markers
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=10, sharey=False)

# Save
adata.write('scanpy_annotated.h5ad')

Parameter Recommendations

Step Parameter Recommendation
QC min.features 200-500
QC max.features 2500-5000 (depends on data)
QC percent.mt <10-20%
SCTransform vars.to.regress percent.mt
PCA npcs 30-50
UMAP dims 15-30 (check elbow plot)
Clustering resolution 0.4-0.8 (start with 0.5)

Troubleshooting

Issue Likely Cause Solution
All cells filtered QC too strict Relax thresholds
Poor UMAP separation Too few HVGs or PCs Increase nfeatures, check n_pcs
Too many/few clusters Wrong resolution Adjust resolution parameter
Unknown cell types Missing markers Check known marker genes manually

Complete R Workflow

library(Seurat)
library(scDblFinder)
library(ggplot2)
library(dplyr)

# Configuration
data_dir <- 'filtered_feature_bc_matrix'
output_dir <- 'results'
dir.create(output_dir, showWarnings = FALSE)

# Load
counts <- Read10X(data.dir = data_dir)
seurat_obj <- CreateSeuratObject(counts = counts, min.cells = 3, min.features = 200)
cat('Initial cells:', ncol(seurat_obj), '\n')

# QC
seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-')
seurat_obj <- subset(seurat_obj, nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20)
cat('After QC:', ncol(seurat_obj), '\n')

# Doublets
sce <- as.SingleCellExperiment(seurat_obj)
sce <- scDblFinder(sce)
seurat_obj$doublet <- sce$scDblFinder.class
seurat_obj <- subset(seurat_obj, doublet == 'singlet')
cat('After doublet removal:', ncol(seurat_obj), '\n')

# Normalize
seurat_obj <- SCTransform(seurat_obj, vars.to.regress = 'percent.mt', verbose = FALSE)

# Dimension reduction
seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = FALSE)
seurat_obj <- RunUMAP(seurat_obj, dims = 1:30, verbose = FALSE)

# Cluster
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:30, verbose = FALSE)
seurat_obj <- FindClusters(seurat_obj, resolution = 0.5, verbose = FALSE)

# Markers
markers <- FindAllMarkers(seurat_obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
write.csv(markers, file.path(output_dir, 'markers.csv'))

# Save
saveRDS(seurat_obj, file.path(output_dir, 'seurat_object.rds'))

# Plots
pdf(file.path(output_dir, 'umap.pdf'), width = 10, height = 8)
DimPlot(seurat_obj, reduction = 'umap', label = TRUE)
dev.off()

cat('Pipeline complete. Object saved to:', output_dir, '\n')

Related Skills

  • single-cell/data-io - Loading different formats
  • single-cell/preprocessing - QC details
  • single-cell/doublet-detection - Doublet methods comparison
  • single-cell/clustering - Clustering parameters
  • single-cell/markers-annotation - Annotation strategies
  • single-cell/multimodal-integration - CITE-seq, multiome
Weekly Installs
3
Installed on
windsurf2
trae2
opencode2
codex2
claude-code2
antigravity2