bio-expression-matrix-gene-id-mapping
SKILL.md
Gene ID Mapping
Python: mygene
import mygene
import pandas as pd
# One MyGene.info client is shared by all of the example queries below.
mg = mygene.MyGeneInfo()

# --- Ensembl gene ID -> gene symbol ---
ensembl_ids = ['ENSG00000141510', 'ENSG00000012048', 'ENSG00000141736']
results = mg.querymany(
    ensembl_ids,
    scopes='ensembl.gene',
    fields='symbol',
    species='human',
)
# Key each hit by the ID that was queried; hits without a 'symbol' map to None.
mapping = {hit['query']: hit.get('symbol') for hit in results}
# e.g. {'ENSG00000141510': 'TP53', 'ENSG00000012048': 'BRCA1', 'ENSG00000141736': 'ERBB2'}

# --- Gene symbol -> Entrez gene ID ---
symbols = ['TP53', 'BRCA1', 'ERBB2']
results = mg.querymany(
    symbols,
    scopes='symbol',
    fields='entrezgene',
    species='human',
)
# `mapping` is rebound here; hits without an 'entrezgene' map to None.
mapping = {hit['query']: hit.get('entrezgene') for hit in results}

# --- Ensembl gene ID -> several annotation fields in one request ---
results = mg.querymany(
    ensembl_ids,
    scopes='ensembl.gene',
    fields=['symbol', 'entrezgene', 'uniprot'],
    species='human',
)
Python: pyensembl
from pyensembl import EnsemblRelease
# Load the Ensembl release 110 annotation for human.
# (Per the original note, the data is downloaded automatically on first use.)
ensembl = EnsemblRelease(110, species='human') # species may also be 'mouse'
# Gene ID to symbol: look up one gene by its Ensembl gene ID and read its name.
gene = ensembl.gene_by_id('ENSG00000141510')
print(gene.gene_name) # TP53
# Symbol to gene ID: genes_by_name() is indexed with [0], so only the first
# returned match is used. `gene` is rebound to that match here.
gene = ensembl.genes_by_name('TP53')[0]
print(gene.gene_id) # ENSG00000141510
# Batch conversion
def ensembl_to_symbol(ensembl_ids, release=110):
    """Map each Ensembl gene ID to its gene name using a local Ensembl release.

    Args:
        ensembl_ids: Iterable of Ensembl gene ID strings. Anything after the
            first '.' (a version suffix such as '.12') is ignored for lookup.
        release: Ensembl release number passed to ``EnsemblRelease``.

    Returns:
        Dict keyed by the IDs exactly as given (version suffix included).
        The value is the gene name, or None when the lookup raises ValueError.
    """
    annotation = EnsemblRelease(release, species='human')

    def _symbol_for(versioned_id):
        # Look the gene up by the part of the ID before the first dot.
        unversioned_id = versioned_id.partition('.')[0]
        try:
            return annotation.gene_by_id(unversioned_id).gene_name
        except ValueError:
            return None

    return {versioned_id: _symbol_for(versioned_id) for versioned_id in ensembl_ids}
Python: gseapy
import gseapy as gp
# Ensembl gene ID -> gene symbol through Ensembl BioMart.
# (This lookup goes through gseapy's Biomart client, not Enrichr.)
gene_list = ['ENSG00000141510', 'ENSG00000012048']
bm = gp.Biomart()
# Biomart.query takes the dataset name, the attributes (columns) to return and
# a dict of filters keyed by filter name. Per the gseapy documentation the
# result is a table with one column per requested attribute.
converted = bm.query(
    dataset='hsapiens_gene_ensembl',
    attributes=['ensembl_gene_id', 'external_gene_name'],
    filters={'ensembl_gene_id': gene_list},
)
R: biomaRt
library(biomaRt)
# Connect to the Ensembl 'genes' BioMart and select the human gene dataset.
ensembl <- useEnsembl(biomart='genes', dataset='hsapiens_gene_ensembl')
# Ensembl to Symbol: for the listed Ensembl gene IDs, request the Ensembl gene
# ID, HGNC symbol and Entrez gene ID columns.
ensembl_ids <- c('ENSG00000141510', 'ENSG00000012048', 'ENSG00000141736')
results <- getBM(
attributes=c('ensembl_gene_id', 'hgnc_symbol', 'entrezgene_id'),
filters='ensembl_gene_id',
values=ensembl_ids,
mart=ensembl
)
# NOTE(review): nothing here relies on the row order or row count of `results`;
# confirm against the biomaRt docs before joining it to other data by position.
# Symbol to Ensembl: filter on HGNC symbol instead and return the matching
# Ensembl gene IDs. This overwrites `results` from the query above.
symbols <- c('TP53', 'BRCA1', 'ERBB2')
results <- getBM(
attributes=c('hgnc_symbol', 'ensembl_gene_id'),
filters='hgnc_symbol',
values=symbols,
mart=ensembl
)
# List every attribute that can be requested from this dataset.
listAttributes(ensembl)
R: org.db Packages
library(org.Hs.eg.db) # Human
library(AnnotationDbi)
# Ensembl to Symbol: look up the SYMBOL column using ENSEMBL keys.
ensembl_ids <- c('ENSG00000141510', 'ENSG00000012048')
symbols <- mapIds(org.Hs.eg.db, keys=ensembl_ids, keytype='ENSEMBL', column='SYMBOL')
# Symbol to Entrez: look up the ENTREZID column using SYMBOL keys.
# Note that `symbols` is reassigned here, replacing the mapIds() result above.
symbols <- c('TP53', 'BRCA1')
entrez <- mapIds(org.Hs.eg.db, keys=symbols, keytype='SYMBOL', column='ENTREZID')
# List the key types this annotation package accepts for `keytype`.
keytypes(org.Hs.eg.db)
# e.g. ENSEMBL, ENSEMBLPROT, ENSEMBLTRANS, ENTREZID, SYMBOL, UNIPROT, etc.
Apply Mapping to Count Matrix
import pandas as pd
import mygene
def map_count_matrix_ids(counts, from_type='ensembl.gene', to_type='symbol', species='human'):
    """Return a copy of *counts* with its gene-ID index mapped to *to_type*.

    Args:
        counts: pandas DataFrame whose index holds the gene IDs to convert.
        from_type: MyGene.info scope describing the IDs in the index.
        to_type: MyGene.info field to map to. It must appear as a top-level
            key of a hit to be used.
        species: Species passed through to ``querymany``.

    Returns:
        A new DataFrame indexed by the mapped IDs. Rows whose ID could not be
        mapped keep their original index label. Rows that end up sharing a
        label are summed, so the row order follows the grouped (sorted) index.
        *counts* itself is not modified.
    """
    original_ids = list(counts.index)
    if not original_ids:
        # Nothing to map: skip the remote query entirely.
        return counts.copy()

    # Gene symbols can legitimately contain dots (e.g. clone-based names such
    # as 'RP11-206L10.9'), so '.<version>' suffixes are only stripped when the
    # source IDs are not symbols.
    strip_versions = from_type not in ('symbol', 'alias')
    clean_ids = [
        str(gene_id).split('.')[0] if strip_versions else str(gene_id)
        for gene_id in original_ids
    ]

    mg = mygene.MyGeneInfo()
    # Query each distinct ID once; dict.fromkeys de-duplicates in input order.
    unique_ids = list(dict.fromkeys(clean_ids))
    results = mg.querymany(unique_ids, scopes=from_type, fields=to_type, species=species)

    # A query can come back with several hits. Keep the first one that carries
    # the requested field instead of letting later hits overwrite it
    # (MyGene.info is expected to list the best match first; see its docs).
    mapping = {}
    for hit in results:
        if to_type in hit:
            mapping.setdefault(hit['query'], hit[to_type])

    # Unmapped IDs fall back to their original (still versioned) label.
    new_index = [
        mapping.get(clean_id, original_id)
        for original_id, clean_id in zip(original_ids, clean_ids)
    ]
    counts_mapped = counts.copy()
    counts_mapped.index = new_index

    # Several source IDs can map to the same target; sum their counts.
    return counts_mapped.groupby(counts_mapped.index).sum()
# Usage: `counts` must already be defined by the caller. It is presumably a
# pandas DataFrame indexed by (optionally versioned) Ensembl gene IDs; verify
# against the code that loads it.
counts_symbols = map_count_matrix_ids(counts, 'ensembl.gene', 'symbol')
R Equivalent
library(biomaRt)
map_count_matrix_ids <- function(counts, from_type='ensembl_gene_id', to_type='hgnc_symbol') {
    # Map the row names of a count table from one gene ID type to another
    # using Ensembl BioMart, summing rows that end up with the same ID.
    #
    # Args:
    #   counts:    numeric data.frame or matrix with gene IDs as row names.
    #   from_type: BioMart filter/attribute name describing the row names.
    #   to_type:   BioMart attribute name to map to.
    #
    # Returns:
    #   A data.frame with one row per mapped ID (row names) and the same
    #   columns as `counts`. IDs with no mapping, or with an empty mapping,
    #   keep their original row name instead of being dropped.
    ensembl <- useEnsembl(biomart='genes', dataset='hsapiens_gene_ensembl')

    original_ids <- rownames(counts)
    # Gene symbols may contain dots, so only strip '.<version>' suffixes when
    # the source IDs are not symbols.
    if (from_type %in% c('hgnc_symbol', 'external_gene_name')) {
        clean_ids <- original_ids
    } else {
        clean_ids <- gsub('\\..*', '', original_ids)
    }

    # Get mapping (each distinct ID is queried once)
    mapping <- getBM(
        attributes=c(from_type, to_type),
        filters=from_type,
        values=unique(clean_ids),
        mart=ensembl
    )

    # Drop rows without a usable target, then keep the first target per
    # source ID so that every input row maps to exactly one output row.
    source_ids <- as.character(mapping[[from_type]])
    target_ids <- as.character(mapping[[to_type]])
    has_target <- !is.na(target_ids) & nzchar(target_ids)
    source_ids <- source_ids[has_target]
    target_ids <- target_ids[has_target]
    first_hit <- !duplicated(source_ids)
    lookup <- setNames(target_ids[first_hit], source_ids[first_hit])

    # Translate every row; unmapped rows fall back to their original ID.
    new_ids <- unname(lookup[clean_ids])
    unmapped <- is.na(new_ids)
    new_ids[unmapped] <- original_ids[unmapped]

    # data.frame row names must be unique and non-missing, so the new IDs are
    # never assigned as row names before aggregation. rowsum() sums the rows
    # that share an ID and names the result rows by that ID.
    summed <- rowsum(as.matrix(counts), group=new_ids)
    counts_mapped <- as.data.frame(summed)
    return(counts_mapped)
}
Handle Unmapped IDs
def robust_id_mapping(gene_ids, from_type, to_type, species='human'):
    """Map gene IDs with MyGene.info, keeping the original ID when unmapped.

    Args:
        gene_ids: Iterable of gene IDs (optionally carrying a '.<version>'
            suffix).
        from_type: MyGene.info scope describing *gene_ids*.
        to_type: MyGene.info field to map to. It must appear as a top-level
            key of a hit to be used.
        species: Species passed through to ``querymany``.

    Returns:
        ``(mapping, unmapped)``. ``mapping`` has one entry per input ID, whose
        value is the mapped ID or the input ID itself when no hit carried
        *to_type*. ``unmapped`` lists the input IDs that fell back, in input
        order.
    """
    gene_ids = list(gene_ids)

    # Gene symbols can legitimately contain dots (e.g. clone-based names such
    # as 'RP11-206L10.9'), so '.<version>' suffixes are only stripped when the
    # source IDs are not symbols.
    strip_versions = from_type not in ('symbol', 'alias')
    clean_ids = [
        str(gene_id).split('.')[0] if strip_versions else str(gene_id)
        for gene_id in gene_ids
    ]

    if gene_ids:
        import mygene

        mg = mygene.MyGeneInfo()
        # Query each distinct ID once; dict.fromkeys de-duplicates in order.
        unique_ids = list(dict.fromkeys(clean_ids))
        results = mg.querymany(unique_ids, scopes=from_type, fields=to_type, species=species)
    else:
        # Nothing to map: skip the remote query entirely.
        results = []

    # Keep the first hit per query that carries the requested field, so a
    # later hit without it cannot demote an ID that was already mapped.
    best_hit = {}
    for hit in results:
        if to_type in hit:
            best_hit.setdefault(hit['query'], hit[to_type])

    # Walk the inputs (not the hits) so that every input ID is resolved, even
    # when several inputs share one cleaned ID, without list.index() scans.
    mapping = {}
    unmapped = []
    for original_id, clean_id in zip(gene_ids, clean_ids):
        if clean_id in best_hit:
            mapping[original_id] = best_hit[clean_id]
        else:
            mapping[original_id] = original_id  # Keep original if unmapped
            unmapped.append(original_id)

    print(f'Mapped: {len(gene_ids) - len(unmapped)}/{len(gene_ids)}')
    print(f'Unmapped: {len(unmapped)}')
    return mapping, unmapped
Common ID Types
| Type | Example | Use Case |
|---|---|---|
| Ensembl Gene | ENSG00000141510 | RNA-seq, GTF files |
| Ensembl Transcript | ENST00000269305 | Transcript-level analysis |
| Entrez Gene | 7157 | NCBI databases, KEGG |
| HGNC Symbol | TP53 | Human readable |
| UniProt | P04637 | Protein databases |
| RefSeq | NM_000546 | NCBI RefSeq |
Related Skills
- expression-matrix/counts-ingest - Load count data
- expression-matrix/metadata-joins - Add annotations
- pathway-analysis/go-enrichment - Requires Entrez IDs
- pathway-analysis/kegg-pathways - Requires Entrez IDs
Weekly Installs
3
Repository
gptomics/bioskills
GitHub Stars
349
First Seen
Jan 24, 2026
Security Audits
Installed on
trae: 2
windsurf: 1
opencode: 1
codex: 1
claude-code: 1
antigravity: 1