Data Wrangling: Universal Access Patterns

Reference for downloading and parsing scientific data from any source. Write and run Python code via Bash for every step.

When to Use

  • ToolUniverse tool returned metadata/search results but you need raw or bulk data
  • Data is in a format tools don't parse (VCF, h5ad, BAM, SDF, GCT)
  • You need a multi-step API workflow (search -> filter -> download -> parse)
  • The data source has no ToolUniverse tool at all
  • You need thousands of records, not the 10-100 a tool returns

Decision: Tool vs Code

| Situation | Use |
|---|---|
| Single record lookup, simple search, <100 results | ToolUniverse tool (execute_tool) |
| Bulk download, custom filtering, format conversion | Write Python code |
| Tool exists but returns truncated results | Write code using the same API the tool wraps |
| No tool exists for this source | Write code directly |

Section A: Format Cookbook

Tabular

import pandas as pd, io

df = pd.read_csv("data.csv")                                # CSV
df = pd.read_csv("data.tsv", sep="\t")                      # TSV
df = pd.read_sas(io.BytesIO(content), format="xport")       # SAS Transport (XPT) — NHANES, CDC
df = pd.read_sas("data.sas7bdat", format="sas7bdat")        # SAS native
df = pd.read_stata("data.dta")                               # Stata — ICPSR, HRS
df = pd.read_parquet("data.parquet")                         # Parquet — MIMIC-IV
df = pd.read_excel("data.xlsx")                              # Excel
df = pd.read_spss("data.sav")                                # SPSS
df = pd.read_fwf("data.dat")                                 # Fixed-width — legacy surveys

Genomics

from Bio import SeqIO
records = list(SeqIO.parse("seqs.fasta", "fasta"))           # FASTA
records = list(SeqIO.parse("reads.fastq", "fastq"))          # FASTQ

# VCF (no cyvcf2 needed)
vcf_lines = [l for l in open("vars.vcf") if not l.startswith("##")]
df = pd.read_csv(io.StringIO("".join(vcf_lines)), sep="\t")

df = pd.read_csv("genes.gff3", sep="\t", comment="#",        # GFF/GTF
     names=["seqid","source","type","start","end","score","strand","phase","attrs"])
df = pd.read_csv("regions.bed", sep="\t", header=None,       # BED
     names=["chrom","start","end","name","score","strand"])

import pysam                                                  # BAM (requires pysam)
bam = pysam.AlignmentFile("aligned.bam", "rb")
for read in bam.fetch("chr1", 1000, 2000): print(read.query_name)

Structural

from Bio.PDB import PDBParser, MMCIFParser
parser = PDBParser(QUIET=True)
structure = parser.get_structure("prot", "structure.pdb")     # PDB

parser = MMCIFParser(QUIET=True)
structure = parser.get_structure("prot", "structure.cif")     # mmCIF

from rdkit import Chem                                        # SDF/MOL (requires rdkit)
supplier = Chem.SDMolSupplier("compounds.sdf")
mols = [m for m in supplier if m is not None]

Omics Matrices

import anndata
adata = anndata.read_h5ad("expression.h5ad")                 # AnnData (scRNA-seq, spatial)

import scipy.io
mat = scipy.io.mmread("matrix.mtx")                          # 10X Genomics MTX
barcodes = pd.read_csv("barcodes.tsv", header=None)[0].tolist()
features = pd.read_csv("features.tsv", sep="\t", header=None)[1].tolist()

df = pd.read_csv("expression.gct", sep="\t", skiprows=2)     # GCT (gene expression)

import loompy                                                 # Loom (legacy single-cell)
ds = loompy.connect("data.loom")

Mass Spectrometry & Flow Cytometry

from pyteomics import mzml                                    # mzML (proteomics, requires pyteomics)
spectra = list(mzml.read("spectra.mzML"))

import fcsparser                                              # FCS (flow cytometry, requires fcsparser)
meta, data = fcsparser.parse("sample.fcs", reformat_meta=True)

Neuroimaging

import nibabel as nib                                         # NIfTI (requires nibabel)
img = nib.load("brain.nii.gz")
data = img.get_fdata()  # 3D/4D numpy array

# DICOM (requires pydicom)
import pydicom
dcm = pydicom.dcmread("scan.dcm")
pixel_data = dcm.pixel_array

Phylogenetics & Systems Biology

from Bio import Phylo                                         # Newick/Nexus (BioPython)
tree = Phylo.read("tree.nwk", "newick")
tree = Phylo.read("tree.nex", "nexus")

import libsbml                                                # SBML (systems biology, requires python-libsbml)
reader = libsbml.SBMLReader()
doc = reader.readSBML("model.xml")
model = doc.getModel()

Serialized

import json, xml.etree.ElementTree as ET, h5py

data = json.load(open("data.json"))                           # JSON
df = pd.read_json("records.json")                             # JSON -> DataFrame
tree = ET.parse("data.xml"); root = tree.getroot()            # XML
f = h5py.File("data.h5", "r"); dataset = f["group/data"][:]   # HDF5

Compressed

df = pd.read_csv("data.csv.gz")                              # gzip (pandas auto-detects)
df = pd.read_csv("data.tsv.gz", sep="\t")                    # gzip TSV

import zipfile
with zipfile.ZipFile(io.BytesIO(content)) as z:               # ZIP
    df = pd.read_csv(z.open(z.namelist()[0]))

import tarfile
with tarfile.open("archive.tar.gz") as t:                     # tar.gz
    f = t.extractfile(t.getnames()[0])
    df = pd.read_csv(f)

Section B: API Patterns by Domain

Each category lists the ToolUniverse tools that exist and shows how to go beyond them with direct API calls.

1. NCBI E-utilities (Gene, Nucleotide, Protein, SRA, GEO)

Tools: NCBI_search_gene, NCBI_search_nucleotide, SRA_search_runs, GEO_search_datasets

import requests
base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Search -> get IDs -> fetch records in batches
ids = requests.get(f"{base}/esearch.fcgi?db=gene&term=BRCA1+AND+human&retmax=5000&retmode=json").json()
id_list = ids["esearchresult"]["idlist"]
# Fetch in batches of 500, collecting each XML payload
records = []
for i in range(0, len(id_list), 500):
    batch = ",".join(id_list[i:i+500])
    records.append(requests.get(f"{base}/efetch.fcgi?db=gene&id={batch}&retmode=xml").text)
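
If you only need summary fields rather than full records, esummary returns JSON, which is far easier to parse than efetch XML (field names below follow the gene database's JSON document summaries):

# esummary: JSON summaries instead of XML records
summaries = requests.get(f"{base}/esummary.fcgi?db=gene&id={','.join(id_list[:500])}&retmode=json").json()
for uid in summaries["result"]["uids"]:
    print(summaries["result"][uid]["name"], summaries["result"][uid]["description"])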

2. EBI APIs (UniProt, PDBe, ChEMBL, Ensembl, InterPro)

Tools: UniProt_search, PDBe_*, ChEMBL_*, Ensembl_*, InterPro_*

# UniProt bulk TSV download with cursor pagination
url = "https://rest.uniprot.org/uniprotkb/search?query=organism_id:9606+AND+keyword:kinase&format=tsv&size=500"
all_rows = []
while url:
    resp = requests.get(url)
    all_rows.append(resp.text)
    url = resp.headers.get("Link", "").split(";")[0].strip("<>") if "Link" in resp.headers else None
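
Each page comes back as a complete TSV with its own header row, so drop the header on every page after the first when assembling the result (a minimal sketch, assuming the default tab-separated output):

# Stitch pages together: keep the first header, drop the rest
pages = [all_rows[0]] + [p.split("\n", 1)[1] for p in all_rows[1:] if "\n" in p]
df = pd.read_csv(io.StringIO("".join(pages)), sep="\t")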

3. NCI GDC (TCGA/TARGET Cancer Data)

Tools: GDC_search_cases, GDC_list_files, GDC_get_clinical_data

# Bulk clinical data with filters
filters = {"op":"and","content":[
    {"op":"=","content":{"field":"project.project_id","value":"TCGA-BRCA"}},
    {"op":"=","content":{"field":"demographic.vital_status","value":"Dead"}}
]}
cases = requests.post("https://api.gdc.cancer.gov/cases", json={
    "filters": filters, "fields": "demographic.vital_status,diagnoses.days_to_death",
    "size": 1000, "from": 0
}).json()["data"]["hits"]
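
The response also reports data.pagination.total, so you can page with from/size until every hit is collected:

# Page through all matching cases with from/size
all_hits, frm = [], 0
while True:
    page = requests.post("https://api.gdc.cancer.gov/cases", json={
        "filters": filters, "fields": "demographic.vital_status,diagnoses.days_to_death",
        "size": 1000, "from": frm
    }).json()["data"]
    all_hits.extend(page["hits"])
    frm += 1000
    if frm >= page["pagination"]["total"]: break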

4. CDC Health Surveys (NHANES, BRFSS, WONDER)

Tools: NHANES_download_and_parse, cdc_data_search_datasets

# Direct NHANES XPT download (any cycle, any component)
cycle, component = "2017-2018", "DEMO_J"
url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{cycle[:4]}/DataFiles/{component}.XPT"
df = pd.read_sas(io.BytesIO(requests.get(url).content), format="xport")

5. GWAS & Genetics (GWAS Catalog, gnomAD, ClinVar)

Tools: gwas_search_associations, gnomAD_*, ClinVar_*

# GWAS Catalog full download (37MB TSV, all associations)
url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
df = pd.read_csv(url, sep="\t")
# Filter locally
hits = df[df["DISEASE/TRAIT"].str.contains("diabetes", case=False, na=False)]

6. Chemical (PubChem, ChEMBL, KEGG)

Tools: PubChem_*, ChEMBL_*, KEGG_*

# PubChem batch property retrieval (up to 100 CIDs at once)
cids = "2244,5988,3672"  # aspirin, sucrose, ibuprofen
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cids}/property/MolecularWeight,XLogP,TPSA/JSON"
props = requests.get(url).json()["PropertyTable"]["Properties"]
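
The property records are flat dicts, so they load straight into a DataFrame:

df = pd.DataFrame(props)  # one row per compound: CID, MolecularWeight, XLogP, TPSA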

7. Expression (GEO, ArrayExpress, GTEx)

Tools: GEO_search_datasets, ArrayExpress_search

# GEO series matrix direct download
geo_id = "GSE12345"
url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{geo_id[:5]}nnn/{geo_id}/matrix/{geo_id}_series_matrix.txt.gz"
df = pd.read_csv(url, sep="\t", comment="!", index_col=0)

# GTEx bulk expression (median TPM per tissue)
url = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
df = pd.read_csv(url, sep="\t", skiprows=2)

8. Clinical (ClinicalTrials.gov, FDA/OpenFDA, FAERS)

Tools: search_clinical_trials, OpenFDA_*

# ClinicalTrials.gov v2 API with pagination
all_studies = []
token = None
while True:
    params = {"query.cond": "lung cancer", "query.intr": "immunotherapy", "pageSize": 100}
    if token: params["pageToken"] = token
    resp = requests.get("https://clinicaltrials.gov/api/v2/studies", params=params).json()
    all_studies.extend(resp.get("studies", []))
    token = resp.get("nextPageToken")
    if not token: break

9. Literature (PubMed, PMC, EuropePMC)

Tools: PubMed_search_articles, EuropePMC_search_articles

# EuropePMC full-text search with cursor
cursor = "*"
all_results = []
while cursor:
    resp = requests.get("https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params={"query": "BRCA1 AND resistance", "format": "json", "pageSize": 100, "cursorMark": cursor}).json()
    all_results.extend(resp.get("resultList", {}).get("result", []))
    cursor = resp.get("nextCursorMark") if len(all_results) < resp.get("hitCount", 0) else None

10. Data Repositories (Zenodo, Figshare, Dryad, DataCite)

Tools: DataCite_search_dois, Zenodo_search_records, Dryad_search_datasets

# Zenodo: search + download files
record = requests.get("https://zenodo.org/api/records", params={"q": "proteomics cancer", "size": 5}).json()["hits"]["hits"][0]
for f in record["files"]:
    open(f["key"], "wb").write(requests.get(f["links"]["self"]).content)  # save each file under its original name

11-24. Specialized Domains

For these 14 additional domains, read references/specialized-domains.md when you need the specific API pattern (a worked example follows the table):

| # | Domain | Key APIs/Tools | When to Read |
|---|---|---|---|
| 11 | Proteomics | PRIDE, MassIVE, ProteomeXchange | Mass spec data download |
| 12 | Metabolomics | MetaboLights, Metabolomics Workbench, HMDB | Metabolite/spectra data |
| 13 | Microbiome | MGnify, GMREPO | Metagenome profiles |
| 14 | Ecology | GBIF, iNaturalist, OBIS | Species occurrence data |
| 15 | Model Organisms | FlyBase, WormBase, ZFIN, RGD | Gene data for non-human species |
| 16 | Pathways & Networks | Reactome, STRING, BioGRID | Network/pathway export |
| 17 | Ontologies | OLS, GO, HPO | Term hierarchy traversal |
| 18 | Immunology | IEDB, VDJdb, ImmPort | Epitope/receptor data |
| 19 | Drug & Pharma | PharmGKB, DGIdb, SIDER | Drug-gene interactions |
| 20 | Imaging & Atlases | TCIA, HPA, Allen Brain Atlas | Imaging collections |
| 21 | Protein Structure | RCSB PDB, AlphaFold | PDB/CIF file download |
| 22 | Clinical Genomics | ClinVar, ClinGen, CIViC | Variant interpretation bulk |
| 23 | Single-Cell | cellxgene, ARCHS4 | scRNA-seq data portals |
| 24 | Toxicology | CTD, EPA CompTox | Chemical-gene-disease |
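
As one worked example from that file, GBIF (#14) follows the plain offset-pagination pattern (a sketch against GBIF's public v1 occurrence endpoint; the search API caps deep paging, so use GBIF's download service for full exports):

# GBIF occurrence search: open access, offset pagination, 300 records/page max
recs, offset = [], 0
while True:
    page = requests.get("https://api.gbif.org/v1/occurrence/search",
        params={"scientificName": "Puma concolor", "limit": 300, "offset": offset}).json()
    recs.extend(page["results"])
    if page["endOfRecords"]: break
    offset += 300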

Section C: Restricted/Uncovered Data Sources

These sources require registration or have no ToolUniverse tool. The table summarizes each source's access requirements, typical wait time, format, and contents; a sketch of credentialed API access follows it.

Note: ToolUniverse has 2300+ tools — use find_tools("your topic") to discover tools not listed above. Section B covers the most common API patterns; many more databases use the same patterns (e.g., all EBI databases follow the EBI REST pattern in #2).

| Source | Access | Wait Time | Format | Contents |
|---|---|---|---|---|
| UK Biobank | Restricted (institutional) | 2-6 months | CSV/Bulk | 500K participants, genetics + imaging + health records |
| dbGaP | Controlled (PI application) | 1-3 months | SRA/VCF/phenotype | GWAS genotypes + phenotypes from thousands of studies |
| MIMIC-IV | Credentialed (PhysioNet) | 1-2 weeks | CSV/Parquet | ICU clinical data, 300K+ admissions |
| ICPSR | Registration | 1-3 days | Stata/CSV | Social/health science archives (10K+ studies) |
| HRS | Registration | 1-3 days | Stata | Health & Retirement Study, 20K+ older Americans, biennial |
| ELSA | Registration | 1-3 days | Stata/SPSS | English Longitudinal Study of Ageing |
| SHARE | Registration | 1-2 weeks | Stata | Survey of Health, Ageing, Retirement in Europe (28 countries) |
| Materials Project | Free API key | Instant | JSON | 150K+ computed materials properties |
| Human Cell Atlas | Open | Instant | h5ad/loom | Single-cell atlas across human tissues |
| ADNI | Application | 1-2 months | DICOM/CSV | Alzheimer's neuroimaging + biomarkers + cognition |
| OpenNeuro | Open | Instant | NIfTI/BIDS | 800+ neuroimaging datasets |
| CIBERSORTx | Free registration | Instant | GCT/TSV | Cell type deconvolution from bulk expression |
| FlowRepository | Open | Instant | FCS | Flow cytometry experiments |
| SynBioHub | Open | Instant | SBOL/GenBank | Synthetic biology parts and designs |
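
Once credentialed, access is usually one header or query param away. A minimal sketch for Materials Project, assuming its current REST endpoint and X-API-KEY header scheme (verify against their docs; the official mp-api client is the supported route):

import os, requests
headers = {"X-API-KEY": os.environ["MP_API_KEY"]}
resp = requests.get("https://api.materialsproject.org/materials/summary/",
    params={"formula": "TiO2", "_fields": "material_id,band_gap"}, headers=headers)
docs = resp.json()["data"]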

For restricted sources: search literature (PubMed) for published analyses using that dataset. Papers cite their data source and often deposit derived data in public repositories (GEO, SRA, Zenodo).
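
A sketch of that literature route, using the E-utilities pattern from Section B #1:

# Find papers that analyzed a restricted dataset and may link public derived data
base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
ids = requests.get(f"{base}/esearch.fcgi", params={
    "db": "pubmed", "term": '"UK Biobank"[Title/Abstract] AND "summary statistics"',
    "retmax": 200, "retmode": "json"}).json()["esearchresult"]["idlist"]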


Section D: Universal Patterns

Pagination

# Pattern 1: offset + limit (most REST APIs)
all_records = []
offset = 0
while True:
    resp = requests.get(f"{api_url}?offset={offset}&limit=500", timeout=30).json()
    batch = resp.get("data", resp.get("results", resp.get("hits", [])))
    if not batch: break
    all_records.extend(batch)
    offset += len(batch)

# Pattern 2: cursor/token (EuropePMC, ClinicalTrials.gov, UniProt)
token = None
while True:
    params = {"pageSize": 100}
    if token: params["pageToken"] = token
    resp = requests.get(api_url, params=params).json()
    all_records.extend(resp["results"])
    token = resp.get("nextPageToken")
    if not token: break

Rate Limiting & Retries

import time
def fetch_with_retry(url, max_retries=3, **kwargs):
    for attempt in range(max_retries):
        resp = requests.get(url, timeout=30, **kwargs)
        if resp.status_code == 200: return resp
        if resp.status_code == 429:  # rate limited
            wait = int(resp.headers.get("Retry-After", 2 ** attempt))
            time.sleep(wait)
        else:
            time.sleep(2 ** attempt)
    raise RuntimeError(f"Failed after {max_retries} retries: {url}")

Authentication

import os
# API key in header (most common)
headers = {"Authorization": f"Bearer {os.environ.get('API_KEY', '')}"}
# API key as query param
params = {"api_key": os.environ.get("API_KEY", "")}
# No auth needed for most scientific APIs (NCBI, EBI, PubChem, GDC, CDC)

Bulk Download with Streaming

def download_large_file(url, output_path):
    with requests.get(url, stream=True, timeout=300) as r:
        r.raise_for_status()
        with open(output_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
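
For example, the GTEx matrix from Section B #7 streams to disk and then parses as usual:

url = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
download_large_file(url, "gtex_median_tpm.gct.gz")
df = pd.read_csv("gtex_median_tpm.gct.gz", sep="\t", skiprows=2)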

Error Handling

resp = requests.get(url, timeout=30)
if resp.status_code != 200:
    raise ValueError(f"HTTP {resp.status_code}: {resp.text[:200]}")
# Guard against HTML error pages (CDC, NCBI return 200 with HTML for missing files)
if resp.content[:5] in (b"<!DOC", b"<html"):
    raise ValueError(f"Server returned HTML error page for {url}")
data = resp.json()  # raises JSONDecodeError if not valid JSON