Data Wrangling: Universal Access Patterns

Reference for downloading and parsing scientific data from any source. Write and run Python code via Bash for every step.

When to Use

  • ToolUniverse tool returned metadata/search results but you need raw or bulk data
  • Data is in a format tools don't parse (VCF, h5ad, BAM, SDF, GCT)
  • You need a multi-step API workflow (search -> filter -> download -> parse)
  • The data source has no ToolUniverse tool at all
  • You need thousands of records, not the 10-100 a tool returns

Decision: Tool vs Code

| Situation | Use |
|---|---|
| Single record lookup, simple search, <100 results | ToolUniverse tool (execute_tool) |
| Bulk download, custom filtering, format conversion | Write Python code |
| Tool exists but returns truncated results | Write code using the same API the tool wraps |
| No tool exists for this source | Write code directly |

Section A: Format Cookbook

Tabular

import pandas as pd, io

df = pd.read_csv("data.csv")                                # CSV
df = pd.read_csv("data.tsv", sep="\t")                      # TSV
df = pd.read_sas(io.BytesIO(content), format="xport")       # SAS Transport (XPT) — NHANES, CDC
df = pd.read_sas("data.sas7bdat", format="sas7bdat")        # SAS native
df = pd.read_stata("data.dta")                               # Stata — ICPSR, HRS
df = pd.read_parquet("data.parquet")                         # Parquet — MIMIC-IV
df = pd.read_excel("data.xlsx")                              # Excel
df = pd.read_spss("data.sav")                                # SPSS
df = pd.read_fwf("data.dat")                                 # Fixed-width — legacy surveys

Genomics

from Bio import SeqIO
records = list(SeqIO.parse("seqs.fasta", "fasta"))           # FASTA
records = list(SeqIO.parse("reads.fastq", "fastq"))          # FASTQ

# VCF (no cyvcf2 needed)
vcf_lines = [l for l in open("vars.vcf") if not l.startswith("##")]
df = pd.read_csv(io.StringIO("".join(vcf_lines)), sep="\t")

df = pd.read_csv("genes.gff3", sep="\t", comment="#",        # GFF/GTF
     names=["seqid","source","type","start","end","score","strand","phase","attrs"])
df = pd.read_csv("regions.bed", sep="\t", header=None,       # BED
     names=["chrom","start","end","name","score","strand"])

import pysam                                                  # BAM (requires pysam)
bam = pysam.AlignmentFile("aligned.bam", "rb")
for read in bam.fetch("chr1", 1000, 2000): print(read.query_name)

Structural

from Bio.PDB import PDBParser, MMCIFParser
parser = PDBParser(QUIET=True)
structure = parser.get_structure("prot", "structure.pdb")     # PDB

parser = MMCIFParser(QUIET=True)
structure = parser.get_structure("prot", "structure.cif")     # mmCIF

from rdkit import Chem                                        # SDF/MOL (requires rdkit)
supplier = Chem.SDMolSupplier("compounds.sdf")
mols = [m for m in supplier if m is not None]

Omics Matrices

import anndata
adata = anndata.read_h5ad("expression.h5ad")                 # AnnData (scRNA-seq, spatial)

import scipy.io
mat = scipy.io.mmread("matrix.mtx")                          # 10X Genomics MTX
barcodes = pd.read_csv("barcodes.tsv", header=None)[0].tolist()
features = pd.read_csv("features.tsv", sep="\t", header=None)[1].tolist()

df = pd.read_csv("expression.gct", sep="\t", skiprows=2)     # GCT (gene expression)

import loompy                                                 # Loom (legacy single-cell)
ds = loompy.connect("data.loom")

Mass Spectrometry & Flow Cytometry

from pyteomics import mzml                                    # mzML (proteomics, requires pyteomics)
spectra = list(mzml.read("spectra.mzML"))

import fcsparser                                              # FCS (flow cytometry, requires fcsparser)
meta, data = fcsparser.parse("sample.fcs", reformat_meta=True)

Neuroimaging

import nibabel as nib                                         # NIfTI (requires nibabel)
img = nib.load("brain.nii.gz")
data = img.get_fdata()  # 3D/4D numpy array

# DICOM (requires pydicom)
import pydicom
dcm = pydicom.dcmread("scan.dcm")
pixel_data = dcm.pixel_array

Phylogenetics & Systems Biology

from Bio import Phylo                                         # Newick/Nexus (BioPython)
tree = Phylo.read("tree.nwk", "newick")
tree = Phylo.read("tree.nex", "nexus")

import libsbml                                                # SBML (systems biology, requires python-libsbml)
reader = libsbml.SBMLReader()
doc = reader.readSBML("model.xml")
model = doc.getModel()

Serialized

import json, xml.etree.ElementTree as ET, h5py

data = json.load(open("data.json"))                           # JSON
df = pd.read_json("records.json")                             # JSON -> DataFrame
tree = ET.parse("data.xml"); root = tree.getroot()            # XML
f = h5py.File("data.h5", "r"); dataset = f["group/data"][:]   # HDF5

Compressed

df = pd.read_csv("data.csv.gz")                              # gzip (pandas auto-detects)
df = pd.read_csv("data.tsv.gz", sep="\t")                    # gzip TSV

import zipfile
with zipfile.ZipFile(io.BytesIO(content)) as z:               # ZIP
    df = pd.read_csv(z.open(z.namelist()[0]))

import tarfile
with tarfile.open("archive.tar.gz") as t:                     # tar.gz
    f = t.extractfile(t.getnames()[0])
    df = pd.read_csv(f)

Section B: API Patterns by Domain

Each category lists the ToolUniverse tools that exist and shows how to go beyond them with direct API calls.

1. NCBI E-utilities (Gene, Nucleotide, Protein, SRA, GEO)

Tools: NCBI_search_gene, NCBI_search_nucleotide, SRA_search_runs, GEO_search_datasets

import requests
base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Search -> get IDs -> fetch records in batches
ids = requests.get(f"{base}/esearch.fcgi?db=gene&term=BRCA1+AND+human&retmax=5000&retmode=json").json()
id_list = ids["esearchresult"]["idlist"]
# Fetch in batches of 500, collecting each XML payload
records = []
for i in range(0, len(id_list), 500):
    batch = ",".join(id_list[i:i+500])
    records.append(requests.get(f"{base}/efetch.fcgi?db=gene&id={batch}&retmode=xml").text)
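
If you only need summary fields rather than full records, esummary returns JSON, which is far easier to parse than efetch XML (field names below follow the gene database's JSON document summaries):

# esummary: JSON summaries instead of XML records
summaries = requests.get(f"{base}/esummary.fcgi?db=gene&id={','.join(id_list[:500])}&retmode=json").json()
for uid in summaries["result"]["uids"]:
    print(summaries["result"][uid]["name"], summaries["result"][uid]["description"])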

2. EBI APIs (UniProt, PDBe, ChEMBL, Ensembl, InterPro)

Tools: UniProt_search, PDBe_*, ChEMBL_*, Ensembl_*, InterPro_*

# UniProt bulk TSV download with cursor pagination
url = "https://rest.uniprot.org/uniprotkb/search?query=organism_id:9606+AND+keyword:kinase&format=tsv&size=500"
all_rows = []
while url:
    resp = requests.get(url)
    all_rows.append(resp.text)
    url = resp.headers.get("Link", "").split(";")[0].strip("<>") if "Link" in resp.headers else None
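
Each page comes back as a complete TSV with its own header row, so drop the header on every page after the first when assembling the result (a minimal sketch, assuming the default tab-separated output):

# Stitch pages together: keep the first header, drop the rest
pages = [all_rows[0]] + [p.split("\n", 1)[1] for p in all_rows[1:] if "\n" in p]
df = pd.read_csv(io.StringIO("".join(pages)), sep="\t")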

3. NCI GDC (TCGA/TARGET Cancer Data)

Tools: GDC_search_cases, GDC_list_files, GDC_get_clinical_data

# Bulk clinical data with filters
filters = {"op":"and","content":[
    {"op":"=","content":{"field":"project.project_id","value":"TCGA-BRCA"}},
    {"op":"=","content":{"field":"demographic.vital_status","value":"Dead"}}
]}
cases = requests.post("https://api.gdc.cancer.gov/cases", json={
    "filters": filters, "fields": "demographic.vital_status,diagnoses.days_to_death",
    "size": 1000, "from": 0
}).json()["data"]["hits"]
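
The response also reports data.pagination.total, so you can page with from/size until every hit is collected:

# Page through all matching cases with from/size
all_hits, frm = [], 0
while True:
    page = requests.post("https://api.gdc.cancer.gov/cases", json={
        "filters": filters, "fields": "demographic.vital_status,diagnoses.days_to_death",
        "size": 1000, "from": frm
    }).json()["data"]
    all_hits.extend(page["hits"])
    frm += 1000
    if frm >= page["pagination"]["total"]: break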

4. CDC Health Surveys (NHANES, BRFSS, WONDER)

Tools: NHANES_download_and_parse, cdc_data_search_datasets

# Direct NHANES XPT download (any cycle, any component)
cycle, component = "2017-2018", "DEMO_J"
url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{cycle[:4]}/DataFiles/{component}.XPT"
df = pd.read_sas(io.BytesIO(requests.get(url).content), format="xport")

5. GWAS & Genetics (GWAS Catalog, gnomAD, ClinVar)

Tools: gwas_search_associations, gnomAD_*, ClinVar_*

# GWAS Catalog full download (37MB TSV, all associations)
url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
df = pd.read_csv(url, sep="\t")
# Filter locally
hits = df[df["DISEASE/TRAIT"].str.contains("diabetes", case=False, na=False)]

6. Chemical (PubChem, ChEMBL, KEGG)

Tools: PubChem_*, ChEMBL_*, KEGG_*

# PubChem batch property retrieval (up to 100 CIDs at once)
cids = "2244,5988,3672"  # aspirin, sucrose, ibuprofen
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cids}/property/MolecularWeight,XLogP,TPSA/JSON"
props = requests.get(url).json()["PropertyTable"]["Properties"]
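
The property records are flat dicts, so they load straight into a DataFrame:

df = pd.DataFrame(props)  # one row per compound: CID, MolecularWeight, XLogP, TPSA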

7. Expression (GEO, ArrayExpress, GTEx)

Tools: GEO_search_datasets, ArrayExpress_search

# GEO series matrix direct download
geo_id = "GSE12345"
url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{geo_id[:5]}nnn/{geo_id}/matrix/{geo_id}_series_matrix.txt.gz"
df = pd.read_csv(url, sep="\t", comment="!", index_col=0)

# GTEx bulk expression (median TPM per tissue)
url = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
df = pd.read_csv(url, sep="\t", skiprows=2)

8. Clinical (ClinicalTrials.gov, FDA/OpenFDA, FAERS)

Tools: search_clinical_trials, OpenFDA_*

# ClinicalTrials.gov v2 API with pagination
all_studies = []
token = None
while True:
    params = {"query.cond": "lung cancer", "query.intr": "immunotherapy", "pageSize": 100}
    if token: params["pageToken"] = token
    resp = requests.get("https://clinicaltrials.gov/api/v2/studies", params=params).json()
    all_studies.extend(resp.get("studies", []))
    token = resp.get("nextPageToken")
    if not token: break

9. Literature (PubMed, PMC, EuropePMC)

Tools: PubMed_search_articles, EuropePMC_search_articles

# EuropePMC full-text search with cursor
cursor = "*"
all_results = []
while cursor:
    resp = requests.get("https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params={"query": "BRCA1 AND resistance", "format": "json", "pageSize": 100, "cursorMark": cursor}).json()
    all_results.extend(resp.get("resultList", {}).get("result", []))
    cursor = resp.get("nextCursorMark") if len(all_results) < resp.get("hitCount", 0) else None

10. Data Repositories (Zenodo, Figshare, Dryad, DataCite)

Tools: DataCite_search_dois, Zenodo_search_records, Dryad_search_datasets

# Zenodo: search + download files
record = requests.get("https://zenodo.org/api/records", params={"q": "proteomics cancer", "size": 5}).json()["hits"]["hits"][0]
for f in record["files"]:
    open(f["key"], "wb").write(requests.get(f["links"]["self"]).content)  # save each file under its original name

11-24. Specialized Domains

For these 14 additional domains, read references/specialized-domains.md when you need the specific API pattern (a worked example follows the table):

| # | Domain | Key APIs/Tools | When to Read |
|---|---|---|---|
| 11 | Proteomics | PRIDE, MassIVE, ProteomeXchange | Mass spec data download |
| 12 | Metabolomics | MetaboLights, Metabolomics Workbench, HMDB | Metabolite/spectra data |
| 13 | Microbiome | MGnify, GMREPO | Metagenome profiles |
| 14 | Ecology | GBIF, iNaturalist, OBIS | Species occurrence data |
| 15 | Model Organisms | FlyBase, WormBase, ZFIN, RGD | Gene data for non-human species |
| 16 | Pathways & Networks | Reactome, STRING, BioGRID | Network/pathway export |
| 17 | Ontologies | OLS, GO, HPO | Term hierarchy traversal |
| 18 | Immunology | IEDB, VDJdb, ImmPort | Epitope/receptor data |
| 19 | Drug & Pharma | PharmGKB, DGIdb, SIDER | Drug-gene interactions |
| 20 | Imaging & Atlases | TCIA, HPA, Allen Brain Atlas | Imaging collections |
| 21 | Protein Structure | RCSB PDB, AlphaFold | PDB/CIF file download |
| 22 | Clinical Genomics | ClinVar, ClinGen, CIViC | Variant interpretation bulk |
| 23 | Single-Cell | cellxgene, ARCHS4 | scRNA-seq data portals |
| 24 | Toxicology | CTD, EPA CompTox | Chemical-gene-disease |
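
As one worked example from that file, GBIF (#14) follows the plain offset-pagination pattern (a sketch against GBIF's public v1 occurrence endpoint; the search API caps deep paging, so use GBIF's download service for full exports):

# GBIF occurrence search: open access, offset pagination, 300 records/page max
recs, offset = [], 0
while True:
    page = requests.get("https://api.gbif.org/v1/occurrence/search",
        params={"scientificName": "Puma concolor", "limit": 300, "offset": offset}).json()
    recs.extend(page["results"])
    if page["endOfRecords"]: break
    offset += 300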

Section C: Restricted/Uncovered Data Sources

These sources require registration or have no ToolUniverse tool. The table summarizes each source's access requirements, typical wait time, format, and contents; a sketch of credentialed API access follows it.

Note: ToolUniverse has 2300+ tools — use find_tools("your topic") to discover tools not listed above. Section B covers the most common API patterns; many more databases use the same patterns (e.g., all EBI databases follow the EBI REST pattern in #2).

| Source | Access | Wait Time | Format | Contents |
|---|---|---|---|---|
| UK Biobank | Restricted (institutional) | 2-6 months | CSV/Bulk | 500K participants, genetics + imaging + health records |
| dbGaP | Controlled (PI application) | 1-3 months | SRA/VCF/phenotype | GWAS genotypes + phenotypes from thousands of studies |
| MIMIC-IV | Credentialed (PhysioNet) | 1-2 weeks | CSV/Parquet | ICU clinical data, 300K+ admissions |
| ICPSR | Registration | 1-3 days | Stata/CSV | Social/health science archives (10K+ studies) |
| HRS | Registration | 1-3 days | Stata | Health & Retirement Study, 20K+ older Americans, biennial |
| ELSA | Registration | 1-3 days | Stata/SPSS | English Longitudinal Study of Ageing |
| SHARE | Registration | 1-2 weeks | Stata | Survey of Health, Ageing, Retirement in Europe (28 countries) |
| Materials Project | Free API key | Instant | JSON | 150K+ computed materials properties |
| Human Cell Atlas | Open | Instant | h5ad/loom | Single-cell atlas across human tissues |
| ADNI | Application | 1-2 months | DICOM/CSV | Alzheimer's neuroimaging + biomarkers + cognition |
| OpenNeuro | Open | Instant | NIfTI/BIDS | 800+ neuroimaging datasets |
| CIBERSORTx | Free registration | Instant | GCT/TSV | Cell type deconvolution from bulk expression |
| FlowRepository | Open | Instant | FCS | Flow cytometry experiments |
| SynBioHub | Open | Instant | SBOL/GenBank | Synthetic biology parts and designs |
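
Once credentialed, access is usually one header or query param away. A minimal sketch for Materials Project, assuming its current REST endpoint and X-API-KEY header scheme (verify against their docs; the official mp-api client is the supported route):

import os, requests
headers = {"X-API-KEY": os.environ["MP_API_KEY"]}
resp = requests.get("https://api.materialsproject.org/materials/summary/",
    params={"formula": "TiO2", "_fields": "material_id,band_gap"}, headers=headers)
docs = resp.json()["data"]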

For restricted sources: search literature (PubMed) for published analyses using that dataset. Papers cite their data source and often deposit derived data in public repositories (GEO, SRA, Zenodo).
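
A sketch of that literature route, using the E-utilities pattern from Section B #1:

# Find papers that analyzed a restricted dataset and may link public derived data
base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
ids = requests.get(f"{base}/esearch.fcgi", params={
    "db": "pubmed", "term": '"UK Biobank"[Title/Abstract] AND "summary statistics"',
    "retmax": 200, "retmode": "json"}).json()["esearchresult"]["idlist"]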


Section D: Universal Patterns

Pagination

# Pattern 1: offset + limit (most REST APIs)
all_records = []
offset = 0
while True:
    resp = requests.get(f"{api_url}?offset={offset}&limit=500", timeout=30).json()
    batch = resp.get("data", resp.get("results", resp.get("hits", [])))
    if not batch: break
    all_records.extend(batch)
    offset += len(batch)

# Pattern 2: cursor/token (EuropePMC, ClinicalTrials.gov, UniProt)
token = None
while True:
    params = {"pageSize": 100}
    if token: params["pageToken"] = token
    resp = requests.get(api_url, params=params).json()
    all_records.extend(resp["results"])
    token = resp.get("nextPageToken")
    if not token: break

Rate Limiting & Retries

import time
def fetch_with_retry(url, max_retries=3, **kwargs):
    for attempt in range(max_retries):
        resp = requests.get(url, timeout=30, **kwargs)
        if resp.status_code == 200: return resp
        if resp.status_code == 429:  # rate limited
            wait = int(resp.headers.get("Retry-After", 2 ** attempt))
            time.sleep(wait)
        else:
            time.sleep(2 ** attempt)
    raise RuntimeError(f"Failed after {max_retries} retries: {url}")

Authentication

import os
# API key in header (most common)
headers = {"Authorization": f"Bearer {os.environ.get('API_KEY', '')}"}
# API key as query param
params = {"api_key": os.environ.get("API_KEY", "")}
# No auth needed for most scientific APIs (NCBI, EBI, PubChem, GDC, CDC)

Bulk Download with Streaming

def download_large_file(url, output_path):
    with requests.get(url, stream=True, timeout=300) as r:
        r.raise_for_status()
        with open(output_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
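
For example, the GTEx matrix from Section B #7 streams to disk and then parses as usual:

url = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
download_large_file(url, "gtex_median_tpm.gct.gz")
df = pd.read_csv("gtex_median_tpm.gct.gz", sep="\t", skiprows=2)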

Error Handling

resp = requests.get(url, timeout=30)
if resp.status_code != 200:
    raise ValueError(f"HTTP {resp.status_code}: {resp.text[:200]}")
# Guard against HTML error pages (CDC, NCBI return 200 with HTML for missing files)
if resp.content[:5] in (b"<!DOC", b"<html"):
    raise ValueError(f"Server returned HTML error page for {url}")
data = resp.json()  # raises JSONDecodeError if not valid JSON