bio-batch-processing
SKILL.md
Batch Processing
Process multiple sequence files efficiently using Biopython.
Required Imports
from pathlib import Path
from Bio import SeqIO
Process Multiple Files
Iterate Over Files in Directory
from pathlib import Path
for fasta_file in Path('data/').glob('*.fasta'):
records = list(SeqIO.parse(fasta_file, 'fasta'))
print(f'{fasta_file.name}: {len(records)} sequences')
Process All FASTQ Files
for fq_file in Path('.').glob('*.fastq'):
count = sum(1 for _ in SeqIO.parse(fq_file, 'fastq'))
print(f'{fq_file.name}: {count} reads')
Recursive File Search
for gb_file in Path('data/').rglob('*.gb'):
print(f'Found: {gb_file}')
Merge Files
Merge All FASTA Files
from pathlib import Path
def all_records(directory, pattern, format):
for filepath in Path(directory).glob(pattern):
yield from SeqIO.parse(filepath, format)
records = all_records('data/', '*.fasta', 'fasta')
count = SeqIO.write(records, 'merged.fasta', 'fasta')
print(f'Merged {count} records')
Merge with Source Tracking
def records_with_source(directory, pattern, format):
for filepath in Path(directory).glob(pattern):
for record in SeqIO.parse(filepath, format):
record.description = f'{record.description} [source={filepath.name}]'
yield record
records = records_with_source('data/', '*.fasta', 'fasta')
SeqIO.write(records, 'merged_tracked.fasta', 'fasta')
Merge Specific Files
files = ['sample1.fasta', 'sample2.fasta', 'sample3.fasta']
def merge_files(file_list, format):
for filepath in file_list:
yield from SeqIO.parse(filepath, format)
SeqIO.write(merge_files(files, 'fasta'), 'combined.fasta', 'fasta')
Split Files
Split by Number of Records
from itertools import islice
def split_file(input_file, format, records_per_file, output_prefix):
records = SeqIO.parse(input_file, format)
file_num = 1
while True:
batch = list(islice(records, records_per_file))
if not batch:
break
output_file = f'{output_prefix}_{file_num}.{format}'
SeqIO.write(batch, output_file, format)
print(f'Wrote {len(batch)} records to {output_file}')
file_num += 1
split_file('large.fasta', 'fasta', 1000, 'split')
Split by Sequence ID Prefix
from collections import defaultdict
records_by_prefix = defaultdict(list)
for record in SeqIO.parse('input.fasta', 'fasta'):
prefix = record.id.split('_')[0]
records_by_prefix[prefix].append(record)
for prefix, records in records_by_prefix.items():
SeqIO.write(records, f'{prefix}.fasta', 'fasta')
One Sequence Per File
for record in SeqIO.parse('multi.fasta', 'fasta'):
SeqIO.write(record, f'{record.id}.fasta', 'fasta')
Batch Convert
Convert All Files in Directory
from pathlib import Path
for gb_file in Path('genbank/').glob('*.gb'):
fasta_file = Path('fasta/') / gb_file.with_suffix('.fasta').name
count = SeqIO.convert(str(gb_file), 'genbank', str(fasta_file), 'fasta')
print(f'{gb_file.name} -> {fasta_file.name}: {count} records')
Batch Convert with Summary
from pathlib import Path
results = []
for input_file in Path('input/').glob('*.gb'):
output_file = Path('output/') / input_file.with_suffix('.fasta').name
count = SeqIO.convert(str(input_file), 'genbank', str(output_file), 'fasta')
results.append({'file': input_file.name, 'records': count})
print(f'Converted {len(results)} files, {sum(r["records"] for r in results)} total records')
Parallel Processing
Using multiprocessing
from multiprocessing import Pool
from pathlib import Path
def process_file(filepath):
records = list(SeqIO.parse(filepath, 'fasta'))
return {'file': filepath.name, 'count': len(records), 'total_bp': sum(len(r.seq) for r in records)}
files = list(Path('data/').glob('*.fasta'))
with Pool(4) as pool:
results = pool.map(process_file, files)
for r in results:
print(f'{r["file"]}: {r["count"]} seqs, {r["total_bp"]} bp')
Using concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
def count_records(filepath):
return filepath.name, sum(1 for _ in SeqIO.parse(filepath, 'fasta'))
files = list(Path('data/').glob('*.fasta'))
with ThreadPoolExecutor(max_workers=4) as executor:
results = executor.map(count_records, files)
for name, count in results:
print(f'{name}: {count}')
Summary Statistics
Aggregate Stats Across Files
from pathlib import Path
total_seqs = 0
total_bp = 0
file_count = 0
for fasta_file in Path('data/').glob('*.fasta'):
for record in SeqIO.parse(fasta_file, 'fasta'):
total_seqs += 1
total_bp += len(record.seq)
file_count += 1
print(f'Files: {file_count}')
print(f'Sequences: {total_seqs}')
print(f'Total bp: {total_bp}')
print(f'Average length: {total_bp / total_seqs:.0f}')
Per-File Summary Report
from pathlib import Path
import csv
summaries = []
for fasta_file in Path('data/').glob('*.fasta'):
records = list(SeqIO.parse(fasta_file, 'fasta'))
lengths = [len(r.seq) for r in records]
summaries.append({
'file': fasta_file.name,
'sequences': len(records),
'total_bp': sum(lengths),
'min_len': min(lengths) if lengths else 0,
'max_len': max(lengths) if lengths else 0,
'avg_len': sum(lengths) / len(lengths) if lengths else 0
})
with open('summary.csv', 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=summaries[0].keys())
writer.writeheader()
writer.writerows(summaries)
File Organization
Organize by Criteria
from pathlib import Path
from Bio.SeqUtils import gc_fraction
Path('high_gc').mkdir(exist_ok=True)
Path('low_gc').mkdir(exist_ok=True)
for fasta_file in Path('input/').glob('*.fasta'):
records = list(SeqIO.parse(fasta_file, 'fasta'))
avg_gc = sum(gc_fraction(r.seq) for r in records) / len(records)
if avg_gc >= 0.5:
dest = Path('high_gc') / fasta_file.name
else:
dest = Path('low_gc') / fasta_file.name
SeqIO.write(records, dest, 'fasta')
Common Patterns
| Task | Approach |
|---|---|
| Merge files | Generator yielding from each file |
| Split file | islice with batch size |
| Convert all | Loop with SeqIO.convert |
| Parallel processing | multiprocessing.Pool or ThreadPoolExecutor |
| Summary stats | Accumulate while iterating |
Related Skills
- read-sequences - Core parsing functions for each file
- write-sequences - Write processed outputs
- sequence-statistics - Generate per-file statistics
- format-conversion - Batch format conversion
- compressed-files - Handle compressed files in batch
- database-access - Batch download sequences from NCBI
Weekly Installs
3
Repository
gptomics/bioskillsGitHub Stars
339
First Seen
Jan 24, 2026
Security Audits
Installed on
trae2
windsurf1
opencode1
codex1
claude-code1
antigravity1