skills/gptomics/bioskills/bio-workflow-management-wdl-workflows

bio-workflow-management-wdl-workflows

SKILL.md

WDL Workflows

Basic Task Definition

version 1.0

task fastqc {
    input {
        File fastq
        Int threads = 2
    }

    command <<<
        fastqc -t ~{threads} ~{fastq}
    >>>

    output {
        File html = glob("*_fastqc.html")[0]
        File zip = glob("*_fastqc.zip")[0]
    }

    runtime {
        docker: "biocontainers/fastqc:v0.11.9"
        cpu: threads
        memory: "4 GB"
    }
}

Simple Workflow

version 1.0

workflow rnaseq {
    input {
        File fastq_1
        File fastq_2
        File salmon_index
    }

    call fastp {
        input:
            reads_1 = fastq_1,
            reads_2 = fastq_2
    }

    call salmon_quant {
        input:
            reads_1 = fastp.trimmed_1,
            reads_2 = fastp.trimmed_2,
            index = salmon_index
    }

    output {
        File quant_sf = salmon_quant.quant_file
    }
}

Task with All Sections

version 1.0

task bwa_mem {
    input {
        File reference
        File reference_index
        File reads_1
        File reads_2
        String sample_id
        Int threads = 8
    }

    Int disk_size = ceil(size(reference, "GB") + size(reads_1, "GB") * 3) + 20

    command <<<
        bwa mem -t ~{threads} -R "@RG\tID:~{sample_id}\tSM:~{sample_id}" \
            ~{reference} ~{reads_1} ~{reads_2} | \
            samtools sort -@ ~{threads} -o ~{sample_id}.sorted.bam
        samtools index ~{sample_id}.sorted.bam
    >>>

    output {
        File bam = "~{sample_id}.sorted.bam"
        File bai = "~{sample_id}.sorted.bam.bai"
    }

    runtime {
        docker: "biocontainers/bwa:v0.7.17"
        cpu: threads
        memory: "16 GB"
        disks: "local-disk " + disk_size + " HDD"
    }
}

Scatter (Parallel Execution)

version 1.0

workflow process_samples {
    input {
        Array[File] fastq_files
        File reference
    }

    scatter (fastq in fastq_files) {
        call align {
            input:
                fastq = fastq,
                reference = reference
        }
    }

    output {
        Array[File] bam_files = align.bam
    }
}

Scatter with Paired Files

version 1.0

struct SampleFastqs {
    String sample_id
    File fastq_1
    File fastq_2
}

workflow paired_alignment {
    input {
        Array[SampleFastqs] samples
        File reference
    }

    scatter (sample in samples) {
        call align {
            input:
                sample_id = sample.sample_id,
                reads_1 = sample.fastq_1,
                reads_2 = sample.fastq_2,
                reference = reference
        }
    }

    output {
        Array[File] bams = align.bam
    }
}

Conditional Execution

version 1.0

workflow conditional_qc {
    input {
        File fastq
        Boolean run_qc = true
    }

    if (run_qc) {
        call fastqc {
            input:
                fastq = fastq
        }
    }

    output {
        File? qc_report = fastqc.html
    }
}

Structs and Complex Types

version 1.0

struct ReferenceData {
    File fasta
    File fasta_index
    File dict
    File? known_sites
}

workflow variant_calling {
    input {
        ReferenceData reference
        Array[File] bam_files
    }

    scatter (bam in bam_files) {
        call haplotype_caller {
            input:
                bam = bam,
                ref_fasta = reference.fasta,
                ref_index = reference.fasta_index,
                ref_dict = reference.dict
        }
    }
}

Input JSON

{
    "rnaseq.fastq_1": "data/sample1_R1.fq.gz",
    "rnaseq.fastq_2": "data/sample1_R2.fq.gz",
    "rnaseq.salmon_index": "ref/salmon_index",
    "rnaseq.threads": 8
}

Array Inputs JSON

{
    "process_samples.samples": [
        {
            "sample_id": "sample1",
            "fastq_1": "data/sample1_R1.fq.gz",
            "fastq_2": "data/sample1_R2.fq.gz"
        },
        {
            "sample_id": "sample2",
            "fastq_1": "data/sample2_R1.fq.gz",
            "fastq_2": "data/sample2_R2.fq.gz"
        }
    ],
    "process_samples.reference": "ref/genome.fa"
}

Subworkflows

version 1.0

import "qc.wdl" as qc
import "align.wdl" as align

workflow main_pipeline {
    input {
        File fastq_1
        File fastq_2
        File reference
    }

    call qc.quality_control {
        input:
            reads_1 = fastq_1,
            reads_2 = fastq_2
    }

    call align.alignment {
        input:
            reads_1 = quality_control.trimmed_1,
            reads_2 = quality_control.trimmed_2,
            reference = reference
    }
}

Runtime Options

runtime {
    docker: "ubuntu:20.04"
    cpu: 4
    memory: "8 GB"
    disks: "local-disk 100 HDD"
    preemptible: 3
    maxRetries: 2
    zones: "us-central1-a us-central1-b"
    bootDiskSizeGb: 15
}

String Interpolation and Expressions

version 1.0

task process {
    input {
        String sample_id
        Int memory_gb = 8
        Array[File] input_files
    }

    Int memory_mb = memory_gb * 1000
    String output_name = sample_id + ".processed.bam"

    command <<<
        # Access array elements
        process_tool \
            --memory ~{memory_mb} \
            --inputs ~{sep=' ' input_files} \
            --output ~{output_name}
    >>>

    output {
        File result = output_name
    }
}

File Size and Disk Calculation

version 1.0

task align {
    input {
        File reads_1
        File reads_2
        File reference
    }

    # Calculate disk: input files + 3x for outputs + buffer
    Int disk_gb = ceil(size(reads_1, "GB") + size(reads_2, "GB") +
                       size(reference, "GB") * 2) + 50

    command <<<
        bwa mem ~{reference} ~{reads_1} ~{reads_2} > aligned.sam
    >>>

    runtime {
        disks: "local-disk " + disk_gb + " SSD"
    }
}

Complete RNA-seq Workflow

version 1.0

workflow rnaseq_pipeline {
    input {
        Array[String] sample_ids
        Array[File] fastq_1_files
        Array[File] fastq_2_files
        File salmon_index
        Int threads = 8
    }

    scatter (idx in range(length(sample_ids))) {
        call fastp {
            input:
                sample_id = sample_ids[idx],
                reads_1 = fastq_1_files[idx],
                reads_2 = fastq_2_files[idx],
                threads = threads
        }

        call salmon_quant {
            input:
                sample_id = sample_ids[idx],
                reads_1 = fastp.trimmed_1,
                reads_2 = fastp.trimmed_2,
                index = salmon_index,
                threads = threads
        }
    }

    output {
        Array[File] quant_files = salmon_quant.quant_sf
        Array[File] fastp_reports = fastp.json_report
    }
}

task fastp {
    input {
        String sample_id
        File reads_1
        File reads_2
        Int threads = 4
    }

    command <<<
        fastp -i ~{reads_1} -I ~{reads_2} \
            -o ~{sample_id}_trimmed_R1.fq.gz \
            -O ~{sample_id}_trimmed_R2.fq.gz \
            --json ~{sample_id}_fastp.json \
            --thread ~{threads}
    >>>

    output {
        File trimmed_1 = "~{sample_id}_trimmed_R1.fq.gz"
        File trimmed_2 = "~{sample_id}_trimmed_R2.fq.gz"
        File json_report = "~{sample_id}_fastp.json"
    }

    runtime {
        docker: "quay.io/biocontainers/fastp:0.23.4--hadf994f_2"
        cpu: threads
        memory: "4 GB"
    }
}

task salmon_quant {
    input {
        String sample_id
        File reads_1
        File reads_2
        File index
        Int threads = 8
    }

    command <<<
        salmon quant -i ~{index} -l A \
            -1 ~{reads_1} -2 ~{reads_2} \
            -o ~{sample_id}_salmon \
            --threads ~{threads} --validateMappings
    >>>

    output {
        File quant_sf = "~{sample_id}_salmon/quant.sf"
        File quant_dir = "~{sample_id}_salmon"
    }

    runtime {
        docker: "quay.io/biocontainers/salmon:1.10.0--h7e5ed60_0"
        cpu: threads
        memory: "16 GB"
    }
}

Run Commands

# Validate WDL syntax
womtool validate workflow.wdl

# Generate inputs template
womtool inputs workflow.wdl > inputs.json

# Run with Cromwell (local)
java -jar cromwell.jar run workflow.wdl -i inputs.json

# Run with miniwdl (simpler local runner)
miniwdl run workflow.wdl -i inputs.json

# Run on Terra
# Upload WDL and inputs.json to Terra workspace

Execution Engines

Engine Use Case
Cromwell Full-featured, Google Cloud, AWS, HPC
miniwdl Lightweight local execution
Terra Cloud platform with Cromwell backend
AnVIL NIH cloud platform (Terra-based)
dxWDL DNAnexus platform

Related Skills

  • workflow-management/cwl-workflows - CWL alternative
  • workflow-management/snakemake-workflows - Python-based alternative
  • workflow-management/nextflow-pipelines - Groovy-based alternative
Weekly Installs
3
GitHub Stars
339
First Seen
Jan 24, 2026
Installed on
trae2
windsurf1
opencode1
codex1
claude-code1
antigravity1