python-analytics
SKILL.md
Python Analytics Skill
Overview
Comprehensive Python data analysis patterns for healthcare analytics, medical imaging informatics, and operational intelligence. Covers pandas/numpy workflows, statistical testing, visualization, Jupyter best practices, and automated reporting.
Type
domain-expertise
When to Use
Trigger this skill when:
- Data manipulation with pandas/numpy
- Statistical analysis or hypothesis testing
- Data visualization (matplotlib, seaborn, plotly)
- Jupyter notebook development
- Healthcare/PACS analytics
- Automated report generation
- Data cleaning workflows
Keywords: pandas, numpy, scipy, matplotlib, seaborn, plotly, jupyter, notebook, EDA, statistics, visualization, PACS, DICOM, healthcare analytics
1. Data Manipulation & Cleaning (Pandas/NumPy)
Core Patterns
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load with date parsing
df = pd.read_csv('data.csv', parse_dates=['StudyDate', 'SeriesDate'])

# Clean and standardize text fields
df['Modality'] = df['Modality'].str.upper().str.strip()
df['Description'] = df['Description'].str.strip().fillna('UNKNOWN')

# Handle missing ages: impute the per-modality median FIRST, then fall back
# to the global median for rows whose modality group has no data at all.
# (Doing the global fillna first would leave no NaNs for the group-wise
# transform to fill, making it dead code.)
df['PatientAge'] = df.groupby('Modality')['PatientAge'].transform(
    lambda x: x.fillna(x.median())
)
df['PatientAge'] = df['PatientAge'].fillna(df['PatientAge'].median())

# Create derived features
df['StudyHour'] = df['StudyDate'].dt.hour
df['DayOfWeek'] = df['StudyDate'].dt.day_name()

# Remove duplicates (keep the first occurrence of each study)
df = df.drop_duplicates(subset=['StudyInstanceUID'], keep='first')

# Aggregations: per-modality, per-year totals and storage statistics
summary = df.groupby(['Modality', df['StudyDate'].dt.year]).agg({
    'StudyCount': 'sum',
    'SizeGB': ['sum', 'mean']
}).reset_index()
DICOM Metadata Cleaning
def clean_dicom_metadata(df):
    """Comprehensive DICOM metadata cleaning.

    Standardizes the date columns, maps modality aliases onto canonical
    codes, parses DICOM-style age strings, and flags implausible values.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected to contain StudyDate/SeriesDate/AcquisitionDate,
        Modality and PatientAge columns.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy with an added 'QualityFlag' column; the input
        frame is not modified.
    """
    df = df.copy()

    # Standardize dates; unparseable values become NaT.
    # (The deprecated infer_datetime_format= kwarg was dropped: since
    # pandas 2.0 format inference is the default behavior.)
    date_cols = ['StudyDate', 'SeriesDate', 'AcquisitionDate']
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    # Standardize modality codes. Unmapped codes keep their UPPERCASED
    # form — filling from the raw column would reintroduce mixed case.
    modality_mapping = {
        'CR': 'CR', 'DX': 'CR',
        'CT': 'CT', 'MR': 'MR', 'MRI': 'MR',
        'US': 'US', 'ULTRASOUND': 'US',
        'NM': 'NM', 'PT': 'PT',
        'MG': 'MG', 'MAMMO': 'MG'
    }
    modality_upper = df['Modality'].str.upper()
    df['Modality'] = modality_upper.map(modality_mapping).fillna(modality_upper)

    # Parse age (handles '052Y', '52', etc.; anything else becomes NaN).
    def parse_age(age_str):
        if pd.isna(age_str):
            return np.nan
        age_str = str(age_str).strip()
        try:
            # Both branches are inside the try: stripping 'Y' can still
            # leave non-numeric text (e.g. 'UNKNOWNY').
            if 'Y' in age_str:
                return int(age_str.replace('Y', ''))
            return int(age_str)
        except ValueError:  # narrow catch instead of a bare except
            return np.nan

    df['PatientAge'] = df['PatientAge'].apply(parse_age)

    # Flag suspicious data; later rules overwrite earlier flags.
    df['QualityFlag'] = 'OK'
    df.loc[df['PatientAge'] > 120, 'QualityFlag'] = 'SUSPICIOUS_AGE'
    df.loc[df['StudyDate'] < '1990-01-01', 'QualityFlag'] = 'SUSPICIOUS_DATE'
    return df
Healthcare Workflow Metrics
# Turnaround time (TAT): hours between exam completion and report signing.
# Assumes both columns are already datetime64 — TODO confirm upstream parsing.
workflow['TAT_hours'] = (
    workflow['ReportSignedTime'] - workflow['ExamCompleteTime']
).dt.total_seconds() / 3600

# Remove outliers (>99th percentile) so a handful of extreme studies do
# not dominate the summary statistics.
p99 = workflow['TAT_hours'].quantile(0.99)
workflow_clean = workflow[workflow['TAT_hours'] <= p99]

# Stratified summary: mean/median/std plus the 95th percentile for each
# priority-and-modality stratum, rounded for reporting.
tat_summary = workflow_clean.groupby(['Priority', 'Modality']).agg({
    'TAT_hours': ['mean', 'median', 'std', lambda x: x.quantile(0.95)]
}).round(2)
2. Statistical Analysis & Hypothesis Testing
A/B Testing for Workflow Changes
from scipy import stats
import numpy as np
def compare_interventions(baseline_df, intervention_df, metric='TAT_hours'):
    """Compare workflow metrics before/after an intervention.

    H0: no difference | H1: intervention reduced the metric
    (one-sided: baseline > intervention).

    Parameters
    ----------
    baseline_df, intervention_df : pandas.DataFrame
        Must contain `metric` as a numeric column; NaNs are dropped.
    metric : str
        Column to compare (default 'TAT_hours').

    Returns
    -------
    dict
        'p_value', 'cohens_d', and 'significant' (p < 0.05).
    """
    baseline = baseline_df[metric].dropna()
    intervention = intervention_df[metric].dropna()

    # Descriptive statistics
    print(f"Baseline: mean={baseline.mean():.2f}, median={baseline.median():.2f}, n={len(baseline)}")
    print(f"Intervention: mean={intervention.mean():.2f}, median={intervention.median():.2f}, n={len(intervention)}")

    # Normality check. Shapiro-Wilk is capped at 5000 points; fix the
    # random_state so repeated runs subsample (and therefore branch)
    # identically — the original unseeded sample made test choice
    # non-reproducible.
    _, p_base = stats.shapiro(baseline.sample(min(5000, len(baseline)), random_state=0))
    _, p_int = stats.shapiro(intervention.sample(min(5000, len(intervention)), random_state=0))

    # Non-parametric test if either sample looks non-normal.
    if p_base < 0.05 or p_int < 0.05:
        stat, p = stats.mannwhitneyu(baseline, intervention, alternative='greater')
        test_name = "Mann-Whitney U (non-parametric)"
    else:
        stat, p = stats.ttest_ind(baseline, intervention, alternative='greater')
        test_name = "Independent t-test"

    # Effect size: Cohen's d with the standard (n-1)-weighted pooled SD,
    # correct even when the two groups differ in size (the unweighted
    # average of variances is only right for equal n).
    n1, n2 = len(baseline), len(intervention)
    pooled_std = np.sqrt(
        ((n1 - 1) * baseline.std() ** 2 + (n2 - 1) * intervention.std() ** 2)
        / (n1 + n2 - 2)
    )
    cohens_d = (baseline.mean() - intervention.mean()) / pooled_std

    print(f"\n{test_name}: stat={stat:.4f}, p={p:.4f}, Cohen's d={cohens_d:.3f}")
    if p < 0.05:
        improvement = baseline.mean() - intervention.mean()
        pct = (improvement / baseline.mean()) * 100
        print(f"Significant: {improvement:.2f} ({pct:.1f}%) improvement")

    return {'p_value': p, 'cohens_d': cohens_d, 'significant': p < 0.05}
Trend Analysis
from scipy.stats import linregress
def analyze_trends(df, date_col='StudyDate', value_col='StudyCount'):
    """Analyze volume trends via a linear fit on monthly totals.

    Parameters
    ----------
    df : pandas.DataFrame with a datetime column `date_col`.
    date_col, value_col : str
        Timestamp column and the metric to sum per month.

    Returns
    -------
    pandas.DataFrame
        Monthly totals with an added 'MonthNum' ordinal column.

    Raises
    ------
    ValueError
        If fewer than two monthly buckets exist (no trend to fit).
    """
    # 'MS' (month start) is valid on both old and new pandas, unlike the
    # 'M' alias that pandas 2.2 deprecated in favor of 'ME'. Monthly sums
    # are identical; only the bucket labels shift to the month start.
    monthly = df.groupby(pd.Grouper(key=date_col, freq='MS'))[value_col].sum().reset_index()
    monthly['MonthNum'] = range(len(monthly))

    # linregress needs at least two points; fail loudly instead of with
    # an opaque scipy error.
    if len(monthly) < 2:
        raise ValueError("need at least two monthly buckets to fit a trend")

    slope, intercept, r, p, _ = linregress(monthly['MonthNum'], monthly[value_col])
    annual_growth = slope * 12
    pct_growth = (annual_growth / monthly[value_col].mean()) * 100
    print(f"Slope: {slope:.2f}/month, R2: {r**2:.3f}, p: {p:.4f}")
    print(f"Annual: {annual_growth:.1f} ({pct_growth:+.1f}%)")
    return monthly
Outlier Detection
def detect_outliers(df, columns, method='iqr', threshold=1.5):
    """Detect outliers per column using the IQR rule or Z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of column names to check.
    method : {'iqr', 'zscore'}
        Tukey fences at threshold*IQR, or |z| > threshold.
    threshold : float
        Fence multiplier (iqr) or absolute z cutoff (zscore).

    Returns
    -------
    pandas.DataFrame
        One boolean '<col>_outlier' column per input column,
        index-aligned with `df`; NaN values are never flagged.
    """
    outliers = pd.DataFrame(index=df.index)
    for col in columns:
        if method == 'iqr':
            Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
            IQR = Q3 - Q1
            mask = (df[col] < Q1 - threshold * IQR) | (df[col] > Q3 + threshold * IQR)
        elif method == 'zscore':
            # nan_policy='omit' keeps the result the same length as df,
            # with NaN at missing positions (NaN > threshold is False).
            # The original dropna() produced a shorter array that was
            # misaligned with df's index whenever NaNs were present.
            z = np.abs(stats.zscore(df[col], nan_policy='omit'))
            mask = pd.Series(np.asarray(z) > threshold, index=df.index)
        else:
            # Previously an unknown method fell through to a NameError.
            raise ValueError(f"unknown method: {method!r}")
        outliers[f'{col}_outlier'] = mask
        print(f"{col}: {mask.sum()} outliers ({mask.sum()/len(df)*100:.2f}%)")
    return outliers
3. Data Visualization
Matplotlib/Seaborn Patterns
import matplotlib.pyplot as plt
import seaborn as sns

# Set publication defaults once, up front, so every figure created below
# inherits the same grid style, canvas size, and font scale.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11
def plot_modality_distribution(df, save_path=None):
    """Publication-quality modality distribution: bar + pie of the top 10.

    Shows the figure; also writes it to `save_path` (300 dpi) when given.
    """
    top_counts = df['Modality'].value_counts().head(10)
    palette = sns.color_palette("husl", len(top_counts))

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: absolute study counts.
    top_counts.plot(kind='bar', ax=bar_ax, color=palette)
    bar_ax.set_title('Top 10 Modalities', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('Modality')
    bar_ax.set_ylabel('Study Count')
    bar_ax.tick_params(axis='x', rotation=45)

    # Annotate each bar with its count, nudged 1% above the tallest bar's scale.
    label_offset = top_counts.max() * 0.01
    for position, value in enumerate(top_counts):
        bar_ax.text(position, value + label_offset, f'{value:,}', ha='center', fontsize=10)

    # Right panel: the same data as percentage shares.
    pie_ax.pie(top_counts, labels=top_counts.index, autopct='%1.1f%%', colors=palette)
    pie_ax.set_title('Distribution (%)', fontsize=14, fontweight='bold')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
Plotly Interactive Dashboards
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def create_volume_dashboard(df):
    """Interactive PACS volume dashboard as a 2x2 plotly subplot grid.

    Returns the plotly Figure; callers render it with .show() or export it.
    """
    daily = (
        df.groupby([df['StudyDate'].dt.date, 'Modality'])
        .size()
        .reset_index(name='Count')
    )

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Daily Volumes', 'Modality Distribution', 'Hourly Pattern', 'Weekly Pattern'),
        specs=[[{"type": "scatter"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )

    # Panel (1,1): one line per modality over calendar days.
    for modality in daily['Modality'].unique():
        subset = daily[daily['Modality'] == modality]
        fig.add_trace(
            go.Scatter(x=subset.iloc[:, 0], y=subset['Count'], name=modality, mode='lines'),
            row=1, col=1,
        )

    # Panel (1,2): total studies per modality.
    totals = df['Modality'].value_counts()
    fig.add_trace(go.Bar(x=totals.index, y=totals.values, marker_color='lightblue'), row=1, col=2)

    # Panel (2,1): volume by hour of day.
    hourly = df.groupby(df['StudyDate'].dt.hour).size()
    fig.add_trace(go.Bar(x=hourly.index, y=hourly.values, marker_color='lightgreen'), row=2, col=1)

    # Panel (2,2): volume by weekday, forced into calendar order.
    dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    dow = df.groupby(df['StudyDate'].dt.day_name()).size().reindex(dow_order)
    fig.add_trace(go.Bar(x=dow.index, y=dow.values, marker_color='coral'), row=2, col=2)

    fig.update_layout(height=800, title_text="PACS Volume Dashboard", showlegend=True)
    return fig
4. Jupyter Notebook Best Practices
Standard Notebook Structure
# Cell 1: Setup and Configuration
"""
Analysis Title - Date
Author: User
Purpose: [Clear objective]
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime  # was missing: datetime.now() is used below
import warnings

# Deliberate notebook-wide choice: hide library warnings so narrative
# output stays readable. Remove this line while debugging.
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
print(f"Analysis started: {datetime.now()}")

# Cell 2: Data Loading
DATA_PATH = '/path/to/data.csv'
df = pd.read_csv(DATA_PATH, parse_dates=['DateCol'])
print(f"Loaded {len(df):,} records")
df.head()

# Cell 3: Data Quality Check
print(f"Missing values:\n{df.isnull().sum()}")
print(f"Duplicates: {df.duplicated().sum()}")

# Cell 4: Analysis
# ... analysis code ...

# Cell 5: Visualization
# ... plots ...

# Cell 6: Summary
print("\nKEY FINDINGS:")
print("1. ...")
Parameterized Notebooks (papermill)
# Tag this cell as 'parameters' in Jupyter so papermill can inject values.
# These module-level defaults run when the notebook executes unparameterized.
START_DATE = '2025-01-01'
END_DATE = '2025-12-31'
MODALITIES = ['CT', 'MR']
OUTPUT_PATH = '/path/to/reports/'
# Run: papermill notebook.ipynb output.ipynb -p START_DATE '2025-06-01'
Notebook to Production Script
#!/path/to/venv/bin/python
"""Production script converted from notebook."""
import argparse
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def load_data(path):
    """Load the study-level CSV, parsing StudyDate as a datetime."""
    # Lazy %-style args let logging skip string formatting when the
    # level is disabled (preferred over an eagerly-built f-string).
    logging.info("Loading %s", path)
    return pd.read_csv(path, parse_dates=['StudyDate'])


def main():
    """CLI entry point: parse arguments, load input, run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    args = parser.parse_args()

    df = load_data(args.input)
    # Process...


if __name__ == '__main__':
    main()
5. Automated Reporting
HTML Report Generation
def generate_weekly_report(df, week_start, week_end):
    """Generate an HTML weekly volume report.

    Parameters
    ----------
    df : pandas.DataFrame with 'StudyDate' (datetime) and 'Modality'.
    week_start, week_end : inclusive datetime bounds of the reporting week.

    Returns
    -------
    str
        A complete standalone HTML document.
    """
    # Slice the reporting week (inclusive on both ends).
    in_week = (df['StudyDate'] >= week_start) & (df['StudyDate'] <= week_end)
    week_df = df[in_week]
    total = len(week_df)
    by_modality = week_df['Modality'].value_counts()
    daily_avg = total / 7

    # Week-over-week delta against the preceding 7 days (half-open upper
    # bound so the boundary day is not double-counted).
    prev_start = week_start - timedelta(days=7)
    prev_mask = (df['StudyDate'] >= prev_start) & (df['StudyDate'] < week_start)
    prev_total = int(prev_mask.sum())
    pct_change = ((total - prev_total) / prev_total * 100) if prev_total > 0 else 0

    parts = [f"""
<html>
<head><style>
body {{ font-family: Arial, sans-serif; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; }}
th {{ background-color: #3498db; color: white; }}
</style></head>
<body>
<h1>Weekly Volume Report</h1>
<p>Week: {week_start:%Y-%m-%d} to {week_end:%Y-%m-%d}</p>
<h2>Summary</h2>
<p>Total Studies: {total:,} ({pct_change:+.1f}% vs prior week)</p>
<p>Daily Average: {daily_avg:.0f}</p>
<h2>By Modality</h2>
<table><tr><th>Modality</th><th>Count</th><th>%</th></tr>
"""]
    for modality, count in by_modality.items():
        parts.append(f"<tr><td>{modality}</td><td>{count:,}</td><td>{count/total*100:.1f}%</td></tr>")
    parts.append(f"</table><p><em>Generated: {datetime.now()}</em></p></body></html>")
    return "".join(parts)
6. Performance Optimization
Best Practices
# Vectorization over loops: one C-level columnar operation
df['result'] = df['col1'] * df['col2']  # Good
# for i in range(len(df)): df.loc[i, 'result'] = ...  # Bad: Python-level loop

# Chunked reading for large files keeps peak memory bounded
for chunk in pd.read_csv('large.csv', chunksize=100000):
    process(chunk)

# Efficient dtypes: 'category' for low-cardinality strings, narrow ints
df['category_col'] = df['category_col'].astype('category')
df['int_col'] = df['int_col'].astype('int32')

# Query over boolean indexing for complex filters (reads closer to SQL)
df.query('age > 30 and status == "active"')

# Memory cleanup: drop the reference, then force a collection pass
del large_df
import gc; gc.collect()
7. Medical Imaging Specifics
DICOM Extraction
import pydicom
import os
def extract_dicom_metadata(dicom_dir):
    """Walk a directory tree and extract key header fields from DICOM files.

    Parameters
    ----------
    dicom_dir : str or os.PathLike
        Root directory to scan recursively.

    Returns
    -------
    pandas.DataFrame
        One row per readable .dcm file; unreadable files are reported
        and skipped (best-effort scan).
    """
    metadata = []
    for root, _, files in os.walk(dicom_dir):
        for fname in files:
            # Case-insensitive match: real archives mix '.dcm' and '.DCM'.
            if not fname.lower().endswith('.dcm'):
                continue
            path = os.path.join(root, fname)
            try:
                # stop_before_pixels skips the (large) pixel data payload;
                # only header attributes are needed here.
                ds = pydicom.dcmread(path, stop_before_pixels=True)
                metadata.append({
                    'StudyInstanceUID': str(ds.StudyInstanceUID),
                    'Modality': str(ds.Modality),
                    'StudyDate': str(ds.StudyDate),
                    'PatientID': str(ds.PatientID),
                    'StudyDescription': str(getattr(ds, 'StudyDescription', ''))
                })
            except Exception as e:
                # Include the failing path so the error is actionable.
                print(f"Error reading {path}: {e}")
    return pd.DataFrame(metadata)
Best Practices Summary
| Category | Practice |
|---|---|
| Code Quality | Type hints, docstrings, try/except, logging |
| Performance | Vectorize, chunk large files, efficient dtypes |
| Reproducibility | Random seeds, version tracking, parameterization |
| Healthcare | Validate assumptions, handle missing data, communicate uncertainty |
Communication Style
- Provide working code with actual data patterns
- Explain statistical methodology and assumptions
- Flag data quality issues proactively
- Suggest appropriate visualizations
- Include interpretation, not just code
- Follow "Actually Works" protocol
Last updated: 2024-12-23
Weekly Installs
1
Repository
seqis/openclaw-claude-code
GitHub Stars
31
First Seen
13 days ago
Security Audits
Installed on
amp: 1
cline: 1
openclaw: 1
opencode: 1
cursor: 1
kimi-cli: 1