data-science
SKILL.md
Data Science Guide
Statistical modeling, experimentation, and advanced analytics.
When to Use
- Designing A/B tests and experiments
- Building predictive models
- Performing causal analysis
- Feature engineering
- Statistical hypothesis testing
Tech Stack
| Category | Tools |
|---|---|
| Languages | Python, SQL, R |
| Analysis | NumPy, Pandas, SciPy |
| ML | Scikit-learn, XGBoost, LightGBM |
| Visualization | Matplotlib, Seaborn, Plotly |
| Statistics | Statsmodels, PyMC |
| Notebooks | Jupyter, VS Code |
Experiment Design
A/B Test Framework
import scipy.stats as stats
import numpy as np
def calculate_sample_size(baseline_rate, mde, alpha=0.05, power=0.8):
    """Calculate the required per-group sample size for an A/B test.

    Uses the standard two-sided normal-approximation formula
    n = ((z_{1-alpha/2} + z_{power}) / effect_size)**2, where the effect
    size is the absolute MDE standardized by the baseline proportion's
    standard deviation.

    Args:
        baseline_rate: Expected control-group conversion rate, in (0, 1).
        mde: Minimum detectable effect in absolute terms (e.g. 0.005
            means a 0.5-percentage-point lift).
        alpha: Two-sided significance level. Defaults to 0.05.
        power: Desired statistical power. Defaults to 0.8.

    Returns:
        Required sample size per group, rounded up to a whole subject.
    """
    effect_size = mde / np.sqrt(baseline_rate * (1 - baseline_rate))
    # NOTE: scipy.stats has no TTestIndPower (that class lives in
    # statsmodels.stats.power), so compute the z-approximation directly
    # with the imports this snippet actually has.
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_power = stats.norm.ppf(power)
    # Round up: a fractional participant cannot be collected.
    return int(np.ceil(((z_alpha + z_power) / effect_size) ** 2))
# Example: 5% baseline, 0.5pp absolute (10% relative) lift
n = calculate_sample_size(0.05, 0.005)
print(f"Required sample size per group: {n}")
Statistical Significance
def analyze_ab_test(control, treatment):
    """Analyze A/B test results with a two-proportion z-test.

    Args:
        control: Array-like of 0/1 outcomes for the control group.
        treatment: Array-like of 0/1 outcomes for the treatment group.

    Returns:
        dict with control/treatment conversion rates, relative lift,
        two-sided p-value, and a significance flag at alpha = 0.05.
    """
    n1, n2 = len(control), len(treatment)
    p1, p2 = control.mean(), treatment.mean()
    # Pooled proportion under H0: both groups share one conversion rate.
    p_pool = (control.sum() + treatment.sum()) / (n1 + n2)
    se = np.sqrt(p_pool * (1 - p_pool) * (1 / n1 + 1 / n2))
    z = (p2 - p1) / se
    # The survival function is numerically stable in the tail, unlike
    # 1 - cdf(|z|), which underflows to exactly 0 for large |z|.
    p_value = 2 * stats.norm.sf(abs(z))
    # Cast numpy scalars to plain Python types so the result dict is
    # JSON-serializable and `significant` is a real bool.
    return {
        'control_rate': float(p1),
        'treatment_rate': float(p2),
        'lift': float((p2 - p1) / p1),
        'p_value': float(p_value),
        'significant': bool(p_value < 0.05)
    }
Feature Engineering
Common Patterns
import pandas as pd
from sklearn.preprocessing import StandardScaler
def engineer_features(df):
    """Feature engineering pipeline.

    Adds temporal, per-user aggregate, and ratio features.

    Args:
        df: DataFrame with at least 'timestamp' (datetime64),
            'user_id', and 'amount' columns.

    Returns:
        A new DataFrame with the engineered columns added; the input
        frame is left untouched.
    """
    # Copy first so we never mutate the caller's frame in place.
    df = df.copy()
    # Temporal features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    # dayofweek: Monday=0 ... Sunday=6, so 5/6 are the weekend.
    df['is_weekend'] = df['day_of_week'].isin([5, 6])
    # Per-user aggregations, broadcast back to every row via transform
    df['user_avg_spend'] = df.groupby('user_id')['amount'].transform('mean')
    df['user_transaction_count'] = df.groupby('user_id')['amount'].transform('count')
    # Ratio of each transaction to that user's average spend
    df['spend_vs_avg'] = df['amount'] / df['user_avg_spend']
    return df
Feature Selection
from sklearn.feature_selection import mutual_info_classif
def select_features(X, y, k=10, random_state=None):
    """Select the top k features ranked by mutual information with y.

    Args:
        X: Feature DataFrame.
        y: Target labels.
        k: Number of features to keep. Defaults to 10. If k exceeds the
            number of columns, all columns are returned.
        random_state: Seed forwarded to mutual_info_classif. MI
            estimation uses nearest-neighbor randomization, so pass a
            seed for reproducible rankings. Defaults to None, matching
            the previous (nondeterministic) behavior.

    Returns:
        List of the k column names with the highest MI scores, in
        ascending score order.
    """
    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    # argsort is ascending; the last k indices are the top scorers.
    top_k = np.argsort(mi_scores)[-k:]
    return X.columns[top_k].tolist()
Model Evaluation
Cross-Validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
def evaluate_model(model, X, y):
    """Evaluate a classifier with stratified 5-fold cross-validation.

    Args:
        model: Unfitted scikit-learn compatible classifier.
        X: Feature matrix.
        y: Target vector (precision/recall/roc_auc assume binary labels).

    Returns:
        dict mapping metric name -> "mean (+/- 2*std)" formatted string.
    """
    # Local import: cross_validate scores all metrics on each fold with
    # a single fit, instead of re-running the whole CV loop per metric
    # as four separate cross_val_score calls would.
    from sklearn.model_selection import cross_validate
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Map sklearn scorer names to the report keys used by callers.
    scorers = {'accuracy': 'accuracy', 'precision': 'precision',
               'recall': 'recall', 'roc_auc': 'auc'}
    results = cross_validate(model, X, y, cv=cv, scoring=list(scorers))
    return {
        label: f"{results['test_' + metric].mean():.3f} "
               f"(+/- {results['test_' + metric].std() * 2:.3f})"
        for metric, label in scorers.items()
    }
Causal Inference
Propensity Score Matching
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
def propensity_matching(df, treatment_col, features):
    """Match treated to control units on estimated propensity scores.

    Performs 1:1 nearest-neighbor matching WITH replacement: the same
    control unit may be matched to several treated units.

    Args:
        df: DataFrame holding the covariates and treatment indicator.
        treatment_col: Name of the 0/1 treatment indicator column.
        features: List of covariate column names used to model treatment.

    Returns:
        (treated, matched_control) DataFrames of equal length, aligned
        row-for-row; both carry an added 'propensity' column. The input
        frame is left untouched.
    """
    # Copy so adding the 'propensity' column never mutates the caller's frame.
    df = df.copy()
    # Estimate P(treatment | features) with logistic regression.
    ps_model = LogisticRegression()
    ps_model.fit(df[features], df[treatment_col])
    df['propensity'] = ps_model.predict_proba(df[features])[:, 1]
    treated = df[df[treatment_col] == 1]
    control = df[df[treatment_col] == 0]
    # For each treated unit, pick the control with the closest score.
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(control[['propensity']])
    _, indices = nn.kneighbors(treated[['propensity']])
    return treated, control.iloc[indices.flatten()]
Best Practices
Analysis Workflow
- Define hypothesis clearly
- Calculate required sample size
- Design experiment (randomization)
- Collect data with quality checks
- Analyze with appropriate tests
- Report with confidence intervals
Common Pitfalls
- Multiple comparisons without correction
- Peeking at results before sample size reached
- Simpson's paradox in aggregations
- Survivorship bias in cohort analysis
- Correlation vs causation confusion
Weekly Installs
33
Repository
eyadsibai/ltk
First Seen
Jan 28, 2026
Security Audits
Installed on
gemini-cli: 27
opencode: 26
claude-code: 23
github-copilot: 23
codex: 23
antigravity: 21