# Feature Engineering

Creating informative features for ML models.
## Feature Types

### Numerical Features
```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures

# Scaling
scaler = StandardScaler()        # mean=0, std=1
robust_scaler = RobustScaler()   # uses median/IQR, robust to outliers

# Log transform (for right-skewed data)
df['log_income'] = np.log1p(df['income'])

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Binning
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 55, 100],
                         labels=['youth', 'young_adult', 'middle', 'senior'])
```
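
The scalers above are only instantiated, not applied. A minimal usage sketch (the column names `income` and `age` are hypothetical):

```python
# Sketch only: assumes df has numeric columns 'income' and 'age'
num_cols = ['income', 'age']
df[num_cols] = scaler.fit_transform(df[num_cols])          # standardize in place
# Or, when heavy outliers are expected:
# df[num_cols] = robust_scaler.fit_transform(df[num_cols])
```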
### Categorical Features
```python
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import FeatureHasher

# One-hot encoding (sparse_output replaces the deprecated `sparse` argument in scikit-learn >= 1.2)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = ohe.fit_transform(df[['category']])

# Target encoding, smoothed toward the global mean
def target_encode(df, col, target, smoothing=10):
    global_mean = df[target].mean()
    agg = df.groupby(col)[target].agg(['mean', 'count'])
    smooth = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    return df[col].map(smooth)

# Hash encoding (for high-cardinality categories)
hasher = FeatureHasher(n_features=100, input_type='string')
```
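
Neither helper above is applied in the snippet. A minimal usage sketch, assuming `df` has a high-cardinality column `category` and a target column `label` (both hypothetical):

```python
# Sketch only: 'category' and 'label' are hypothetical column names
df['category_te'] = target_encode(df, 'category', 'label')

# FeatureHasher with input_type='string' expects an iterable of token lists, one per row
hashed = hasher.transform(df['category'].apply(lambda v: [v]))  # sparse (n_rows, 100) matrix
```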
### Text Features
```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# TF-IDF on unigrams and bigrams
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
text_features = tfidf.fit_transform(df['text'])

# Dense sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].tolist())

# Simple text statistics
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()
df['avg_word_length'] = df['text'].str.split().apply(lambda x: np.mean([len(w) for w in x]))
```
### Temporal Features
```python
import numpy as np

# Datetime components
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Cyclical encoding, so hour 23 ends up close to hour 0
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Lag and rolling features (assumes rows are sorted by timestamp)
df['lag_1'] = df['value'].shift(1)
df['lag_7'] = df['value'].shift(7)
df['rolling_mean_7'] = df['value'].rolling(window=7).mean()
```
## Feature Selection
```python
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Filter method: keep the 50 features with the highest mutual information
selector = SelectKBest(mutual_info_classif, k=50)
X_selected = selector.fit_transform(X, y)

# Embedded method: tree-based feature importance
rf = RandomForestClassifier()
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=feature_names)

# Wrapper method: Recursive Feature Elimination
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=20)
X_rfe = rfe.fit_transform(X, y)
```
## Feature Store
```python
from datetime import timedelta

from feast import Entity, FeatureStore, FeatureView, Field, FileSource
from feast.types import Float32

# Definitions below assume a recent Feast release (Field/schema-style API)
user = Entity(name="user", join_keys=["user_id"])

user_features = FeatureView(
    name="user_features",
    entities=[user],
    schema=[
        Field(name="total_purchases", dtype=Float32),
        Field(name="avg_order_value", dtype=Float32),
    ],
    ttl=timedelta(days=1),
    source=FileSource(path="data/user_features.parquet"),
)

store = FeatureStore(repo_path=".")

# Get point-in-time correct features for training
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=["user_features:total_purchases"],
).to_df()

# Get fresh features for online inference
online_features = store.get_online_features(
    entity_rows=[{"user_id": 123}],
    features=["user_features:total_purchases"],
).to_dict()
```
## Commands

- `/omgfeature:extract` - Extract features
- `/omgfeature:select` - Select features
- `/omgfeature:store` - Feature store ops
## Best Practices
- Start with simple features
- Use domain knowledge
- Validate feature distributions
- Document feature definitions
- Monitor feature drift in production (see the PSI sketch below)
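
One common drift check is the Population Stability Index (PSI) between a training-time snapshot and recent production data. A minimal sketch, assuming two numeric arrays `expected` (training) and `actual` (production) for the same feature:

```python
import numpy as np

def psi(expected, actual, bins=10):
    """Population Stability Index between two samples of one feature."""
    # Bin edges come from the reference (training) distribution
    edges = np.histogram_bin_edges(expected, bins=bins)
    e_pct = np.histogram(expected, bins=edges)[0] / len(expected)
    a_pct = np.histogram(actual, bins=edges)[0] / len(actual)
    # Clip to avoid log(0) and division by zero in empty bins
    e_pct = np.clip(e_pct, 1e-6, None)
    a_pct = np.clip(a_pct, 1e-6, None)
    return float(np.sum((a_pct - e_pct) * np.log(a_pct / e_pct)))

# Common rule of thumb: PSI < 0.1 stable, 0.1-0.25 moderate shift, > 0.25 significant drift
```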