#!/usr/bin/env python3
"""Self-contained tuning script for RunPod GPU.

Downloads training data from HF Hub, tunes XGBoost (on GPU when one is
detected) and LightGBM (always on CPU), builds an ensemble, and uploads
results back to HF Hub.

Usage on RunPod:
    # One-liner to download and run:
    pip install xgboost lightgbm optuna pandas pyarrow huggingface_hub scikit-learn joblib loguru && \
    python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('datamatters24/f1-race-data', 'training_dataset.parquet', repo_type='dataset', local_dir='/workspace')" && \
    python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('datamatters24/f1-race-predictor-model', 'runpod_tune.py', local_dir='/workspace')" && \
    cd /workspace && python3 runpod_tune.py
"""

import json
import os
from pathlib import Path

import joblib
import numpy as np
import optuna
import pandas as pd
from loguru import logger
from sklearn.metrics import log_loss

# ── Config ──────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_REPO = "datamatters24/f1-race-predictor-model"
N_TRIALS = 200

# Feature columns must match feature_store.py exactly
FEATURE_COLUMNS = [
    "grid_position",
    "rolling_avg_finish_3",
    "rolling_avg_finish_5",
    "season_points_pct",
    "track_avg_finish",
    "track_best_finish",
    "track_starts",
    "dnf_rate_rolling_10",
    "quali_delta_teammate",
    "practice_pace_pct",
    "tyre_deg_rate",
    "constructor_avg_finish_5",
    "constructor_dnf_rate_10",
    "constructor_season_points_pct",
    "constructor_avg_speed_trap",
    "pit_stop_avg_ms",
    "air_temp",
    "track_temp",
    "rainfall",
    "safety_car_prob",
    "grid_position_sq",
]
TARGET = "is_winner"

# Detect GPU — XGBoost 3.x uses device="cuda" + tree_method="hist"
# The probe fits a one-tree model on a 2x2 array; any failure (including a
# missing xgboost install) is treated as "no GPU" and logged.
HAS_GPU = False
try:
    import xgboost as xgb

    xgb_gpu = xgb.XGBClassifier(tree_method="hist", device="cuda", n_estimators=1)
    xgb_gpu.fit(np.array([[1, 2], [3, 4]]), np.array([0, 1]))
    HAS_GPU = True
    logger.info("GPU detected — using device='cuda'")
except Exception as e:
    logger.info(f"No GPU — using CPU ({e})")

XGB_DEVICE = "cuda" if HAS_GPU else "cpu"
# LightGBM GPU requires OpenCL, not CUDA — always use CPU (fast enough)
LGBM_DEVICE = "cpu"


# ── Data loading ────────────────────────────────────────────────────────
def load_data():
    """Load the training dataset as a DataFrame.

    Reads ``/workspace/training_dataset.parquet``; when that file does not
    exist it is first downloaded from the ``datamatters24/f1-race-data``
    dataset repo on HF Hub into ``/workspace``.

    Returns:
        The parquet file's contents as a ``pandas.DataFrame``.
    """
    path = Path("/workspace/training_dataset.parquet")
    if not path.exists():
        from huggingface_hub import hf_hub_download

        path = hf_hub_download(
            "datamatters24/f1-race-data",
            "training_dataset.parquet",
            repo_type="dataset",
            local_dir="/workspace",
        )
    df = pd.read_parquet(path)
    logger.info(f"Loaded {len(df):,} rows, {len(df.columns)} columns")
    return df


def time_series_split(df):
    """Split *df* chronologically on its ``year`` column.

    Returns:
        ``(train, val, test)`` where train covers 2014–2022, val is 2023,
        and test covers 2024–2025 (all bounds inclusive).
    """
    train = df[(df["year"] >= 2014) & (df["year"] <= 2022)]
    val = df[(df["year"] == 2023)]
    test = df[(df["year"] >= 2024) & (df["year"] <= 2025)]
    return train, val, test


def top_k_accuracy(y_true, y_prob, race_ids, k=3):
    """Return the share of races whose positive row ranks in the top *k*.

    Rows are grouped by ``race_ids``. Within each group the *k* rows with
    the highest ``y_prob`` are selected, and the race counts as a hit when
    at least one of them has a non-zero ``y_true``. Races with no positive
    label are skipped entirely (neither hit nor miss).

    Returns:
        ``hits / scored_races``, or ``0.0`` when no race could be scored.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    race_ids = np.asarray(race_ids)

    hits = total = 0
    for rid in np.unique(race_ids):
        mask = race_ids == rid
        probs, true = y_prob[mask], y_true[mask]
        if true.sum() == 0:
            continue
        # argsort is ascending, so the last k indices are the k largest.
        top_k_idx = np.argsort(probs)[-k:]
        if true[top_k_idx].sum() > 0:
            hits += 1
        total += 1
    return hits / total if total > 0 else 0.0


def evaluate(y_true, y_prob, race_ids, name):
    """Compute and log probability and ranking metrics for one split.

    Args:
        y_true: Binary labels.
        y_prob: Predicted probability of the positive class.
        race_ids: Group identifier per row, used for the top-k metrics.
        name: Label stored under ``"split"`` and used in the log line.

    Returns:
        A dict with ``split``, ``log_loss``, ``brier_score``,
        ``top1_accuracy``, ``top3_accuracy``, ``top5_accuracy`` and
        ``n_races`` (the number of distinct race ids, including any that
        the top-k metrics skipped).
    """
    from sklearn.metrics import brier_score_loss

    m = {
        "split": name,
        "log_loss": log_loss(y_true, y_prob),
        "brier_score": brier_score_loss(y_true, y_prob),
        "top1_accuracy": top_k_accuracy(y_true, y_prob, race_ids, k=1),
        "top3_accuracy": top_k_accuracy(y_true, y_prob, race_ids, k=3),
        "top5_accuracy": top_k_accuracy(y_true, y_prob, race_ids, k=5),
        "n_races": len(np.unique(race_ids)),
    }
    logger.info(
        f"  {name}: top1={m['top1_accuracy']:.1%}, "
        f"top3={m['top3_accuracy']:.1%}, top5={m['top5_accuracy']:.1%}, "
        f"log_loss={m['log_loss']:.4f} ({m['n_races']} races)"
    )
    return m


# ── XGBoost tuning ──────────────────────────────────────────────────────
def xgb_objective(trial, X_train, y_train, X_val, y_val):
    """Optuna objective: validation log-loss of one XGBoost configuration.

    Fits an ``XGBClassifier`` on the training split with early stopping
    (30 rounds, at most 500 trees) monitored on the validation split.

    Returns:
        Log-loss of the fitted model's positive-class probabilities on
        ``(X_val, y_val)``; lower is better.
    """
    import xgboost as xgb

    params = {
        "n_estimators": 500,
        "tree_method": "hist",
        "device": XGB_DEVICE,
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        # Negative/positive ratio; max(..., 1) guards against a split with
        # no positive rows.
        "scale_pos_weight": (y_train == 0).sum() / max((y_train == 1).sum(), 1),
        "eval_metric": "logloss",
        "early_stopping_rounds": 30,
        "random_state": 42,
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    return log_loss(y_val, model.predict_proba(X_val)[:, 1])


# ── LightGBM tuning ────────────────────────────────────────────────────
def lgbm_objective(trial, X_train, y_train, X_val, y_val):
    """Optuna objective: validation log-loss of one LightGBM configuration.

    Fits an ``LGBMClassifier`` on the training split with an early-stopping
    callback (30 rounds, at most 500 trees) monitored on the validation
    split.

    Returns:
        Log-loss of the fitted model's positive-class probabilities on
        ``(X_val, y_val)``; lower is better.
    """
    import lightgbm as lgb

    params = {
        "n_estimators": 500,
        "device": LGBM_DEVICE,
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        # LightGBM ignores `subsample` unless bagging is switched on with a
        # non-zero `subsample_freq` (the wrapper default is 0). It is
        # registered as a single-valued trial parameter so that it is
        # recorded in `study.best_params` and therefore carried along to
        # any model later rebuilt from those params.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 15, 63),
        # Negative/positive ratio; max(..., 1) guards against a split with
        # no positive rows.
        "scale_pos_weight": (y_train == 0).sum() / max((y_train == 1).sum(), 1),
        "metric": "binary_logloss",
        "random_state": 42,
        "verbose": -1,
    }
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(30, verbose=False)],
    )
    return log_loss(y_val, model.predict_proba(X_val)[:, 1])
# ── Main ────────────────────────────────────────────────────────────────
def _positive_class_weight(y):
    """Return the negative/positive count ratio used as ``scale_pos_weight``."""
    return (y == 0).sum() / max((y == 1).sum(), 1)


def main():
    """Tune both models, build the ensemble, save and upload the artifacts.

    Pipeline:
      1. Load the data and split it chronologically into train/val/test.
      2. Run ``N_TRIALS`` Optuna trials per model (fit on train, scored on
         val).
      3. For each model, refit the best configuration on train with early
         stopping on val to fix the number of boosting rounds, then train
         the final model on train+val with that fixed round count.
      4. Pick the XGB/LGBM blend weight by validation top-3 accuracy.
      5. Evaluate the final models and the blend on the test split.
      6. Write the models, ``ensemble_config.joblib`` and
         ``best_params.json`` to ``/workspace/models`` and, when
         ``HF_TOKEN`` is set, upload them to ``HF_REPO``.

    The test split is used for evaluation only. It is never passed as an
    early-stopping ``eval_set``, because choosing the round count on test
    data would make the reported test metrics optimistically biased.
    Likewise the blend weight is chosen from models that were fit on train
    only, so their validation predictions are out-of-sample.
    """
    import lightgbm as lgb
    import xgboost as xgb

    max_rounds = 500  # same cap the tuning objectives use
    patience = 30  # same early-stopping patience the tuning objectives use

    optuna.logging.set_verbosity(optuna.logging.WARNING)

    df = load_data()
    train, val, test = time_series_split(df)

    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET]
    X_val, y_val = val[FEATURE_COLUMNS], val[TARGET]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET]
    X_trainval = pd.concat([X_train, X_val])
    y_trainval = pd.concat([y_train, y_val])
    val_races = val["raceId"].values
    test_races = test["raceId"].values

    results = {}
    out_dir = Path("/workspace/models")
    out_dir.mkdir(parents=True, exist_ok=True)

    # === XGBoost ===
    logger.info(f"=== Tuning XGBoost ({N_TRIALS} trials, device={XGB_DEVICE}) ===")
    xgb_study = optuna.create_study(direction="minimize")
    xgb_study.optimize(
        lambda t: xgb_objective(t, X_train, y_train, X_val, y_val),
        n_trials=N_TRIALS,
    )
    logger.info(f"XGB best val log_loss: {xgb_study.best_value:.4f}")
    logger.info(f"XGB best params: {xgb_study.best_params}")

    xgb_params = dict(xgb_study.best_params)
    xgb_params.update({
        "tree_method": "hist",
        "device": XGB_DEVICE,
        "eval_metric": "logloss",
        "random_state": 42,
    })

    # Probe fit (train -> val): fixes the number of boosting rounds and
    # yields out-of-sample validation predictions for ensemble weighting.
    xgb_probe = xgb.XGBClassifier(**{
        **xgb_params,
        "n_estimators": max_rounds,
        "early_stopping_rounds": patience,
        "scale_pos_weight": _positive_class_weight(y_train),
    })
    xgb_probe.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    xgb_rounds = int(xgb_probe.best_iteration) + 1  # best_iteration is 0-based
    xgb_val_probs = xgb_probe.predict_proba(X_val)[:, 1]

    # Final fit on train+val with the fixed round count; no eval_set, so the
    # test split cannot influence the model.
    xgb_model = xgb.XGBClassifier(**{
        **xgb_params,
        "n_estimators": xgb_rounds,
        "scale_pos_weight": _positive_class_weight(y_trainval),
    })
    xgb_model.fit(X_trainval, y_trainval, verbose=False)

    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    xgb_metrics = evaluate(y_test, xgb_probs, test_races, "tuned_xgb_test")
    results["xgb"] = {
        "best_params": xgb_study.best_params,
        "n_estimators": xgb_rounds,
        "test_metrics": xgb_metrics,
    }

    # Save XGB (force CPU device for portability)
    xgb_model.set_params(device="cpu")
    joblib.dump(xgb_model, out_dir / "race_winner_xgb_tuned.joblib")

    # === LightGBM ===
    logger.info(f"\n=== Tuning LightGBM ({N_TRIALS} trials) ===")
    lgbm_study = optuna.create_study(direction="minimize")
    lgbm_study.optimize(
        lambda t: lgbm_objective(t, X_train, y_train, X_val, y_val),
        n_trials=N_TRIALS,
    )
    logger.info(f"LGBM best val log_loss: {lgbm_study.best_value:.4f}")
    logger.info(f"LGBM best params: {lgbm_study.best_params}")

    lgbm_params = dict(lgbm_study.best_params)
    lgbm_params.update({
        "device": "cpu",
        "metric": "binary_logloss",
        "random_state": 42,
        "verbose": -1,
    })

    # Probe fit (train -> val), same purpose as for XGBoost above.
    lgbm_probe = lgb.LGBMClassifier(**{
        **lgbm_params,
        "n_estimators": max_rounds,
        "scale_pos_weight": _positive_class_weight(y_train),
    })
    lgbm_probe.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(patience, verbose=False)],
    )
    # Fall back to the cap if no best iteration was recorded.
    lgbm_rounds = int(lgbm_probe.best_iteration_ or max_rounds)
    lgbm_val_probs = lgbm_probe.predict_proba(X_val)[:, 1]

    lgbm_model = lgb.LGBMClassifier(**{
        **lgbm_params,
        "n_estimators": lgbm_rounds,
        "scale_pos_weight": _positive_class_weight(y_trainval),
    })
    lgbm_model.fit(X_trainval, y_trainval)

    lgbm_probs = lgbm_model.predict_proba(X_test)[:, 1]
    lgbm_metrics = evaluate(y_test, lgbm_probs, test_races, "tuned_lgbm_test")
    results["lgbm"] = {
        "best_params": lgbm_study.best_params,
        "n_estimators": lgbm_rounds,
        "test_metrics": lgbm_metrics,
    }
    joblib.dump(lgbm_model, out_dir / "race_winner_lgbm_tuned.joblib")

    # === Ensemble ===
    logger.info("\n=== Building Ensemble ===")
    # Grid-search the XGB weight over 0.10 … 0.90 in steps of 0.05; on ties
    # the first (smallest) weight wins because the comparison is strict.
    best_w, best_score = 0.5, -1
    for w in np.arange(0.1, 0.95, 0.05):
        blended = w * xgb_val_probs + (1 - w) * lgbm_val_probs
        score = top_k_accuracy(y_val, blended, val_races, k=3)
        if score > best_score:
            best_w, best_score = float(w), score

    logger.info(f"Best ensemble: XGB={best_w:.2f}, LGBM={1-best_w:.2f} (val top3={best_score:.1%})")

    ensemble_probs = best_w * xgb_probs + (1 - best_w) * lgbm_probs
    ens_metrics = evaluate(y_test, ensemble_probs, test_races, "ensemble_test")
    results["ensemble"] = {"xgb_weight": best_w, "lgbm_weight": 1 - best_w, "test_metrics": ens_metrics}

    ensemble_config = {"xgb_weight": best_w, "lgbm_weight": 1 - best_w, "feature_columns": FEATURE_COLUMNS}
    joblib.dump(ensemble_config, out_dir / "ensemble_config.joblib")

    # Save results
    with open(out_dir / "best_params.json", "w") as f:
        json.dump(results, f, indent=2)

    logger.info(f"\nAll results saved to {out_dir}")

    # === Upload to HF Hub ===
    if HF_TOKEN:
        logger.info("\n=== Uploading to HF Hub ===")
        from huggingface_hub import HfApi

        api = HfApi(token=HF_TOKEN)
        for fname in [
            "race_winner_xgb_tuned.joblib",
            "race_winner_lgbm_tuned.joblib",
            "ensemble_config.joblib",
            "best_params.json",
        ]:
            api.upload_file(
                path_or_fileobj=str(out_dir / fname),
                path_in_repo=fname,
                repo_id=HF_REPO,
            )
            logger.info(f"  Uploaded {fname}")
        logger.info("Done! Models on HF Hub.")
    else:
        logger.info("\nNo HF_TOKEN set — skipping upload. Download models manually from /workspace/models/")

    # Print summary
    logger.info(f"\n{'='*60}")
    logger.info("SUMMARY")
    logger.info(f"{'='*60}")
    logger.info(f"XGBoost:  top1={xgb_metrics['top1_accuracy']:.1%}, top3={xgb_metrics['top3_accuracy']:.1%}, top5={xgb_metrics['top5_accuracy']:.1%}")
    logger.info(f"LightGBM: top1={lgbm_metrics['top1_accuracy']:.1%}, top3={lgbm_metrics['top3_accuracy']:.1%}, top5={lgbm_metrics['top5_accuracy']:.1%}")
    logger.info(f"Ensemble: top1={ens_metrics['top1_accuracy']:.1%}, top3={ens_metrics['top3_accuracy']:.1%}, top5={ens_metrics['top5_accuracy']:.1%}")
    logger.info(f"Weights:  XGB={best_w:.2f}, LGBM={1-best_w:.2f}")


if __name__ == "__main__":
    main()