Spaces:

Ciroc0
/

dmi-collector

Running

App Files Files Community

Ciroc0 committed on Mar 9

Commit

f3496a6

1 Parent(s): 863d586

Change for frontend

Browse files

Files changed (1) hide show

app.py +99 -11

app.py CHANGED Viewed

@@ -30,6 +30,9 @@ FRONTEND_SNAPSHOT_FILE = "frontend_snapshot.json"
 COPENHAGEN_TZ = ZoneInfo("Europe/Copenhagen")
 APP_NAME = "dmi-collector"
 faulthandler.enable()
@@ -296,7 +299,7 @@ def normalize_prediction_df(pred_df):
     pred_df = dedupe_rows(
         pred_df,
         PREDICTION_DEDUP_KEYS,
-        sort_keys=["target_timestamp", "prediction_made_at", "reference_time"],
         keep="last",
     )
     return pred_df
@@ -309,8 +312,27 @@ def merge_prediction_history(existing_df, new_df):
     if new_df is None or len(new_df) == 0:
         return normalize_prediction_df(existing_df)
-    combined = pd.concat([existing_df, new_df], ignore_index=True)
-    return normalize_prediction_df(combined)
 # =============================================================================
@@ -815,7 +837,7 @@ def build_recent_backtest(training_df):
     current_time = now_cph()
     history = training_df[
-        (training_df["target_timestamp"] >= current_time - timedelta(days=7))
         & (training_df["target_timestamp"] <= current_time)
     ].copy()
     if len(history) == 0:
@@ -823,7 +845,7 @@ def build_recent_backtest(training_df):
     if "lead_time_hours" in history.columns:
         history = history[
-            history["lead_time_hours"].fillna(0).between(0.0001, 48, inclusive="both")
         ].copy()
     if len(history) == 0:
         return None
@@ -888,6 +910,68 @@ def build_recent_backtest(training_df):
     return history
 def calculate_verification_metrics(predictions_df=None, backtest_df=None):
     """Compute frontend-facing verification summary."""
     source_df = None
@@ -1314,6 +1398,10 @@ def build_frontend_snapshot():
     training_df, _ = load_existing_training_matrix()
     registry = load_json_from_dataset("model_registry.json", DATASET_NAME) or {}
     model_meta = load_json_from_dataset("model_meta.json", DATASET_NAME) or {}
     target_status = build_target_status(registry)
     backtest_df = build_recent_backtest(training_df)
@@ -1334,9 +1422,6 @@ def build_frontend_snapshot():
     current = build_current_payload(current_row)
     feature_importance = []
-    registry_revision = registry.get("generated_at")
-    if registry_revision:
-        clear_model_bundle_cache(registry_revision)
     for target_name in MODEL_FILES:
         feature_importance.extend(
             extract_feature_importance_from_bundle(
@@ -1391,11 +1476,11 @@ def publish_frontend_snapshot():
 # BACKFILL OPERATIONS
 # =============================================================================
 def backfill_historical_data():
-    """Backfill historical data from 2025-11-01 to now."""
     log_event("backfill_historical_data entered")
     init_dataset_if_needed()
-    start_date = datetime(2025, 11, 1).date()
     end_date = now_cph().date()
     print(f"🔄 Fetching from {start_date} to {end_date}")
@@ -1559,7 +1644,10 @@ def predict_with_bundle(bundle, df):
                     log_event("predict_with_bundle missing_features", bucket=bucket, missing_columns=missing_cols)
                     continue
                 X = bucket_df[feature_cols].fillna(0.0)
-                bucket_pred = model.predict(X)
                 predictions[bucket_mask] = bucket_pred
     return predictions

 COPENHAGEN_TZ = ZoneInfo("Europe/Copenhagen")
 APP_NAME = "dmi-collector"
+HISTORICAL_BACKFILL_START = datetime(2025, 11, 1).date()
+TRAINING_HOLDOUT_DAYS = 7
+FUTURE_FORECAST_HOURS = 48
 faulthandler.enable()
     pred_df = dedupe_rows(
         pred_df,
         PREDICTION_DEDUP_KEYS,
+        sort_keys=["target_timestamp", "_merge_priority", "prediction_made_at", "reference_time"],
         keep="last",
     )
     return pred_df
     if new_df is None or len(new_df) == 0:
         return normalize_prediction_df(existing_df)
+    existing = normalize_prediction_df(existing_df).copy()
+    incoming = normalize_prediction_df(new_df).copy()
+    existing["_merge_priority"] = 0
+    incoming["_merge_priority"] = 1
+    combined = pd.concat([existing, incoming], ignore_index=True, sort=False)
+    combined = normalize_prediction_df(combined)
+    sort_keys = [
+        key
+        for key in ["target_timestamp", "_merge_priority", "prediction_made_at", "reference_time"]
+        if key in combined.columns
+    ]
+    if sort_keys:
+        combined = combined.sort_values(sort_keys).reset_index(drop=True)
+    if "target_timestamp" in combined.columns:
+        combined = combined.drop_duplicates(subset=["target_timestamp"], keep="last").reset_index(drop=True)
+    if "_merge_priority" in combined.columns:
+        combined = combined.drop(columns=["_merge_priority"])
+    return combined
 # =============================================================================
     current_time = now_cph()
     history = training_df[
+        (training_df["target_timestamp"] >= current_time - timedelta(days=TRAINING_HOLDOUT_DAYS))
         & (training_df["target_timestamp"] <= current_time)
     ].copy()
     if len(history) == 0:
     if "lead_time_hours" in history.columns:
         history = history[
+            history["lead_time_hours"].fillna(0).between(0.0001, FUTURE_FORECAST_HOURS, inclusive="both")
         ].copy()
     if len(history) == 0:
         return None
     return history
def rebuild_future_ml_columns(predictions_df, registry_revision=None):
    """Recompute the ML output columns of future rows before publishing.

    Rows whose ``target_timestamp`` is later than ``now_cph()`` are copied,
    their ``ml_*`` columns are seeded when missing, and each seeded column is
    then overwritten wherever ``predict_with_bundle`` yields a non-null value
    for the corresponding target. Rows at or before the current time are left
    as they are.

    Args:
        predictions_df: Prediction frame to repair; may be ``None`` or empty.
        registry_revision: Forwarded to ``load_model_bundle`` as
            ``cache_revision``.

    Returns:
        ``predictions_df`` itself when it is ``None``, empty, or has no
        ``target_timestamp`` column; otherwise a copy with the future rows'
        ML columns rebuilt.
    """
    if (
        predictions_df is None
        or len(predictions_df) == 0
        or "target_timestamp" not in predictions_df.columns
    ):
        return predictions_df

    repaired = predictions_df.copy()
    future_mask = repaired["target_timestamp"] > now_cph()
    if not future_mask.any():
        return repaired

    future_df = repaired.loc[future_mask].copy()
    nan_series = pd.Series(np.nan, index=future_df.index, dtype="float64")
    zero_series = pd.Series(0.0, index=future_df.index)

    # (target name, output column, baseline column or None, upper clip bound).
    # A baseline column means the model predicts a correction that is added to
    # that baseline; otherwise the model output is used directly, clipped to
    # [0, upper].
    target_specs = (
        ("temperature", "ml_temp", "dmi_temperature_2m_pred", None),
        ("wind_speed", "ml_wind_speed", "dmi_windspeed_10m_pred", None),
        ("wind_gust", "ml_wind_gust", "dmi_windgusts_10m_pred", None),
        ("rain_event", "ml_rain_prob", None, 1.0),
        ("rain_amount", "ml_rain_amount", None, None),
    )

    # Seed missing correction outputs from their baseline. If the baseline is
    # absent too, seed with float NaN rather than an object column of None.
    for _, output_col, baseline_col, _ in target_specs:
        if baseline_col is not None and output_col not in future_df.columns:
            future_df[output_col] = future_df.get(baseline_col, nan_series)

    # Seed missing rain outputs from the raw forecast columns (probability is
    # rescaled from the 0-100 range to 0-1), defaulting to zero.
    if "ml_rain_prob" not in future_df.columns:
        raw_prob = future_df.get("dmi_precipitation_probability_pred", zero_series)
        future_df["ml_rain_prob"] = raw_prob.fillna(0.0).clip(0.0, 100.0) / 100.0
    if "ml_rain_amount" not in future_df.columns:
        raw_amount = future_df.get("dmi_precipitation_pred", zero_series)
        future_df["ml_rain_amount"] = raw_amount.fillna(0.0).clip(0.0, None)

    for target_name, output_col, baseline_col, upper_bound in target_specs:
        if baseline_col is not None and baseline_col not in future_df.columns:
            # Without the baseline there is nothing to add the correction to;
            # keep the seeded value instead of raising KeyError below.
            continue
        bundle = load_model_bundle(target_name, cache_revision=registry_revision)
        target_pred = predict_with_bundle(bundle, future_df)
        if target_pred is None:
            continue
        target_series = pd.Series(target_pred, index=future_df.index, dtype="float64")
        target_mask = target_series.notna()
        if not target_mask.any():
            continue
        if baseline_col is not None:
            future_df.loc[target_mask, output_col] = (
                future_df.loc[target_mask, baseline_col] + target_series[target_mask]
            )
        else:
            future_df.loc[target_mask, output_col] = target_series[target_mask].clip(0.0, upper_bound)

    # Final sanitation so the rain columns never carry NaN or out-of-range values.
    future_df["ml_rain_prob"] = future_df["ml_rain_prob"].fillna(0.0).clip(0.0, 1.0)
    future_df["ml_rain_amount"] = future_df["ml_rain_amount"].fillna(0.0).clip(0.0, None)

    # Columns created above must exist on the full frame before the masked
    # write-back; rows outside the future mask get NaN in them.
    for column_name in future_df.columns:
        if column_name not in repaired.columns:
            repaired[column_name] = np.nan
    repaired.loc[future_mask, future_df.columns] = future_df
    return repaired
 def calculate_verification_metrics(predictions_df=None, backtest_df=None):
     """Compute frontend-facing verification summary."""
     source_df = None
     training_df, _ = load_existing_training_matrix()
     registry = load_json_from_dataset("model_registry.json", DATASET_NAME) or {}
     model_meta = load_json_from_dataset("model_meta.json", DATASET_NAME) or {}
+    registry_revision = registry.get("generated_at")
+    if registry_revision:
+        clear_model_bundle_cache(registry_revision)
+    predictions_df = rebuild_future_ml_columns(predictions_df, registry_revision=registry_revision)
     target_status = build_target_status(registry)
     backtest_df = build_recent_backtest(training_df)
     current = build_current_payload(current_row)
     feature_importance = []
     for target_name in MODEL_FILES:
         feature_importance.extend(
             extract_feature_importance_from_bundle(
 # BACKFILL OPERATIONS
 # =============================================================================
 def backfill_historical_data():
+    """Backfill historical data from the agreed historical start date to now."""
     log_event("backfill_historical_data entered")
     init_dataset_if_needed()
+    start_date = HISTORICAL_BACKFILL_START
     end_date = now_cph().date()
     print(f"🔄 Fetching from {start_date} to {end_date}")
                     log_event("predict_with_bundle missing_features", bucket=bucket, missing_columns=missing_cols)
                     continue
                 X = bucket_df[feature_cols].fillna(0.0)
+                if hasattr(model, "predict_proba"):
+                    bucket_pred = model.predict_proba(X)[:, 1]
+                else:
+                    bucket_pred = model.predict(X)
                 predictions[bucket_mask] = bucket_pred
     return predictions