Spaces:

Ciroc0
/

dmi-collector

Running

App Files Files Community

Ciroc0 committed on Mar 9

Commit

97db48a

1 Parent(s): a66d87c

Fix collector data flow and space metadata

Browse files

Files changed (2) hide show

README.md +3 -0
app.py +340 -84

README.md CHANGED Viewed

@@ -1,5 +1,8 @@
 ---
 title: DMI Aarhus Collector
 sdk: gradio
 sdk_version: 5.47.2
 python_version: "3.10"

 ---
 title: DMI Aarhus Collector
+emoji: "🌤️"
+colorFrom: blue
+colorTo: green
 sdk: gradio
 sdk_version: 5.47.2
 python_version: "3.10"

app.py CHANGED Viewed

@@ -68,6 +68,9 @@ OBSERVATION_FEATURES = [
     "windgusts_10m",
 ]
 class LazyModule:
     def __init__(self, module_name):
@@ -213,6 +216,94 @@ def get_lead_bucket(lead_hours):
         return "25-48"
 # =============================================================================
 # FORECAST FETCHING
 # =============================================================================
@@ -295,8 +386,7 @@ def fetch_forecasts_for_period(start_date, end_date):
         return None
     df = pd.DataFrame(all_forecasts)
-    df = df.drop_duplicates(subset=['target_timestamp'], keep='first')
-    df = df.sort_values('target_timestamp').reset_index(drop=True)
     log_event("fetch_forecasts_for_period done", rows=len(df))
     return df
@@ -527,14 +617,7 @@ def build_training_matrix(forecasts_df, observations_df):
         log_event("build_training_matrix no_matches")
         return None
-    # Add temporal features
-    merged = add_temporal_features(merged)
-    # Add run delta features
-    merged = add_run_delta_features(merged)
-    # Add observation lags (requires sorting by time)
-    merged = add_observation_lags(merged)
     # Add correction targets for temperature and wind
     merged['temp_correction_target'] = merged['actual_temp'] - merged['dmi_temperature_2m_pred']
@@ -544,6 +627,7 @@ def build_training_matrix(forecasts_df, observations_df):
     # Filter out future times
     current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
     merged = merged[merged['target_timestamp'] <= current_hour]
     log_event("build_training_matrix done", rows=len(merged), columns=len(merged.columns))
     return merged
@@ -607,7 +691,7 @@ def init_dataset_if_needed():
         print(f"✅ Training dataset already available as {existing_name}")
         return existing_path, existing_name
-    empty_df = pd.DataFrame(columns=["target_timestamp"])
     empty_df.to_parquet("training_matrix.parquet")
     if upload_to_dataset("training_matrix.parquet", "training_matrix.parquet", DATASET_NAME):
@@ -634,14 +718,9 @@ def load_existing_training_matrix():
     if existing.empty or "target_timestamp" not in existing.columns:
         return existing, existing_name
-    if existing["target_timestamp"].dt.tz is None:
-        existing["target_timestamp"] = existing["target_timestamp"].dt.tz_localize(
-            "Europe/Copenhagen",
-            ambiguous="infer",
-        )
-    else:
-        existing["target_timestamp"] = existing["target_timestamp"].dt.tz_convert("Europe/Copenhagen")
     return existing, existing_name
@@ -695,8 +774,7 @@ def backfill_historical_data():
         return "❌ No data collected"
     final_df = pd.concat(all_data, ignore_index=True)
-    final_df = final_df.drop_duplicates(subset=['target_timestamp'], keep='first')
-    final_df = final_df.sort_values('target_timestamp').reset_index(drop=True)
     # Save to parquet
     final_df.to_parquet("training_matrix.parquet")
@@ -740,15 +818,13 @@ def update_daily():
     try:
         existing, existing_name = load_existing_training_matrix()
         if existing is not None and not existing.empty and "target_timestamp" in existing.columns:
-            existing_ts = set(existing["target_timestamp"])
-            mask = ~merged["target_timestamp"].isin(existing_ts)
-            new_data = merged[mask]
             if len(new_data) == 0:
                 return f"ℹ️ No new data ({existing_name} already up to date)"
             combined = pd.concat([existing, new_data], ignore_index=True)
-            combined = combined.drop_duplicates(subset=["target_timestamp"], keep="first")
             status_msg = f"✅ Added {len(new_data)} new rows"
         else:
             combined = merged
@@ -799,24 +875,27 @@ def predict_with_bundle(bundle, df):
     """Make predictions using model bundle."""
     if bundle is None or 'models' not in bundle:
         return None
-    results = df.copy()
-    predictions = np.zeros(len(df))
     for bucket in df['lead_bucket'].unique():
         if bucket in bundle['models']:
             bucket_mask = df['lead_bucket'] == bucket
             bucket_df = df[bucket_mask]
             model_info = bundle['models'][bucket]
             model = model_info['model']
             feature_cols = model_info.get('feature_columns', [])
             if feature_cols:
                 X = bucket_df[feature_cols].fillna(0.0)
                 bucket_pred = model.predict(X)
                 predictions[bucket_mask] = bucket_pred
     return predictions
@@ -848,45 +927,60 @@ def generate_predictions():
         log_exception("generate_predictions model_registry", e)
         registry = None
-    results = future_forecasts.copy()
     results['prediction_made_at'] = current_time
     results['city'] = 'aarhus'
     # Temperature prediction
     temp_bundle = load_model_bundle('temperature', cache_revision=registry_revision)
     if temp_bundle:
-        temp_pred = predict_with_bundle(temp_bundle, future_forecasts)
         if temp_pred is not None:
-            # Correction model: ml_pred = dmi_pred + correction
-            results['ml_temp'] = results['dmi_temperature_2m_pred'] + temp_pred
     # Wind speed prediction
     wind_bundle = load_model_bundle('wind_speed', cache_revision=registry_revision)
     if wind_bundle:
-        wind_pred = predict_with_bundle(wind_bundle, future_forecasts)
         if wind_pred is not None:
-            results['ml_wind_speed'] = results['dmi_windspeed_10m_pred'] + wind_pred
     # Wind gust prediction
     gust_bundle = load_model_bundle('wind_gust', cache_revision=registry_revision)
     if gust_bundle:
-        gust_pred = predict_with_bundle(gust_bundle, future_forecasts)
         if gust_pred is not None:
-            results['ml_wind_gust'] = results['dmi_windgusts_10m_pred'] + gust_pred
     # Rain event prediction
     rain_event_bundle = load_model_bundle('rain_event', cache_revision=registry_revision)
     if rain_event_bundle:
-        rain_event_pred = predict_with_bundle(rain_event_bundle, future_forecasts)
         if rain_event_pred is not None:
-            results['ml_rain_prob'] = rain_event_pred
     # Rain amount prediction
     rain_amount_bundle = load_model_bundle('rain_amount', cache_revision=registry_revision)
     if rain_amount_bundle:
-        rain_amount_pred = predict_with_bundle(rain_amount_bundle, future_forecasts)
         if rain_amount_pred is not None:
-            results['ml_rain_amount'] = np.clip(rain_amount_pred, 0, None)
     # Add verification fields
     results['verified'] = False
@@ -894,6 +988,12 @@ def generate_predictions():
     results['actual_wind_speed'] = None
     results['actual_wind_gust'] = None
     results['actual_precipitation'] = None
     # Save to parquet
     results.to_parquet("predictions_latest.parquet")
@@ -971,6 +1071,150 @@ def verify_predictions():
 # =============================================================================
 # STARTUP CATCH-UP
 # =============================================================================
 def load_predictions_snapshot():
     pred_path, _ = load_first_available_dataset_file(
         ["predictions_latest.parquet", "predictions.parquet"],
@@ -979,18 +1223,55 @@ def load_predictions_snapshot():
     if not pred_path:
         return None
-    pred_df = pd.read_parquet(pred_path)
-    if 'timestamp' in pred_df.columns and 'target_timestamp' not in pred_df.columns:
-        pred_df = pred_df.rename(columns={'timestamp': 'target_timestamp'})
-    if 'target_timestamp' not in pred_df.columns:
         return None
-    if pred_df['target_timestamp'].dt.tz is None:
-        pred_df['target_timestamp'] = pred_df['target_timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
-    else:
-        pred_df['target_timestamp'] = pred_df['target_timestamp'].dt.tz_convert('Europe/Copenhagen')
     return pred_df
 def should_run_daily_catch_up():
     matrix_path = load_from_dataset("training_matrix.parquet", DATASET_NAME)
     if not matrix_path:
@@ -1065,33 +1346,8 @@ def run_post_start_catch_up():
 def run_scheduler():
     """Background scheduler for automated tasks."""
     log_event("scheduler starting")
-    # Every 3 hours: fetch new forecast run
-    schedule.every(3).hours.do(
-        lambda: run_logged(
-            "scheduled_fetch_forecasts",
-            fetch_forecasts_for_period,
-            now_cph().date() - timedelta(days=1),
-            now_cph().date(),
-        )
-    )
-    # Every hour: fetch new observations
-    schedule.every().hour.do(
-        lambda: run_logged(
-            "scheduled_fetch_observations",
-            fetch_observations_for_period,
-            now_cph().date() - timedelta(days=1),
-            now_cph().date(),
-        )
-    )
-    # Every 3 hours: generate new predictions
     schedule.every(3).hours.do(lambda: run_logged("scheduled_generate_predictions", generate_predictions))
-    # Every hour: verify past predictions
     schedule.every().hour.do(lambda: run_logged("scheduled_verify_predictions", verify_predictions))
-    # Daily: rebuild training matrix
     schedule.every().day.at("06:00").do(lambda: run_logged("scheduled_update_daily", update_daily))
     log_event("scheduler_registered")
@@ -1115,7 +1371,7 @@ with gr.Blocks(title="DMI Aarhus Collector") as demo:
     """)
     app_status = gr.Markdown(build_app_status_text())
-    status = gr.Textbox(label="Status", lines=10)
     with gr.Row():
         btn_backfill = gr.Button("🚀 Backfill Historical Data", variant="primary")
@@ -1127,7 +1383,7 @@ with gr.Blocks(title="DMI Aarhus Collector") as demo:
         try:
             result = run_logged("gradio_backfill_historical_data", backfill_historical_data)
             set_app_ready()
-            return build_app_status_text(), result
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ backfill_historical_data failed: {exc}"
@@ -1136,7 +1392,7 @@ with gr.Blocks(title="DMI Aarhus Collector") as demo:
         try:
             result = run_logged("gradio_update_daily", update_daily)
             set_app_ready()
-            return build_app_status_text(), result
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ update_daily failed: {exc}"
@@ -1145,7 +1401,7 @@ with gr.Blocks(title="DMI Aarhus Collector") as demo:
         try:
             result = run_logged("gradio_generate_predictions", generate_predictions)
             set_app_ready()
-            return build_app_status_text(), result
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ generate_predictions failed: {exc}"
@@ -1154,7 +1410,7 @@ with gr.Blocks(title="DMI Aarhus Collector") as demo:
         try:
             result = run_logged("gradio_verify_predictions", verify_predictions)
             set_app_ready()
-            return build_app_status_text(), result
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ verify_predictions failed: {exc}"
@@ -1163,7 +1419,7 @@ with gr.Blocks(title="DMI Aarhus Collector") as demo:
     btn_daily.click(daily_handler, outputs=[app_status, status])
     btn_predict.click(predict_handler, outputs=[app_status, status])
     btn_verify.click(verify_handler, outputs=[app_status, status])
-    demo.load(lambda: build_app_status_text(), outputs=app_status)
 log_event("gradio_ui_ready")
 log_event("ui_constructed")

     "windgusts_10m",
 ]
+# Columns that together identify one training-matrix row when de-duplicating.
+TRAINING_DEDUP_KEYS = ["reference_time", "target_timestamp"]
+# Prediction history is de-duplicated on the target hour alone.
+PREDICTION_DEDUP_KEYS = ["target_timestamp"]
 class LazyModule:
     def __init__(self, module_name):
         return "25-48"
+def ensure_copenhagen_time(df, column_name):
+    """Ensure a datetime column is timezone-aware in Europe/Copenhagen."""
+    if column_name not in df.columns:
+        return df
+    series = pd.to_datetime(df[column_name], errors="coerce")
+    if getattr(series.dt, "tz", None) is None:
+        df[column_name] = series.dt.tz_localize(COPENHAGEN_TZ, ambiguous="infer", nonexistent="shift_forward")
+    else:
+        df[column_name] = series.dt.tz_convert(COPENHAGEN_TZ)
+    return df
+def dedupe_rows(df, dedup_keys, sort_keys=None, keep="last"):
+    """Sort and drop duplicate rows using the keys available on the dataframe."""
+    if df is None or len(df) == 0:
+        return df
+    available_sort_keys = [key for key in (sort_keys or dedup_keys) if key in df.columns]
+    if available_sort_keys:
+        df = df.sort_values(available_sort_keys).reset_index(drop=True)
+    available_dedup_keys = [key for key in dedup_keys if key in df.columns]
+    if available_dedup_keys:
+        df = df.drop_duplicates(subset=available_dedup_keys, keep=keep).reset_index(drop=True)
+    return df
+def find_new_rows(candidate_df, existing_df, dedup_keys):
+    """Return rows that are new compared with an existing dataframe."""
+    if existing_df is None or len(existing_df) == 0:
+        return candidate_df
+    keys = [key for key in dedup_keys if key in candidate_df.columns and key in existing_df.columns]
+    if not keys:
+        return candidate_df
+    existing_keys = existing_df[keys].drop_duplicates().copy()
+    existing_keys["_existing"] = True
+    merged = candidate_df.merge(existing_keys, on=keys, how="left")
+    merged = merged[merged["_existing"] != True].drop(columns=["_existing"])
+    return merged.reset_index(drop=True)
+def build_model_features(df):
+    """Build the causal feature set used by both training and live inference."""
+    if df is None or len(df) == 0:
+        return df
+    features_df = df.copy()
+    features_df = add_temporal_features(features_df)
+    features_df = add_run_delta_features(features_df)
+    return features_df
+def normalize_prediction_df(pred_df):
+    """Normalize prediction history to the current schema."""
+    if pred_df is None or len(pred_df) == 0:
+        return pred_df
+    if "timestamp" in pred_df.columns and "target_timestamp" not in pred_df.columns:
+        pred_df = pred_df.rename(columns={"timestamp": "target_timestamp"})
+    for column_name in ["target_timestamp", "reference_time", "prediction_made_at"]:
+        pred_df = ensure_copenhagen_time(pred_df, column_name)
+    if "verified" not in pred_df.columns:
+        pred_df["verified"] = False
+    pred_df["verified"] = pred_df["verified"].fillna(False).astype(bool)
+    pred_df = dedupe_rows(
+        pred_df,
+        PREDICTION_DEDUP_KEYS,
+        sort_keys=["target_timestamp", "prediction_made_at", "reference_time"],
+        keep="last",
+    )
+    return pred_df
+def merge_prediction_history(existing_df, new_df):
+    """Upsert future rows while preserving historical prediction rows."""
+    if existing_df is None or len(existing_df) == 0:
+        return normalize_prediction_df(new_df)
+    if new_df is None or len(new_df) == 0:
+        return normalize_prediction_df(existing_df)
+    combined = pd.concat([existing_df, new_df], ignore_index=True)
+    return normalize_prediction_df(combined)
 # =============================================================================
 # FORECAST FETCHING
 # =============================================================================
         return None
     df = pd.DataFrame(all_forecasts)
+    df = dedupe_rows(df, TRAINING_DEDUP_KEYS, sort_keys=["target_timestamp", "reference_time"], keep="last")
     log_event("fetch_forecasts_for_period done", rows=len(df))
     return df
         log_event("build_training_matrix no_matches")
         return None
+    merged = build_model_features(merged)
     # Add correction targets for temperature and wind
     merged['temp_correction_target'] = merged['actual_temp'] - merged['dmi_temperature_2m_pred']
     # Filter out future times
     current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
     merged = merged[merged['target_timestamp'] <= current_hour]
+    merged = dedupe_rows(merged, TRAINING_DEDUP_KEYS, sort_keys=["target_timestamp", "reference_time"], keep="last")
     log_event("build_training_matrix done", rows=len(merged), columns=len(merged.columns))
     return merged
         print(f"✅ Training dataset already available as {existing_name}")
         return existing_path, existing_name
+    empty_df = pd.DataFrame(columns=TRAINING_DEDUP_KEYS)
     empty_df.to_parquet("training_matrix.parquet")
     if upload_to_dataset("training_matrix.parquet", "training_matrix.parquet", DATASET_NAME):
     if existing.empty or "target_timestamp" not in existing.columns:
         return existing, existing_name
+    existing = ensure_copenhagen_time(existing, "target_timestamp")
+    existing = ensure_copenhagen_time(existing, "reference_time")
+    existing = dedupe_rows(existing, TRAINING_DEDUP_KEYS, sort_keys=["target_timestamp", "reference_time"], keep="last")
     return existing, existing_name
         return "❌ No data collected"
     final_df = pd.concat(all_data, ignore_index=True)
+    final_df = dedupe_rows(final_df, TRAINING_DEDUP_KEYS, sort_keys=["target_timestamp", "reference_time"], keep="last")
     # Save to parquet
     final_df.to_parquet("training_matrix.parquet")
     try:
         existing, existing_name = load_existing_training_matrix()
         if existing is not None and not existing.empty and "target_timestamp" in existing.columns:
+            new_data = find_new_rows(merged, existing, TRAINING_DEDUP_KEYS)
             if len(new_data) == 0:
                 return f"ℹ️ No new data ({existing_name} already up to date)"
             combined = pd.concat([existing, new_data], ignore_index=True)
+            combined = dedupe_rows(combined, TRAINING_DEDUP_KEYS, sort_keys=["target_timestamp", "reference_time"], keep="last")
             status_msg = f"✅ Added {len(new_data)} new rows"
         else:
             combined = merged
     """Make predictions using model bundle."""
     if bundle is None or 'models' not in bundle:
         return None
+    predictions = np.full(len(df), np.nan)
     for bucket in df['lead_bucket'].unique():
         if bucket in bundle['models']:
             bucket_mask = df['lead_bucket'] == bucket
             bucket_df = df[bucket_mask]
             model_info = bundle['models'][bucket]
             model = model_info['model']
             feature_cols = model_info.get('feature_columns', [])
             if feature_cols:
+                missing_cols = [col for col in feature_cols if col not in bucket_df.columns]
+                if missing_cols:
+                    log_event("predict_with_bundle missing_features", bucket=bucket, missing_columns=missing_cols)
+                    continue
                 X = bucket_df[feature_cols].fillna(0.0)
                 bucket_pred = model.predict(X)
                 predictions[bucket_mask] = bucket_pred
     return predictions
         log_exception("generate_predictions model_registry", e)
         registry = None
+    feature_frame = build_model_features(future_forecasts)
+    results = feature_frame.copy()
     results['prediction_made_at'] = current_time
     results['city'] = 'aarhus'
+    results['verified'] = False
+    results['ml_temp'] = results['dmi_temperature_2m_pred']
+    results['ml_wind_speed'] = results['dmi_windspeed_10m_pred']
+    results['ml_wind_gust'] = results['dmi_windgusts_10m_pred']
+    results['ml_rain_prob'] = results['dmi_precipitation_probability_pred'].fillna(0.0).clip(0.0, 100.0) / 100.0
+    results['ml_rain_amount'] = results['dmi_precipitation_pred'].fillna(0.0).clip(0.0, None)
     # Temperature prediction
     temp_bundle = load_model_bundle('temperature', cache_revision=registry_revision)
     if temp_bundle:
+        temp_pred = predict_with_bundle(temp_bundle, feature_frame)
         if temp_pred is not None:
+            temp_series = pd.Series(temp_pred, index=results.index, dtype="float64")
+            temp_mask = temp_series.notna()
+            results.loc[temp_mask, 'ml_temp'] = results.loc[temp_mask, 'dmi_temperature_2m_pred'] + temp_series[temp_mask]
     # Wind speed prediction
     wind_bundle = load_model_bundle('wind_speed', cache_revision=registry_revision)
     if wind_bundle:
+        wind_pred = predict_with_bundle(wind_bundle, feature_frame)
         if wind_pred is not None:
+            wind_series = pd.Series(wind_pred, index=results.index, dtype="float64")
+            wind_mask = wind_series.notna()
+            results.loc[wind_mask, 'ml_wind_speed'] = results.loc[wind_mask, 'dmi_windspeed_10m_pred'] + wind_series[wind_mask]
     # Wind gust prediction
     gust_bundle = load_model_bundle('wind_gust', cache_revision=registry_revision)
     if gust_bundle:
+        gust_pred = predict_with_bundle(gust_bundle, feature_frame)
         if gust_pred is not None:
+            gust_series = pd.Series(gust_pred, index=results.index, dtype="float64")
+            gust_mask = gust_series.notna()
+            results.loc[gust_mask, 'ml_wind_gust'] = results.loc[gust_mask, 'dmi_windgusts_10m_pred'] + gust_series[gust_mask]
     # Rain event prediction
     rain_event_bundle = load_model_bundle('rain_event', cache_revision=registry_revision)
     if rain_event_bundle:
+        rain_event_pred = predict_with_bundle(rain_event_bundle, feature_frame)
         if rain_event_pred is not None:
+            rain_event_series = pd.Series(rain_event_pred, index=results.index, dtype="float64")
+            rain_event_mask = rain_event_series.notna()
+            results.loc[rain_event_mask, 'ml_rain_prob'] = rain_event_series[rain_event_mask]
     # Rain amount prediction
     rain_amount_bundle = load_model_bundle('rain_amount', cache_revision=registry_revision)
     if rain_amount_bundle:
+        rain_amount_pred = predict_with_bundle(rain_amount_bundle, feature_frame)
         if rain_amount_pred is not None:
+            rain_amount_series = pd.Series(rain_amount_pred, index=results.index, dtype="float64")
+            rain_amount_mask = rain_amount_series.notna()
+            results.loc[rain_amount_mask, 'ml_rain_amount'] = np.clip(rain_amount_series[rain_amount_mask], 0, None)
     # Add verification fields
     results['verified'] = False
     results['actual_wind_speed'] = None
     results['actual_wind_gust'] = None
     results['actual_precipitation'] = None
+    results['actual_rain'] = None
+    results['actual_rain_event'] = None
+    results['actual_rain_amount'] = None
+    results['ml_rain_prob'] = results['ml_rain_prob'].clip(0.0, 1.0)
+    results['ml_rain_amount'] = results['ml_rain_amount'].clip(0.0, None)
+    results = merge_prediction_history(load_predictions_snapshot(), results)
     # Save to parquet
     results.to_parquet("predictions_latest.parquet")
 # =============================================================================
 # STARTUP CATCH-UP
 # =============================================================================
+def generate_predictions():
+    """Generate predictions for all targets and preserve verified history."""
+    log_event("generate_predictions entered")
+    current_time = now_cph()
+    log_event("generate_predictions clock", current_time=str(current_time))
+    future_forecasts = fetch_future_forecasts()
+    if future_forecasts is None or len(future_forecasts) == 0:
+        return "Could not fetch future forecasts"
+    registry_revision = None
+    try:
+        registry_path = hf_hub_download(
+            repo_id=DATASET_NAME,
+            filename="model_registry.json",
+            repo_type="dataset",
+            token=HF_TOKEN,
+        )
+        with open(registry_path, "r") as handle:
+            registry = json.load(handle)
+        registry_revision = registry.get("generated_at")
+        if registry_revision:
+            clear_model_bundle_cache(registry_revision)
+    except Exception as exc:
+        log_exception("generate_predictions model_registry", exc)
+    feature_frame = build_model_features(future_forecasts)
+    results = feature_frame.copy()
+    results["prediction_made_at"] = current_time
+    results["city"] = "aarhus"
+    results["verified"] = False
+    results["actual_temp"] = None
+    results["actual_wind_speed"] = None
+    results["actual_wind_gust"] = None
+    results["actual_precipitation"] = None
+    results["actual_rain"] = None
+    results["actual_rain_event"] = None
+    results["actual_rain_amount"] = None
+    results["ml_temp"] = results["dmi_temperature_2m_pred"]
+    results["ml_wind_speed"] = results["dmi_windspeed_10m_pred"]
+    results["ml_wind_gust"] = results["dmi_windgusts_10m_pred"]
+    results["ml_rain_prob"] = results["dmi_precipitation_probability_pred"].fillna(0.0).clip(0.0, 100.0) / 100.0
+    results["ml_rain_amount"] = results["dmi_precipitation_pred"].fillna(0.0).clip(0.0, None)
+    target_specs = [
+        ("temperature", "ml_temp", "dmi_temperature_2m_pred", True),
+        ("wind_speed", "ml_wind_speed", "dmi_windspeed_10m_pred", True),
+        ("wind_gust", "ml_wind_gust", "dmi_windgusts_10m_pred", True),
+        ("rain_event", "ml_rain_prob", None, False),
+        ("rain_amount", "ml_rain_amount", None, False),
+    ]
+    for target_name, output_col, baseline_col, is_correction in target_specs:
+        bundle = load_model_bundle(target_name, cache_revision=registry_revision)
+        target_pred = predict_with_bundle(bundle, feature_frame)
+        if target_pred is None:
+            continue
+        target_series = pd.Series(target_pred, index=results.index, dtype="float64")
+        target_mask = target_series.notna()
+        if not target_mask.any():
+            continue
+        if is_correction:
+            results.loc[target_mask, output_col] = results.loc[target_mask, baseline_col] + target_series[target_mask]
+        else:
+            results.loc[target_mask, output_col] = target_series[target_mask]
+    results["ml_rain_prob"] = results["ml_rain_prob"].clip(0.0, 1.0)
+    results["ml_rain_amount"] = results["ml_rain_amount"].clip(0.0, None)
+    results = merge_prediction_history(load_predictions_snapshot(), results)
+    results.to_parquet("predictions_latest.parquet")
+    if upload_to_dataset("predictions_latest.parquet", "predictions_latest.parquet", PREDICTIONS_DATASET):
+        future_count = int((results["target_timestamp"] > current_time).sum())
+        verified_count = int(results["verified"].fillna(False).astype(bool).sum())
+        return (
+            f"Generated/upserted {len(feature_frame)} future predictions. "
+            f"Dataset now holds {len(results)} rows, including {future_count} future rows "
+            f"and {verified_count} verified rows."
+        )
+    return "Failed to upload predictions"
+def verify_predictions():
+    """Verify past predictions with actual observations."""
+    log_event("verify_predictions entered")
+    try:
+        pred_df = load_predictions_snapshot()
+        if pred_df is None or len(pred_df) == 0:
+            return "No predictions file found"
+        now = now_cph()
+        to_verify = pred_df[
+            (~pred_df["verified"]) &
+            (pred_df["target_timestamp"] < now - timedelta(hours=1))
+        ]
+        if len(to_verify) == 0:
+            return "No predictions to verify"
+        start_date = to_verify["target_timestamp"].min().date()
+        end_date = to_verify["target_timestamp"].max().date()
+        observations = fetch_observations_for_period(start_date, end_date)
+        if observations is None or len(observations) == 0:
+            return "Could not fetch observations"
+        observation_cols = [
+            "actual_temp",
+            "actual_wind_speed",
+            "actual_wind_gust",
+            "actual_precipitation",
+            "actual_rain",
+            "rain_event",
+            "rain_amount",
+        ]
+        lookup = observations.set_index("target_timestamp")[observation_cols]
+        verified_count = 0
+        for idx, row in to_verify.iterrows():
+            target_timestamp = row["target_timestamp"]
+            if target_timestamp not in lookup.index:
+                continue
+            match = lookup.loc[target_timestamp]
+            pred_df.loc[idx, "actual_temp"] = match["actual_temp"]
+            pred_df.loc[idx, "actual_wind_speed"] = match["actual_wind_speed"]
+            pred_df.loc[idx, "actual_wind_gust"] = match["actual_wind_gust"]
+            pred_df.loc[idx, "actual_precipitation"] = match["actual_precipitation"]
+            pred_df.loc[idx, "actual_rain"] = match["actual_rain"]
+            pred_df.loc[idx, "actual_rain_event"] = match["rain_event"]
+            pred_df.loc[idx, "actual_rain_amount"] = match["rain_amount"]
+            pred_df.loc[idx, "verified"] = True
+            verified_count += 1
+        pred_df = normalize_prediction_df(pred_df)
+        pred_df.to_parquet("predictions_latest.parquet")
+        if upload_to_dataset("predictions_latest.parquet", "predictions_latest.parquet", PREDICTIONS_DATASET):
+            return f"Verified {verified_count} predictions"
+        return "Failed to upload verified predictions"
+    except Exception as exc:
+        log_exception("verify_predictions", exc)
+        return f"Verification error: {exc}"
 def load_predictions_snapshot():
     pred_path, _ = load_first_available_dataset_file(
         ["predictions_latest.parquet", "predictions.parquet"],
     if not pred_path:
         return None
+    pred_df = normalize_prediction_df(pd.read_parquet(pred_path))
+    if pred_df is None or 'target_timestamp' not in pred_df.columns:
         return None
     return pred_df
+def build_collector_snapshot_text():
+    """Summarize stored training and prediction data for the collector landing view."""
+    lines = []
+    try:
+        training_df, training_name = load_existing_training_matrix()
+        if training_df is None or len(training_df) == 0:
+            lines.append("Training data: no rows available.")
+        else:
+            latest_training = training_df["target_timestamp"].max()
+            lines.append(f"Training data ({training_name}): {len(training_df)} rows through {latest_training}.")
+            if "lead_bucket" in training_df.columns:
+                bucket_counts = training_df["lead_bucket"].value_counts().sort_index().to_dict()
+                bucket_text = ", ".join(f"{bucket}={count}" for bucket, count in bucket_counts.items())
+                lines.append(f"Lead buckets: {bucket_text}")
+    except Exception as exc:
+        lines.append(f"Training data summary unavailable: {exc}")
+    try:
+        pred_df = load_predictions_snapshot()
+        if pred_df is None or len(pred_df) == 0:
+            lines.append("Predictions: no rows available.")
+        else:
+            now = now_cph()
+            future_count = int((pred_df["target_timestamp"] > now).sum())
+            verified_count = int(pred_df["verified"].fillna(False).astype(bool).sum())
+            latest_prediction = pred_df["prediction_made_at"].max() if "prediction_made_at" in pred_df.columns else "unknown"
+            lines.append(f"Predictions: {len(pred_df)} rows, {future_count} future, {verified_count} verified.")
+            lines.append(f"Latest prediction made at: {latest_prediction}")
+    except Exception as exc:
+        lines.append(f"Prediction summary unavailable: {exc}")
+    return "\n".join(lines)
+def build_collector_load_outputs():
+    return build_app_status_text(), build_collector_snapshot_text()
+def build_action_status(result):
+    return f"{result}\n\n{build_collector_snapshot_text()}"
 def should_run_daily_catch_up():
     matrix_path = load_from_dataset("training_matrix.parquet", DATASET_NAME)
     if not matrix_path:
 def run_scheduler():
     """Background scheduler for automated tasks."""
     log_event("scheduler starting")
     schedule.every(3).hours.do(lambda: run_logged("scheduled_generate_predictions", generate_predictions))
     schedule.every().hour.do(lambda: run_logged("scheduled_verify_predictions", verify_predictions))
     schedule.every().day.at("06:00").do(lambda: run_logged("scheduled_update_daily", update_daily))
     log_event("scheduler_registered")
     """)
     app_status = gr.Markdown(build_app_status_text())
+    status = gr.Textbox(label="Status", lines=10, value="Loading collector snapshot...")
     with gr.Row():
         btn_backfill = gr.Button("🚀 Backfill Historical Data", variant="primary")
         try:
             result = run_logged("gradio_backfill_historical_data", backfill_historical_data)
             set_app_ready()
+            return build_app_status_text(), build_action_status(result)
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ backfill_historical_data failed: {exc}"
         try:
             result = run_logged("gradio_update_daily", update_daily)
             set_app_ready()
+            return build_app_status_text(), build_action_status(result)
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ update_daily failed: {exc}"
         try:
             result = run_logged("gradio_generate_predictions", generate_predictions)
             set_app_ready()
+            return build_app_status_text(), build_action_status(result)
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ generate_predictions failed: {exc}"
         try:
             result = run_logged("gradio_verify_predictions", verify_predictions)
             set_app_ready()
+            return build_app_status_text(), build_action_status(result)
         except Exception as exc:
             note_app_error(exc)
             return build_app_status_text(), f"❌ verify_predictions failed: {exc}"
     btn_daily.click(daily_handler, outputs=[app_status, status])
     btn_predict.click(predict_handler, outputs=[app_status, status])
     btn_verify.click(verify_handler, outputs=[app_status, status])
+    demo.load(build_collector_load_outputs, outputs=[app_status, status])
 log_event("gradio_ui_ready")
 log_event("ui_constructed")