Spaces:

Ciroc0
/

dmi-collector

Running

App Files Files Community

Ciroc0 committed on Mar 8

Commit

9646a5d

verified ·

1 Parent(s): 45f82d6

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -542

app.py DELETED Viewed

@@ -1,542 +0,0 @@
-import gradio as gr
-import requests
-import pandas as pd
-import numpy as np
-from datetime import datetime, timedelta
-from datasets import load_dataset
-from huggingface_hub import HfApi, hf_hub_download
-import schedule
-import time
-import threading
-import os
-import joblib
-from zoneinfo import ZoneInfo
-# Hugging Face dataset repo that receives data.parquet and from which xgb_model.pkl is downloaded.
-DATASET_NAME = "Ciroc0/dmi-aarhus-weather-data"
-# Hugging Face dataset repo that receives predictions.parquet.
-PREDICTIONS_DATASET = "Ciroc0/dmi-aarhus-predictions"
-# Coordinates sent as latitude/longitude in every Open-Meteo request.
-AARHUS_LAT = 56.1567
-AARHUS_LON = 10.2108
-# Token handed to the Hugging Face Hub calls; None when the environment variable is unset.
-HF_TOKEN = os.environ.get("HF_TOKEN")
-# Time zone attached to now_cph() results and to forecast reference times.
-COPENHAGEN_TZ = ZoneInfo("Europe/Copenhagen")
-def now_cph():
-    return datetime.now(COPENHAGEN_TZ)
-def fetch_forecasts_for_period(start_date, end_date):
-    all_forecasts = []
-    run_hours = [0, 3, 6, 9, 12, 15, 18, 21]
-    current_date = start_date
-    cph_now = now_cph()
-    while current_date <= end_date:
-        for hour in run_hours:
-            reference_time = datetime.combine(current_date, datetime.min.time()) + timedelta(hours=hour)
-            reference_time = reference_time.replace(tzinfo=COPENHAGEN_TZ)
-            if reference_time > cph_now:
-                continue
-            url = "https://api.open-meteo.com/v1/forecast"
-            params = {
-                "latitude": AARHUS_LAT,
-                "longitude": AARHUS_LON,
-                "start_date": current_date.strftime("%Y-%m-%d"),
-                "end_date": (current_date + timedelta(days=2)).strftime("%Y-%m-%d"),
-                "models": "dmi_harmonie",
-                "hourly": ["temperature_2m", "windspeed_10m", "pressure_msl", "relativehumidity_2m"],
-                "timezone": "Europe/Copenhagen"
-            }
-            try:
-                resp = requests.get(url, params=params, timeout=30)
-                if resp.status_code != 200:
-                    del params['models']
-                    resp = requests.get(url, params=params, timeout=30)
-                if resp.status_code == 200:
-                    data = resp.json()
-                    if 'hourly' in data:
-                        times = pd.to_datetime(data['hourly']['time'])
-                        times = times.tz_localize('Europe/Copenhagen', ambiguous='infer')
-                        for i, target_time in enumerate(times):
-                            lead_hours = (target_time - reference_time).total_seconds() / 3600
-                            if 0 < lead_hours <= 48:
-                                all_forecasts.append({
-                                    'timestamp': target_time,
-                                    'reference_time': reference_time,
-                                    'lead_time_hours': int(lead_hours),
-                                    'dmi_temp_pred': data['hourly']['temperature_2m'][i],
-                                    'dmi_wind_pred': data['hourly']['windspeed_10m'][i],
-                                    'dmi_pressure_pred': data['hourly']['pressure_msl'][i],
-                                    'dmi_humidity_pred': data['hourly']['relativehumidity_2m'][i]
-                                })
-            except Exception as e:
-                print(f"Fejl: {e}")
-                continue
-        current_date += timedelta(days=1)
-        time.sleep(0.1)
-    if not all_forecasts:
-        return None
-    df = pd.DataFrame(all_forecasts)
-    # VIKTIGT: Drop duplicates baseret på timestamp (target tid), ikke reference_time!
-    # reference_time er ens for alle 48 timer i samme forecast
-    df = df.drop_duplicates(subset=['timestamp'], keep='first')
-    df = df.sort_values('timestamp').reset_index(drop=True)
-    return df
-def fetch_actuals_for_period(start_date, end_date):
-    url = "https://archive-api.open-meteo.com/v1/archive"
-    cph_today = now_cph().date()
-    if end_date > cph_today:
-        end_date = cph_today
-    params = {
-        "latitude": AARHUS_LAT,
-        "longitude": AARHUS_LON,
-        "start_date": start_date.strftime("%Y-%m-%d"),
-        "end_date": end_date.strftime("%Y-%m-%d"),
-        "hourly": ["temperature_2m", "windspeed_10m", "pressure_msl", "relativehumidity_2m"],
-        "timezone": "Europe/Copenhagen"
-    }
-    try:
-        resp = requests.get(url, params=params, timeout=60)
-        if resp.status_code != 200:
-            return None
-        data = resp.json()
-        if 'hourly' not in data:
-            return None
-        timestamps = pd.to_datetime(data['hourly']['time'])
-        timestamps = timestamps.tz_localize('Europe/Copenhagen', ambiguous='infer')
-        actuals_df = pd.DataFrame({
-            'timestamp': timestamps,
-            'actual_temp': data['hourly']['temperature_2m'],
-            'actual_wind': data['hourly']['windspeed_10m'],
-            'actual_pressure': data['hourly']['pressure_msl'],
-            'actual_humidity': data['hourly']['relativehumidity_2m']
-        })
-        # Filtrer fremtidige timer væk: behold kun observationer op til nuværende time
-        current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
-        actuals_df = actuals_df[actuals_df['timestamp'] <= current_hour]
-        return actuals_df
-    except Exception as e:
-        print(f"❌ Fejl: {e}")
-        return None
-def fetch_future_forecasts():
-    """Henter fremtidige forecasts - 48 timer frem"""
-    now = now_cph()
-    today = now.date()
-    current_hour = now.hour
-    run_hours = [0, 3, 6, 9, 12, 15, 18, 21]
-    latest_run = max([h for h in run_hours if h <= current_hour], default=0)
-    reference_time = datetime.combine(today, datetime.min.time()) + timedelta(hours=latest_run)
-    reference_time = reference_time.replace(tzinfo=COPENHAGEN_TZ)
-    # Hent 3 dage frem for at sikre vi har 48 timer dækket
-    url = "https://api.open-meteo.com/v1/forecast"
-    params = {
-        "latitude": AARHUS_LAT,
-        "longitude": AARHUS_LON,
-        "start_date": today.strftime("%Y-%m-%d"),
-        "end_date": (today + timedelta(days=3)).strftime("%Y-%m-%d"),
-        "models": "dmi_harmonie",
-        "hourly": ["temperature_2m", "windspeed_10m", "pressure_msl", "relativehumidity_2m"],
-        "timezone": "Europe/Copenhagen"
-    }
-    try:
-        resp = requests.get(url, params=params, timeout=30)
-        if resp.status_code != 200:
-            del params['models']
-            resp = requests.get(url, params=params, timeout=30)
-        if resp.status_code != 200:
-            return None
-        data = resp.json()
-        if 'hourly' not in data:
-            return None
-        times = pd.to_datetime(data['hourly']['time'])
-        times = times.tz_localize('Europe/Copenhagen', ambiguous='infer')
-        forecasts = []
-        for i, target_time in enumerate(times):
-            # Kun fremtidige tidspunkter
-            if target_time > now:
-                lead_hours = (target_time - reference_time).total_seconds() / 3600
-                # Op til 48 timer frem
-                if 0 < lead_hours <= 48:
-                    forecasts.append({
-                        'timestamp': target_time,
-                        'reference_time': reference_time,
-                        'lead_time_hours': int(lead_hours),
-                        'dmi_temp_pred': data['hourly']['temperature_2m'][i],
-                        'dmi_wind_pred': data['hourly']['windspeed_10m'][i],
-                        'dmi_pressure_pred': data['hourly']['pressure_msl'][i],
-                        'dmi_humidity_pred': data['hourly']['relativehumidity_2m'][i]
-                    })
-        if not forecasts:
-            return None
-        df = pd.DataFrame(forecasts)
-        # Drop duplicates baseret på timestamp (target tid), ikke reference_time!
-        df = df.drop_duplicates(subset=['timestamp'], keep='first')
-        df = df.sort_values('timestamp').reset_index(drop=True)
-        print(f"✅ Hentede {len(df)} forecasts fra {df['timestamp'].min()} til {df['timestamp'].max()}")
-        return df
-    except Exception as e:
-        print(f"❌ Fejl: {e}")
-        return None
-def get_features_for_prediction(row):
-    ts = row['reference_time']
-    if hasattr(ts, 'tzinfo') and ts.tzinfo is not None:
-        ts_naive = ts.replace(tzinfo=None)
-    else:
-        ts_naive = ts
-    hour = ts_naive.hour
-    month = ts_naive.month
-    day_of_year = ts_naive.timetuple().tm_yday
-    return {
-        'dmi_temp_pred': row['dmi_temp_pred'],
-        'dmi_wind_pred': row['dmi_wind_pred'],
-        'dmi_pressure_pred': row['dmi_pressure_pred'],
-        'dmi_humidity_pred': row['dmi_humidity_pred'],
-        'hour_sin': np.sin(2 * np.pi * hour / 24),
-        'hour_cos': np.cos(2 * np.pi * hour / 24),
-        'month_sin': np.sin(2 * np.pi * month / 12),
-        'month_cos': np.cos(2 * np.pi * month / 12),
-        'hour': hour,
-        'day_of_year': day_of_year
-    }
-def load_model():
-    try:
-        model_path = hf_hub_download(
-            repo_id=DATASET_NAME,
-            filename="xgb_model.pkl",
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        return joblib.load(model_path)
-    except Exception as e:
-        print(f"❌ Kunne ikke loade model: {e}")
-        return None
-def generate_ml_predictions(forecasts_df):
-    model = load_model()
-    if model is None:
-        return None
-    feature_cols = [
-        'dmi_temp_pred', 'dmi_wind_pred', 'dmi_pressure_pred', 'dmi_humidity_pred',
-        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
-        'hour', 'day_of_year'
-    ]
-    features = []
-    for _, row in forecasts_df.iterrows():
-        feat = get_features_for_prediction(row)
-        features.append(feat)
-    X = pd.DataFrame(features)
-    corrections = model.predict(X[feature_cols])
-    forecasts_df = forecasts_df.copy()
-    forecasts_df['ml_pred'] = forecasts_df['dmi_temp_pred'] + corrections
-    return forecasts_df
-def backfill_historical_data():
-    start_date = datetime(2025, 11, 1).date()
-    end_date = now_cph().date()
-    print(f"🔄 Henter fra {start_date} til {end_date}")
-    all_data = []
-    current_month_start = start_date
-    while current_month_start <= end_date:
-        if current_month_start.month == 12:
-            next_month = datetime(current_month_start.year + 1, 1, 1).date()
-        else:
-            next_month = datetime(current_month_start.year, current_month_start.month + 1, 1).date()
-        month_end = min(next_month - timedelta(days=1), end_date)
-        print(f"🔄 Henter {current_month_start.strftime('%Y-%m')}...")
-        forecasts = fetch_forecasts_for_period(current_month_start, month_end)
-        if forecasts is not None and len(forecasts) > 0:
-            min_target = forecasts['timestamp'].min().date()
-            max_target = forecasts['timestamp'].max().date()
-            actuals = fetch_actuals_for_period(
-                min_target - timedelta(days=2),
-                max_target + timedelta(days=2)
-            )
-            if actuals is not None:
-                merged = pd.merge(forecasts, actuals, on='timestamp', how='inner')
-                if len(merged) > 0:
-                    merged['hour'] = merged['reference_time'].dt.hour
-                    merged['day_of_year'] = merged['reference_time'].dt.dayofyear
-                    merged['month'] = merged['reference_time'].dt.month
-                    merged['hour_sin'] = np.sin(2 * np.pi * merged['hour'] / 24)
-                    merged['hour_cos'] = np.cos(2 * np.pi * merged['hour'] / 24)
-                    merged['month_sin'] = np.sin(2 * np.pi * merged['month'] / 12)
-                    merged['month_cos'] = np.cos(2 * np.pi * merged['month'] / 12)
-                    merged['dmi_error'] = merged['actual_temp'] - merged['dmi_temp_pred']
-                    all_data.append(merged)
-                    print(f"✅ {len(merged)} rækker")
-        current_month_start = next_month
-    if not all_data:
-        return "❌ Ingen data"
-    final_df = pd.concat(all_data, ignore_index=True)
-    # Fjern fremtidige tider: behold kun rækker hvor timestamp er mindre eller lig med nuværende time
-    current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
-    final_df = final_df[final_df['timestamp'] <= current_hour]
-    # Drop duplicates baseret på timestamp (target tid)
-    final_df = final_df.drop_duplicates(subset=['timestamp'], keep='first')
-    try:
-        final_df.to_parquet("data.parquet")
-        api = HfApi()
-        api.upload_file(
-            path_or_fileobj="data.parquet",
-            path_in_repo="data.parquet",
-            repo_id=DATASET_NAME,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        return f"✅ {len(final_df)} rækker med timestamp som nøgle"
-    except Exception as e:
-        return f"❌ Fejl: {str(e)}"
-def update_daily():
-    end_date = now_cph().date()
-    start_date = end_date - timedelta(days=7)
-    print(f"⏰ København tid: {now_cph()}")
-    forecasts = fetch_forecasts_for_period(start_date, end_date)
-    if forecasts is None:
-        return "❌ Ingen forecasts"
-    min_target = forecasts['timestamp'].min().date()
-    max_target = forecasts['timestamp'].max().date()
-    actuals = fetch_actuals_for_period(min_target - timedelta(days=2), max_target)
-    if actuals is None:
-        return "❌ Ingen actuals"
-    merged = pd.merge(forecasts, actuals, on='timestamp', how='inner')
-    if len(merged) == 0:
-        return "❌ Ingen match"
-    # Fjern fremtidige tider: behold kun rækker hvor timestamp er mindre eller lig med nuværende time
-    current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
-    merged = merged[merged['timestamp'] <= current_hour]
-    merged['hour'] = merged['reference_time'].dt.hour
-    merged['day_of_year'] = merged['reference_time'].dt.dayofyear
-    merged['month'] = merged['reference_time'].dt.month
-    merged['hour_sin'] = np.sin(2 * np.pi * merged['hour'] / 24)
-    merged['hour_cos'] = np.cos(2 * np.pi * merged['hour'] / 24)
-    merged['month_sin'] = np.sin(2 * np.pi * merged['month'] / 12)
-    merged['month_cos'] = np.cos(2 * np.pi * merged['month'] / 12)
-    merged['dmi_error'] = merged['actual_temp'] - merged['dmi_temp_pred']
-    try:
-        dataset = load_dataset(DATASET_NAME, split="train")
-        existing = dataset.to_pandas()
-        if 'timestamp' not in existing.columns:
-            return "❌ Eksisterende data mangler timestamp kolonne"
-        if existing['timestamp'].dt.tz is None:
-            existing['timestamp'] = existing['timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
-        else:
-            existing['timestamp'] = existing['timestamp'].dt.tz_convert('Europe/Copenhagen')
-        # Fjern dubletter baseret på timestamp (target tid)
-        existing_ts = set(existing['timestamp'])
-        mask = ~merged['timestamp'].isin(existing_ts)
-        new_data = merged[mask]
-        if len(new_data) == 0:
-            return "ℹ️ Ingen nye data"
-        combined = pd.concat([existing, new_data], ignore_index=True)
-        # Sikr ingen duplicates i combined
-        combined = combined.drop_duplicates(subset=['timestamp'], keep='first')
-        status_msg = f"✅ {len(new_data)} nye rækker tilføjet"
-    except Exception as e:
-        print(f"Info: {e}")
-        combined = merged
-        status_msg = f"✅ {len(merged)} rækker gemt (nyt datasæt)"
-    combined.to_parquet("data.parquet")
-    api = HfApi()
-    api.upload_file(path_or_fileobj="data.parquet", path_in_repo="data.parquet",
-                   repo_id=DATASET_NAME, repo_type="dataset", token=HF_TOKEN)
-    return status_msg
-def update_predictions():
-    current_time = now_cph()
-    print(f"🔮 Genererer live predictions: {current_time}")
-    future_forecasts = fetch_future_forecasts()
-    if future_forecasts is None or len(future_forecasts) == 0:
-        return "❌ Kunne ikke hente fremtidige forecasts"
-    predictions = generate_ml_predictions(future_forecasts)
-    if predictions is None:
-        return "❌ Kunne ikke loade model"
-    predictions['prediction_made_at'] = current_time
-    predictions['city'] = 'aarhus'
-    predictions['verified'] = False
-    predictions['actual_temp'] = None
-    try:
-        dataset = load_dataset(PREDICTIONS_DATASET, split="train")
-        existing = dataset.to_pandas()
-        if 'timestamp' in existing.columns:
-            if existing['timestamp'].dt.tz is None:
-                existing['timestamp'] = existing['timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
-            # Fjern duplicates baseret på timestamp (target tidspunkt)
-            # Hver target tid skal kun have én prediction
-            new_timestamps = set(predictions['timestamp'])
-            existing = existing[~existing['timestamp'].isin(new_timestamps)]
-            combined = pd.concat([existing, predictions], ignore_index=True)
-            # Drop duplicates igen for sikkerheds skyld
-            combined = combined.drop_duplicates(subset=['timestamp'], keep='first')
-        else:
-            combined = predictions
-    except:
-        combined = predictions
-    try:
-        combined.to_parquet("predictions.parquet")
-        api = HfApi()
-        api.upload_file(
-            path_or_fileobj="predictions.parquet",
-            path_in_repo="predictions.parquet",
-            repo_id=PREDICTIONS_DATASET,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        return f"✅ {len(predictions)} nye predictions gemt ({predictions['timestamp'].min()} til {predictions['timestamp'].max()})"
-    except Exception as e:
-        return f"❌ Fejl: {str(e)}"
-def verify_past_predictions():
-    try:
-        dataset = load_dataset(PREDICTIONS_DATASET, split="train")
-        pred_df = dataset.to_pandas()
-        if 'timestamp' not in pred_df.columns:
-            return "❌ Ingen timestamp kolonne"
-        if pred_df['timestamp'].dt.tz is None:
-            pred_df['timestamp'] = pred_df['timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
-        now = now_cph()
-        to_verify = pred_df[
-            (~pred_df['verified']) &
-            (pred_df['timestamp'] < now - timedelta(hours=1))
-        ]
-        if len(to_verify) == 0:
-            return "Ingen at verificere"
-        start_date = to_verify['timestamp'].min().date()
-        end_date = to_verify['timestamp'].max().date()
-        actuals = fetch_actuals_for_period(start_date, end_date)
-        if actuals is None:
-            return "Kunne ikke hente actuals"
-        for idx, row in to_verify.iterrows():
-            match = actuals[actuals['timestamp'] == row['timestamp']]
-            if len(match) > 0:
-                pred_df.loc[idx, 'actual_temp'] = match.iloc[0]['actual_temp']
-                pred_df.loc[idx, 'verified'] = True
-        pred_df.to_parquet("predictions.parquet")
-        api = HfApi()
-        api.upload_file(
-            path_or_fileobj="predictions.parquet",
-            path_in_repo="predictions.parquet",
-            repo_id=PREDICTIONS_DATASET,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        return f"{len(to_verify)} verificeret"
-    except Exception as e:
-        return f"Verificeringsfejl: {e}"
-def run_scheduler():
-    schedule.every().day.at("06:00").do(update_daily)
-    while True:
-        schedule.run_pending()
-        time.sleep(60)
-scheduler_thread = threading.Thread(target=run_scheduler)
-scheduler_thread.daemon = True
-scheduler_thread.start()
-with gr.Blocks(title="DMI Collector + Live Predictions") as demo:
-    gr.Markdown("""
-    # 🌤️ DMI Data Collector + Live Predictions
-    """)
-    status = gr.Textbox(label="Status", lines=10)
-    with gr.Row():
-        btn_backfill = gr.Button("🚀 Hent historisk data", variant="primary")
-        btn_daily = gr.Button("🔄 Opdater træningsdata", variant="secondary")
-        btn_predict = gr.Button("🔮 Generér Live Predictions NU", variant="primary")
-    btn_backfill.click(backfill_historical_data, outputs=status)
-    btn_daily.click(update_daily, outputs=status)
-    btn_predict.click(update_predictions, outputs=status)
-demo.launch()