Ciroc0 committed on
Commit
9646a5d
·
verified ·
1 Parent(s): 45f82d6

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -542
app.py DELETED
@@ -1,542 +0,0 @@
1
- import gradio as gr
2
- import requests
3
- import pandas as pd
4
- import numpy as np
5
- from datetime import datetime, timedelta
6
- from datasets import load_dataset
7
- from huggingface_hub import HfApi, hf_hub_download
8
- import schedule
9
- import time
10
- import threading
11
- import os
12
- import joblib
13
- from zoneinfo import ZoneInfo
14
-
15
# Hugging Face dataset repositories used by this module: one holds the merged
# forecast/actual data (and the model file), the other the live predictions.
DATASET_NAME = "Ciroc0/dmi-aarhus-weather-data"
PREDICTIONS_DATASET = "Ciroc0/dmi-aarhus-predictions"

# Coordinates passed as latitude/longitude to the Open-Meteo endpoints.
AARHUS_LAT = 56.1567
AARHUS_LON = 10.2108

# Hub access token read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Time zone used for every timestamp produced in this module.
COPENHAGEN_TZ = ZoneInfo("Europe/Copenhagen")
22
-
23
def now_cph():
    """Return the current time as a timezone-aware datetime in COPENHAGEN_TZ."""
    return datetime.now(tz=COPENHAGEN_TZ)
25
-
26
def fetch_forecasts_for_period(start_date, end_date):
    """Collect hourly forecast rows for every day in [start_date, end_date].

    For each day, one request is sent to the Open-Meteo forecast endpoint
    (covering that day plus the two following days). The response is then
    paired with each 3-hourly reference time of that day that is not in the
    future, and every target hour with a lead time in (0, 48] hours becomes
    one row.

    Args:
        start_date: First day to fetch (a ``date``).
        end_date: Last day to fetch, inclusive (a ``date``).

    Returns:
        A DataFrame sorted by ``timestamp`` with one row per target hour
        (columns ``timestamp``, ``reference_time``, ``lead_time_hours`` and the
        four ``dmi_*_pred`` values), or ``None`` when nothing was collected.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    hourly_vars = ["temperature_2m", "windspeed_10m", "pressure_msl", "relativehumidity_2m"]
    run_hours = [0, 3, 6, 9, 12, 15, 18, 21]

    all_forecasts = []
    cph_now = now_cph()
    current_date = start_date

    while current_date <= end_date:
        midnight = datetime.combine(current_date, datetime.min.time())
        # Reference times of this day that have already happened.
        reference_times = [
            ref
            for ref in (
                (midnight + timedelta(hours=hour)).replace(tzinfo=COPENHAGEN_TZ)
                for hour in run_hours
            )
            if not ref > cph_now
        ]

        if reference_times:
            params = {
                "latitude": AARHUS_LAT,
                "longitude": AARHUS_LON,
                "start_date": current_date.strftime("%Y-%m-%d"),
                "end_date": (current_date + timedelta(days=2)).strftime("%Y-%m-%d"),
                "models": "dmi_harmonie",
                "hourly": hourly_vars,
                "timezone": "Europe/Copenhagen"
            }

            try:
                # The request parameters do not depend on the run hour, so the
                # response is fetched once per day instead of once per run.
                resp = requests.get(url, params=params, timeout=30)
                if resp.status_code != 200:
                    # Retry without pinning the model.
                    del params['models']
                    resp = requests.get(url, params=params, timeout=30)

                if resp.status_code == 200:
                    data = resp.json()
                    if 'hourly' in data:
                        hourly = data['hourly']
                        times = pd.to_datetime(hourly['time'])
                        times = times.tz_localize('Europe/Copenhagen', ambiguous='infer')

                        # Same row order as iterating runs in the outer loop:
                        # earliest reference time first.
                        for reference_time in reference_times:
                            for i, target_time in enumerate(times):
                                lead_hours = (target_time - reference_time).total_seconds() / 3600

                                if 0 < lead_hours <= 48:
                                    all_forecasts.append({
                                        'timestamp': target_time,
                                        'reference_time': reference_time,
                                        'lead_time_hours': int(lead_hours),
                                        'dmi_temp_pred': hourly['temperature_2m'][i],
                                        'dmi_wind_pred': hourly['windspeed_10m'][i],
                                        'dmi_pressure_pred': hourly['pressure_msl'][i],
                                        'dmi_humidity_pred': hourly['relativehumidity_2m'][i]
                                    })
            except Exception as e:
                # Best effort: a failing day is logged and skipped.
                print(f"Fejl: {e}")

        current_date += timedelta(days=1)
        time.sleep(0.1)  # small pause between days to go easy on the API

    if not all_forecasts:
        return None

    df = pd.DataFrame(all_forecasts)
    # IMPORTANT: de-duplicate on timestamp (the target time), not on
    # reference_time - reference_time is shared by every hour of one forecast.
    # keep='first' retains the row from the earliest reference time.
    df = df.drop_duplicates(subset=['timestamp'], keep='first')
    df = df.sort_values('timestamp').reset_index(drop=True)
    return df
93
-
94
def fetch_actuals_for_period(start_date, end_date):
    """Fetch hourly observed values from the Open-Meteo archive endpoint.

    Args:
        start_date: First day to fetch (a ``date``).
        end_date: Last day to fetch (a ``date``); clamped to today's date in
            Copenhagen.

    Returns:
        A DataFrame with ``timestamp`` (tz-aware, Europe/Copenhagen) and the
        ``actual_temp`` / ``actual_wind`` / ``actual_pressure`` /
        ``actual_humidity`` columns, limited to hours up to the current hour
        and to rows that have a temperature value. Returns ``None`` when the
        date range is empty, the request fails, or the response has no
        ``hourly`` section.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"

    end_date = min(end_date, now_cph().date())
    if start_date > end_date:
        # Nothing to ask for; avoids a request that cannot succeed.
        return None

    params = {
        "latitude": AARHUS_LAT,
        "longitude": AARHUS_LON,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": end_date.strftime("%Y-%m-%d"),
        "hourly": ["temperature_2m", "windspeed_10m", "pressure_msl", "relativehumidity_2m"],
        "timezone": "Europe/Copenhagen"
    }

    try:
        resp = requests.get(url, params=params, timeout=60)
        if resp.status_code != 200:
            return None

        data = resp.json()
        if 'hourly' not in data:
            return None

        hourly = data['hourly']
        timestamps = pd.to_datetime(hourly['time'])
        timestamps = timestamps.tz_localize('Europe/Copenhagen', ambiguous='infer')

        actuals_df = pd.DataFrame({
            'timestamp': timestamps,
            'actual_temp': hourly['temperature_2m'],
            'actual_wind': hourly['windspeed_10m'],
            'actual_pressure': hourly['pressure_msl'],
            'actual_humidity': hourly['relativehumidity_2m']
        })

        # Hours without a temperature are not usable as ground truth: every
        # consumer in this module derives its result from actual_temp, and a
        # missing value would otherwise be stored as a NaN target or mark a
        # prediction as verified without a value.
        # NOTE(review): presumably the archive returns null for hours it does
        # not have yet - confirm against the Open-Meteo archive documentation.
        actuals_df = actuals_df.dropna(subset=['actual_temp'])

        # Filter out future hours: keep only observations up to the current hour.
        current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
        actuals_df = actuals_df[actuals_df['timestamp'] <= current_hour]
        return actuals_df
    except Exception as e:
        print(f"❌ Fejl: {e}")
        return None
137
-
138
def fetch_future_forecasts():
    """Fetch upcoming forecasts - up to 48 hours past the latest run hour.

    The reference time is today's most recent 3-hourly run hour that is not
    later than the current hour. Only target hours strictly after "now" and
    with a lead time in (0, 48] hours are kept.

    Returns:
        A DataFrame sorted by ``timestamp`` (one row per target hour), or
        ``None`` when the request fails, the response has no ``hourly``
        section, no hour qualifies, or an exception occurs.
    """
    now = now_cph()
    today = now.date()

    run_hours = [0, 3, 6, 9, 12, 15, 18, 21]
    latest_run = max((h for h in run_hours if h <= now.hour), default=0)
    reference_time = (
        datetime.combine(today, datetime.min.time()) + timedelta(hours=latest_run)
    ).replace(tzinfo=COPENHAGEN_TZ)

    # Request 3 days ahead to make sure the full 48 hours are covered.
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": AARHUS_LAT,
        "longitude": AARHUS_LON,
        "start_date": today.strftime("%Y-%m-%d"),
        "end_date": (today + timedelta(days=3)).strftime("%Y-%m-%d"),
        "models": "dmi_harmonie",
        "hourly": ["temperature_2m", "windspeed_10m", "pressure_msl", "relativehumidity_2m"],
        "timezone": "Europe/Copenhagen"
    }

    try:
        resp = requests.get(url, params=params, timeout=30)
        if resp.status_code != 200:
            # Retry without pinning the model.
            del params['models']
            resp = requests.get(url, params=params, timeout=30)

        if resp.status_code != 200:
            return None

        data = resp.json()
        if 'hourly' not in data:
            return None

        times = pd.to_datetime(data['hourly']['time']).tz_localize(
            'Europe/Copenhagen', ambiguous='infer'
        )

        forecasts = []
        for i, target_time in enumerate(times):
            # Future target times only.
            if not target_time > now:
                continue

            lead_hours = (target_time - reference_time).total_seconds() / 3600
            # At most 48 hours ahead of the reference time.
            if not 0 < lead_hours <= 48:
                continue

            forecasts.append({
                'timestamp': target_time,
                'reference_time': reference_time,
                'lead_time_hours': int(lead_hours),
                'dmi_temp_pred': data['hourly']['temperature_2m'][i],
                'dmi_wind_pred': data['hourly']['windspeed_10m'][i],
                'dmi_pressure_pred': data['hourly']['pressure_msl'][i],
                'dmi_humidity_pred': data['hourly']['relativehumidity_2m'][i]
            })

        if not forecasts:
            return None

        # De-duplicate on timestamp (the target time), not on reference_time.
        df = (
            pd.DataFrame(forecasts)
            .drop_duplicates(subset=['timestamp'], keep='first')
            .sort_values('timestamp')
            .reset_index(drop=True)
        )

        print(f"✅ Hentede {len(df)} forecasts fra {df['timestamp'].min()} til {df['timestamp'].max()}")
        return df

    except Exception as e:
        print(f"❌ Fejl: {e}")
        return None
211
-
212
def get_features_for_prediction(row):
    """Build the model feature dict for one forecast row.

    Args:
        row: Mapping-like object providing ``reference_time`` and the four
            ``dmi_*_pred`` values.

    Returns:
        A dict with the four forecast values, cyclic encodings of the
        reference time's hour and month, plus the raw hour and day of year.
    """
    ref = row['reference_time']
    # Work on the wall-clock value: drop tzinfo when the value carries one.
    if getattr(ref, 'tzinfo', None) is not None:
        ref = ref.replace(tzinfo=None)

    hour, month = ref.hour, ref.month

    features = {
        name: row[name]
        for name in ('dmi_temp_pred', 'dmi_wind_pred', 'dmi_pressure_pred', 'dmi_humidity_pred')
    }
    features['hour_sin'] = np.sin(2 * np.pi * hour / 24)
    features['hour_cos'] = np.cos(2 * np.pi * hour / 24)
    features['month_sin'] = np.sin(2 * np.pi * month / 12)
    features['month_cos'] = np.cos(2 * np.pi * month / 12)
    features['hour'] = hour
    features['day_of_year'] = ref.timetuple().tm_yday
    return features
235
-
236
def load_model():
    """Download ``xgb_model.pkl`` from the DATASET_NAME repo and load it.

    Returns:
        The object produced by ``joblib.load``, or ``None`` when the download
        or the load raises (the error is printed).
    """
    download_args = {
        "repo_id": DATASET_NAME,
        "filename": "xgb_model.pkl",
        "repo_type": "dataset",
        "token": HF_TOKEN,
    }
    try:
        local_path = hf_hub_download(**download_args)
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code - only safe while the repository contents are trusted.
        loaded = joblib.load(local_path)
    except Exception as e:
        print(f"❌ Kunne ikke loade model: {e}")
        return None
    return loaded
248
-
249
def generate_ml_predictions(forecasts_df):
    """Add an ``ml_pred`` column: the forecast temperature plus the model output.

    Args:
        forecasts_df: DataFrame with ``reference_time`` and the four
            ``dmi_*_pred`` columns. It is not modified.

    Returns:
        A copy of ``forecasts_df`` with an extra ``ml_pred`` column, or
        ``None`` when the model could not be loaded. An empty input yields an
        empty copy (with an empty ``ml_pred`` column) without loading the model.
    """
    feature_cols = [
        'dmi_temp_pred', 'dmi_wind_pred', 'dmi_pressure_pred', 'dmi_humidity_pred',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'hour', 'day_of_year'
    ]

    result = forecasts_df.copy()
    if result.empty:
        # No rows means no feature columns to select from; skip the model
        # download and return an empty result instead of raising.
        result['ml_pred'] = pd.Series(dtype='float64')
        return result

    model = load_model()
    if model is None:
        return None

    # columns= fixes the column order handed to the model.
    X = pd.DataFrame(
        [get_features_for_prediction(row) for _, row in result.iterrows()],
        columns=feature_cols,
    )

    # The model output is added on top of the forecast temperature.
    corrections = model.predict(X)
    result['ml_pred'] = result['dmi_temp_pred'] + corrections
    return result
272
-
273
def backfill_historical_data():
    """Rebuild the dataset from 2025-11-01 until today and upload it.

    Works month by month: forecasts are fetched for the month, observations
    for the span of forecast target times (padded by two days on each side),
    and the two are inner-joined on ``timestamp``. Time features derived from
    ``reference_time`` and ``dmi_error`` (actual minus forecast temperature)
    are added. The result is written to ``data.parquet`` and uploaded to the
    DATASET_NAME repo.

    Returns:
        A status string describing the outcome.
    """
    start_date = datetime(2025, 11, 1).date()
    end_date = now_cph().date()

    print(f"🔄 Henter fra {start_date} til {end_date}")

    all_data = []
    current_month_start = start_date

    while current_month_start <= end_date:
        # First day of the following month (rolls over the year in December).
        if current_month_start.month == 12:
            next_month = datetime(current_month_start.year + 1, 1, 1).date()
        else:
            next_month = datetime(current_month_start.year, current_month_start.month + 1, 1).date()

        month_end = min(next_month - timedelta(days=1), end_date)

        print(f"🔄 Henter {current_month_start.strftime('%Y-%m')}...")

        forecasts = fetch_forecasts_for_period(current_month_start, month_end)

        if forecasts is not None and len(forecasts) > 0:
            min_target = forecasts['timestamp'].min().date()
            max_target = forecasts['timestamp'].max().date()

            actuals = fetch_actuals_for_period(
                min_target - timedelta(days=2),
                max_target + timedelta(days=2)
            )

            if actuals is not None:
                merged = pd.merge(forecasts, actuals, on='timestamp', how='inner')
                # A row without an observed temperature would yield a NaN
                # dmi_error, and because rows are keyed on timestamp it would
                # not be replaced later - drop such rows up front.
                merged = merged.dropna(subset=['actual_temp']).copy()

                if len(merged) > 0:
                    merged['hour'] = merged['reference_time'].dt.hour
                    merged['day_of_year'] = merged['reference_time'].dt.dayofyear
                    merged['month'] = merged['reference_time'].dt.month
                    merged['hour_sin'] = np.sin(2 * np.pi * merged['hour'] / 24)
                    merged['hour_cos'] = np.cos(2 * np.pi * merged['hour'] / 24)
                    merged['month_sin'] = np.sin(2 * np.pi * merged['month'] / 12)
                    merged['month_cos'] = np.cos(2 * np.pi * merged['month'] / 12)
                    merged['dmi_error'] = merged['actual_temp'] - merged['dmi_temp_pred']

                    all_data.append(merged)
                    print(f"✅ {len(merged)} rækker")

        current_month_start = next_month

    if not all_data:
        return "❌ Ingen data"

    final_df = pd.concat(all_data, ignore_index=True)
    # Remove future times: keep only rows whose timestamp is at or before the
    # current hour.
    current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
    final_df = final_df[final_df['timestamp'] <= current_hour]
    # De-duplicate on timestamp (the target time).
    final_df = final_df.drop_duplicates(subset=['timestamp'], keep='first')

    try:
        final_df.to_parquet("data.parquet")
        api = HfApi()
        api.upload_file(
            path_or_fileobj="data.parquet",
            path_in_repo="data.parquet",
            repo_id=DATASET_NAME,
            repo_type="dataset",
            token=HF_TOKEN
        )
        return f"✅ {len(final_df)} rækker med timestamp som nøgle"
    except Exception as e:
        return f"❌ Fejl: {str(e)}"
344
-
345
def update_daily():
    """Append the last 7 days of forecast/actual pairs to the stored dataset.

    Forecasts and observations are fetched, inner-joined on ``timestamp`` and
    enriched with the same time features and ``dmi_error`` column as the
    backfill. Rows whose timestamp already exists in the stored dataset are
    skipped; the combined data is written to ``data.parquet`` and uploaded to
    the DATASET_NAME repo.

    The remote dataset is only replaced by the fresh rows alone when loading
    reports that it does not exist yet (``FileNotFoundError``). Any other
    load or processing error aborts without uploading, so a transient failure
    cannot overwrite the stored history with 7 days of data.

    Returns:
        A status string describing the outcome.
    """
    end_date = now_cph().date()
    start_date = end_date - timedelta(days=7)

    print(f"⏰ København tid: {now_cph()}")

    forecasts = fetch_forecasts_for_period(start_date, end_date)
    if forecasts is None:
        return "❌ Ingen forecasts"

    min_target = forecasts['timestamp'].min().date()
    max_target = forecasts['timestamp'].max().date()
    actuals = fetch_actuals_for_period(min_target - timedelta(days=2), max_target)

    if actuals is None:
        return "❌ Ingen actuals"

    merged = pd.merge(forecasts, actuals, on='timestamp', how='inner')

    # Remove future times: keep only rows whose timestamp is at or before the
    # current hour.
    current_hour = now_cph().replace(minute=0, second=0, microsecond=0)
    merged = merged[merged['timestamp'] <= current_hour]
    # Rows without an observed temperature would give a NaN dmi_error and,
    # being keyed on timestamp, would never be replaced afterwards.
    merged = merged.dropna(subset=['actual_temp']).copy()

    # Checked after filtering so an all-filtered result is reported as no match.
    if len(merged) == 0:
        return "❌ Ingen match"

    merged['hour'] = merged['reference_time'].dt.hour
    merged['day_of_year'] = merged['reference_time'].dt.dayofyear
    merged['month'] = merged['reference_time'].dt.month
    merged['hour_sin'] = np.sin(2 * np.pi * merged['hour'] / 24)
    merged['hour_cos'] = np.cos(2 * np.pi * merged['hour'] / 24)
    merged['month_sin'] = np.sin(2 * np.pi * merged['month'] / 12)
    merged['month_cos'] = np.cos(2 * np.pi * merged['month'] / 12)
    merged['dmi_error'] = merged['actual_temp'] - merged['dmi_temp_pred']

    try:
        existing = load_dataset(DATASET_NAME, split="train").to_pandas()
    except FileNotFoundError as e:
        # NOTE(review): assumes the datasets library signals a missing or
        # empty dataset with a FileNotFoundError subclass - confirm for the
        # installed version.
        print(f"Info: {e}")
        existing = None
    except Exception as e:
        # Unknown failure: do not risk overwriting the stored dataset.
        return f"❌ Fejl: {str(e)}"

    try:
        if existing is None:
            combined = merged
            status_msg = f"✅ {len(merged)} rækker gemt (nyt datasæt)"
        else:
            if 'timestamp' not in existing.columns:
                return "❌ Eksisterende data mangler timestamp kolonne"

            # Bring stored timestamps to tz-aware Europe/Copenhagen so they
            # compare equal to the freshly fetched ones.
            if existing['timestamp'].dt.tz is None:
                existing['timestamp'] = existing['timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
            else:
                existing['timestamp'] = existing['timestamp'].dt.tz_convert('Europe/Copenhagen')

            # Skip rows whose timestamp (the target time) is already stored.
            existing_ts = set(existing['timestamp'])
            new_data = merged[~merged['timestamp'].isin(existing_ts)]

            if len(new_data) == 0:
                return "ℹ️ Ingen nye data"

            combined = pd.concat([existing, new_data], ignore_index=True)
            # Make sure the combined frame has no duplicates.
            combined = combined.drop_duplicates(subset=['timestamp'], keep='first')

            status_msg = f"✅ {len(new_data)} nye rækker tilføjet"

        combined.to_parquet("data.parquet")
        api = HfApi()
        api.upload_file(path_or_fileobj="data.parquet", path_in_repo="data.parquet",
                        repo_id=DATASET_NAME, repo_type="dataset", token=HF_TOKEN)
    except Exception as e:
        # Report the failure as a status string, like the other handlers in
        # this module, instead of raising into the caller.
        return f"❌ Fejl: {str(e)}"

    return status_msg
415
-
416
def update_predictions():
    """Generate predictions for the upcoming hours and store them.

    New predictions replace stored rows with the same ``timestamp``; the
    combined data is written to ``predictions.parquet`` and uploaded to the
    PREDICTIONS_DATASET repo.

    The stored dataset is only replaced by the new predictions alone when it
    does not exist yet (``FileNotFoundError``) or has no ``timestamp``
    column. Any other load or processing error aborts without uploading, so a
    transient failure cannot wipe the stored history.

    Returns:
        A status string describing the outcome.
    """
    current_time = now_cph()
    print(f"🔮 Genererer live predictions: {current_time}")

    future_forecasts = fetch_future_forecasts()
    if future_forecasts is None or len(future_forecasts) == 0:
        return "❌ Kunne ikke hente fremtidige forecasts"

    predictions = generate_ml_predictions(future_forecasts)
    if predictions is None:
        return "❌ Kunne ikke loade model"

    predictions['prediction_made_at'] = current_time
    predictions['city'] = 'aarhus'
    predictions['verified'] = False
    # NaN rather than None keeps the column a float column, so its type does
    # not change once observed temperatures are filled in.
    predictions['actual_temp'] = np.nan

    try:
        existing = load_dataset(PREDICTIONS_DATASET, split="train").to_pandas()
    except FileNotFoundError as e:
        # NOTE(review): assumes the datasets library signals a missing or
        # empty dataset with a FileNotFoundError subclass - confirm for the
        # installed version.
        print(f"Info: {e}")
        existing = None
    except Exception as e:
        # Unknown failure: do not risk overwriting the stored predictions.
        return f"❌ Fejl: {str(e)}"

    try:
        if existing is not None and 'timestamp' in existing.columns:
            # Normalise stored timestamps to tz-aware Europe/Copenhagen so
            # both frames share one timestamp dtype when concatenated.
            if existing['timestamp'].dt.tz is None:
                existing['timestamp'] = existing['timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
            else:
                existing['timestamp'] = existing['timestamp'].dt.tz_convert('Europe/Copenhagen')

            # De-duplicate on timestamp (the target time): each target time
            # keeps a single prediction, the newest one.
            new_timestamps = set(predictions['timestamp'])
            existing = existing[~existing['timestamp'].isin(new_timestamps)]

            combined = pd.concat([existing, predictions], ignore_index=True)
            # Drop duplicates once more to be safe.
            combined = combined.drop_duplicates(subset=['timestamp'], keep='first')
        else:
            combined = predictions

        combined.to_parquet("predictions.parquet")
        api = HfApi()
        api.upload_file(
            path_or_fileobj="predictions.parquet",
            path_in_repo="predictions.parquet",
            repo_id=PREDICTIONS_DATASET,
            repo_type="dataset",
            token=HF_TOKEN
        )
        return f"✅ {len(predictions)} nye predictions gemt ({predictions['timestamp'].min()} til {predictions['timestamp'].max()})"
    except Exception as e:
        return f"❌ Fejl: {str(e)}"
467
-
468
def verify_past_predictions():
    """Fill in observed temperatures for stored predictions that are due.

    A prediction is due when it is not yet verified and its ``timestamp`` is
    more than one hour in the past. For every due prediction that has an
    observed temperature, ``actual_temp`` is set and ``verified`` becomes
    True; predictions without an observation stay unverified. The file is
    uploaded to the PREDICTIONS_DATASET repo only when at least one row
    changed.

    Returns:
        A status string; on success it reports how many rows were actually
        verified.
    """
    try:
        pred_df = load_dataset(PREDICTIONS_DATASET, split="train").to_pandas()

        if 'timestamp' not in pred_df.columns:
            return "❌ Ingen timestamp kolonne"

        # Normalise to tz-aware Europe/Copenhagen so the values line up with
        # the observation timestamps.
        if pred_df['timestamp'].dt.tz is None:
            pred_df['timestamp'] = pred_df['timestamp'].dt.tz_localize('Europe/Copenhagen', ambiguous='infer')
        else:
            pred_df['timestamp'] = pred_df['timestamp'].dt.tz_convert('Europe/Copenhagen')

        now = now_cph()
        # eq(True) gives a plain boolean mask even when the column holds
        # missing values or has object dtype, where ~ is not a logical NOT.
        already_verified = pred_df['verified'].eq(True)
        to_verify = pred_df[
            ~already_verified &
            (pred_df['timestamp'] < now - timedelta(hours=1))
        ]

        if len(to_verify) == 0:
            return "Ingen at verificere"

        start_date = to_verify['timestamp'].min().date()
        end_date = to_verify['timestamp'].max().date()
        actuals = fetch_actuals_for_period(start_date, end_date)

        if actuals is None:
            return "Kunne ikke hente actuals"

        # Timestamp -> observed temperature, ignoring hours with no value so
        # a missing observation never marks a prediction as verified.
        actual_by_ts = (
            actuals.dropna(subset=['actual_temp'])
            .drop_duplicates(subset=['timestamp'], keep='first')
            .set_index('timestamp')['actual_temp']
        )
        matched = to_verify['timestamp'].map(actual_by_ts).dropna()

        if not matched.empty:
            pred_df.loc[matched.index, 'actual_temp'] = matched
            pred_df.loc[matched.index, 'verified'] = True

            pred_df.to_parquet("predictions.parquet")
            api = HfApi()
            api.upload_file(
                path_or_fileobj="predictions.parquet",
                path_in_repo="predictions.parquet",
                repo_id=PREDICTIONS_DATASET,
                repo_type="dataset",
                token=HF_TOKEN
            )

        # Count the rows that actually received an observation, not the
        # number of candidates.
        return f"{len(matched)} verificeret"

    except Exception as e:
        return f"Verificeringsfejl: {e}"
515
-
516
def run_scheduler():
    """Run ``update_daily`` every day at 06:00 and poll the schedule forever.

    This function never returns; it is meant to be the target of a
    background thread.
    """
    def _run_update_daily():
        # An exception escaping the job would propagate out of
        # schedule.run_pending() and end this loop (and its thread) for good,
        # so failures are logged and the schedule keeps running.
        try:
            update_daily()
        except Exception as e:
            print(f"❌ Fejl: {e}")

    # NOTE(review): "06:00" is presumably evaluated in the process's local
    # time zone, which need not be Europe/Copenhagen - confirm against the
    # schedule library documentation.
    schedule.every().day.at("06:00").do(_run_update_daily)
    while True:
        schedule.run_pending()
        time.sleep(60)
521
-
522
# Start the scheduler loop in a daemon thread so it does not keep the
# process alive on its own.
scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
scheduler_thread.start()
525
-
526
# Gradio UI: a status box plus one button per manual action. Each button
# writes the status string returned by its handler into the status box.
with gr.Blocks(title="DMI Collector + Live Predictions") as demo:
    gr.Markdown("""
    # 🌤️ DMI Data Collector + Live Predictions
    """)

    status = gr.Textbox(label="Status", lines=10)

    with gr.Row():
        btn_backfill = gr.Button("🚀 Hent historisk data", variant="primary")
        btn_daily = gr.Button("🔄 Opdater træningsdata", variant="secondary")
        btn_predict = gr.Button("🔮 Generér Live Predictions NU", variant="primary")

    # Wire every button to its handler.
    for button, handler in (
        (btn_backfill, backfill_historical_data),
        (btn_daily, update_daily),
        (btn_predict, update_predictions),
    ):
        button.click(handler, outputs=status)

demo.launch()