# ============================================================ # GHOST — Graph-based Hierarchical On-the-fly # Self-correcting Threat detector # SINGLE FILE — paste entirely into ONE Colab cell and run # ============================================================ # CELL 1: !pip install torch networkx scikit-learn pandas numpy matplotlib scipy -q # CELL 2: paste this entire file and run # ============================================================ import os, sys, copy, random, warnings import numpy as np import pandas as pd import networkx as nx import torch import torch.nn as nn import torch.nn.functional as F import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt from datetime import datetime, timedelta from dataclasses import dataclass, field from typing import List, Dict, Tuple, Optional from torch.utils.data import Dataset, DataLoader from sklearn.preprocessing import StandardScaler warnings.filterwarnings("ignore") SEED = 42 DEVICE = "cuda" if torch.cuda.is_available() else "cpu" np.random.seed(SEED); random.seed(SEED); torch.manual_seed(SEED) os.makedirs("/content/GHOST_outputs", exist_ok=True) print(f"GHOST running on: {DEVICE.upper()}") # ============================================================ # MODULE 1 — DATA GENERATOR # ============================================================ DISRUPTION_PARAMS = { "port_strike": {"delay": 0.8, "cost": 1.5, "dur": 7, "sev": 0.70}, "raw_material_shortage": {"delay": 0.6, "cost": 2.0, "dur": 14, "sev": 0.80}, "transportation_delay": {"delay": 0.7, "cost": 1.3, "dur": 3, "sev": 0.50}, "geopolitical_conflict": {"delay": 0.4, "cost": 2.5, "dur": 21, "sev": 0.90}, "natural_disaster": {"delay": 0.3, "cost": 3.0, "dur": 10, "sev": 1.00}, "demand_surge": {"delay": 0.9, "cost": 1.2, "dur": 5, "sev": 0.30}, "supplier_bankruptcy": {"delay": 0.2, "cost": 4.0, "dur": 30, "sev": 0.95}, "cyber_attack": {"delay": 0.5, "cost": 2.2, "dur": 7, "sev": 0.75}, } TEXT_MAP = { "port_strike": "workers at the port have gone on strike halting all shipments", "raw_material_shortage": "critical raw materials are unavailable due to mining disruptions", "transportation_delay": "severe weather has caused major transportation delays", "geopolitical_conflict": "geopolitical tensions have closed key trade routes", "natural_disaster": "earthquake has destroyed key infrastructure nodes", "demand_surge": "unexpected demand spike has overwhelmed supply capacity", "supplier_bankruptcy": "primary supplier has filed for bankruptcy", "cyber_attack": "ransomware attack has disabled logistics management systems", } NODE_TYPES = ["supplier", "manufacturer", "warehouse", "distributor", "retailer"] def generate_supply_chain_graph(n_nodes=20): G = nx.DiGraph() npt = n_nodes // len(NODE_TYPES) nid = 0 for ntype in NODE_TYPES: for _ in range(npt): G.add_node(nid, type=ntype, capacity=np.random.uniform(100,1000), reliability=np.random.uniform(0.7,1.0), inventory=np.random.uniform(50,500), lead_time=np.random.randint(1,30), criticality=np.random.uniform(0.3,1.0)) nid += 1 tn = {t:[n for n,d in G.nodes(data=True) if d["type"]==t] for t in NODE_TYPES} for i in range(len(NODE_TYPES)-1): for src in tn[NODE_TYPES[i]]: tgts = random.sample(tn[NODE_TYPES[i+1]], min(np.random.randint(1,3), len(tn[NODE_TYPES[i+1]]))) for dst in tgts: G.add_edge(src, dst, weight=np.random.uniform(0.5,1.0), transport_cost=np.random.uniform(10,100), transit_time=np.random.randint(1,14), reliability=np.random.uniform(0.8,1.0)) return G def generate_time_series(n_days=365, n_nodes=20, disruption_prob=0.05): records = [] start = datetime(2023,1,1) for day in range(n_days): date = start + timedelta(days=day) for nid in range(n_nodes): dis, dtype, delay, ci, sev = 0,"none",0.0,1.0,0.0 if np.random.random() < disruption_prob: dis = 1 dtype = random.choice(list(DISRUPTION_PARAMS.keys())) p = DISRUPTION_PARAMS[dtype] delay = (1-p["delay"])*np.random.uniform(0.8,1.2) ci = p["cost"]*np.random.uniform(0.9,1.1) sev = p["sev"]*np.random.uniform(0.8,1.0) records.append({"date":date,"node_id":nid, "demand":np.random.normal(100,20)*(1+0.3*np.sin(2*np.pi*day/365)), "supply":np.random.normal(95,15), "inventory_level":np.random.uniform(20,200), "lead_time":np.random.randint(1,30), "transport_cost":np.random.uniform(10,100)*ci, "on_time_delivery":np.random.uniform(0.7,1.0)*(1-delay), "disruption":dis,"disruption_type":dtype, "delay_factor":delay,"cost_impact":ci, "severity":sev,"risk_score":dis*sev}) return pd.DataFrame(records) def create_features(df, seq_len=30): fcols = ["demand","supply","inventory_level","lead_time", "transport_cost","on_time_delivery","delay_factor","cost_impact"] X,y = [],[] for nid in df["node_id"].unique(): ndf = df[df["node_id"]==nid].sort_values("date").reset_index(drop=True) feats = ndf[fcols].values labels = ndf["risk_score"].values for i in range(seq_len, len(feats)): X.append(feats[i-seq_len:i]); y.append(labels[i]) return np.array(X,dtype=np.float32), np.array(y,dtype=np.float32) def oversample_disruptions(X, y, ratio=8): """Fix class imbalance — repeat disruption samples so model learns to predict them.""" # use higher threshold for real data (0.3 = top 30th percentile) threshold = np.percentile(y, 70) # top 30% are considered "high risk" dis_idx = np.where(y > threshold)[0] nor_idx = np.where(y <= threshold)[0] if len(dis_idx) == 0: print(f" No high-risk samples found (threshold={threshold:.3f})") return X, y # adaptive ratio: if disruptions are very rare, use lower ratio actual_dis_rate = len(dis_idx) / len(y) if actual_dis_rate < 0.05: # less than 5% disruptions ratio = 2 # very mild oversampling print(f" [Adaptive] Rare disruptions ({actual_dis_rate:.2%}) → ratio={ratio}") elif actual_dis_rate < 0.15: # less than 15% ratio = min(ratio, 4) print(f" [Adaptive] Low disruption rate ({actual_dis_rate:.2%}) → ratio={ratio}") rep = np.repeat(dis_idx, ratio) idx = np.concatenate([nor_idx, rep]) np.random.shuffle(idx) final_rate = (y[idx] > threshold).mean() print(f" Oversampled: {len(idx)} samples | High-risk rate: {final_rate:.2%} (threshold={threshold:.3f})") return X[idx], y[idx] # ============================================================ # MODULE 2 — LSTM RISK PREDICTOR # ============================================================ class SCDataset(Dataset): def __init__(self,X,y): self.X=torch.FloatTensor(X); self.y=torch.FloatTensor(y) def __len__(self): return len(self.X) def __getitem__(self,i): return self.X[i],self.y[i] class GHOSTLSTMPredictor(nn.Module): def __init__(self,input_dim=8,hidden_dim=128,num_layers=2,dropout=0.3): super().__init__() self.lstm = nn.LSTM(input_dim,hidden_dim,num_layers, batch_first=True,dropout=dropout,bidirectional=True) self.attn = nn.Sequential(nn.Linear(hidden_dim*2,64),nn.Tanh(),nn.Linear(64,1)) self.head = nn.Sequential(nn.Linear(hidden_dim*2,64),nn.ReLU(), nn.Dropout(dropout),nn.Linear(64,32), nn.ReLU(),nn.Linear(32,1),nn.Sigmoid()) self.last_emb = None def forward(self,x): out,_ = self.lstm(x) w = torch.softmax(self.attn(out).squeeze(-1),dim=1) ctx = (out*w.unsqueeze(-1)).sum(dim=1) self.last_emb = ctx.detach() return self.head(ctx).squeeze(-1), ctx class GHOSTTrainer: def __init__(self,model,lr=1e-3,device=DEVICE): self.model=model.to(device); self.device=device self.opt = torch.optim.AdamW(model.parameters(),lr=lr,weight_decay=1e-4) self.sched = torch.optim.lr_scheduler.CosineAnnealingLR(self.opt,T_max=50) # weighted MSE: penalise missed disruptions 5x more than false alarms self.loss = nn.MSELoss() self.sc = StandardScaler() self.history = {"train_loss":[],"val_loss":[]} def _weighted_loss(self, pred, target): weights = 1.0 + 4.0 * (target > 0.05).float() # 5x weight on disruption samples return (weights * (pred - target)**2).mean() def _ep(self,loader,train=True): self.model.train(train); tot=0.0 cm = torch.enable_grad() if train else torch.no_grad() with cm: for Xb,yb in loader: Xb,yb=Xb.to(self.device),yb.to(self.device) p,_=self.model(Xb) l = self._weighted_loss(p,yb) if train else self.loss(p,yb) if train: self.opt.zero_grad(); l.backward() nn.utils.clip_grad_norm_(self.model.parameters(),1.0); self.opt.step() tot+=l.item() return tot/len(loader) def train(self,Xtr,ytr,Xva,yva,epochs=50,bs=64): B,T,F=Xtr.shape Xs = self.sc.fit_transform(Xtr.reshape(-1,F)).reshape(B,T,F) Bv,Tv,_=Xva.shape Xvs = self.sc.transform(Xva.reshape(-1,F)).reshape(Bv,Tv,F) tl=DataLoader(SCDataset(Xs,ytr),bs,shuffle=True) vl=DataLoader(SCDataset(Xvs,yva),bs) best,bs_=float("inf"),None for ep in range(epochs): tr=self._ep(tl,True); va=self._ep(vl,False); self.sched.step() self.history["train_loss"].append(tr); self.history["val_loss"].append(va) if vaijh",Q,K)/(self.hd**0.5) sc=sc*self.gate(er) sc=sc.masked_fill((adj==0).unsqueeze(-1).expand_as(sc),-1e9) at=F.softmax(sc,dim=1) o=torch.einsum("ijh,jhd->ihd",at,V).reshape(N,D) return self.out(o) class GHOSTGNNBlock(nn.Module): def __init__(self,nd,ed,rd): super().__init__() self.gi=GlobalRiskInject(nd,rd); self.ea=RiskEdgeAttn(nd,ed) self.n1=nn.LayerNorm(nd); self.n2=nn.LayerNorm(nd) self.ff=nn.Sequential(nn.Linear(nd,nd*2),nn.GELU(),nn.Linear(nd*2,nd)) def forward(self,h,adj,er,gr): h=self.gi(h,gr); h=self.n1(h+self.ea(h,adj,er)); h=self.n2(h+self.ff(h)) return h class GHOSTGNNSimulator(nn.Module): def __init__(self,nfd=6,efd=4,rd=16,hd=64,nl=3): super().__init__() self.ne=nn.Linear(nfd,hd); self.re=nn.Linear(1,rd) self.layers=nn.ModuleList([GHOSTGNNBlock(hd,efd,rd) for _ in range(nl)]) self.head=nn.Sequential(nn.Linear(hd,32),nn.ReLU(),nn.Linear(32,1),nn.Sigmoid()) def forward(self,nf,adj,er,gr): h=self.ne(nf); g=self.re(gr.view(1)) for l in self.layers: h=l(h,adj,er,g) return self.head(h).squeeze(-1), h def graph_to_tensors(G, global_risk=0.5, device=DEVICE): N=G.number_of_nodes() nf,adj,er=[],np.zeros((N,N)),np.zeros((N,N,4)) for n in range(N): d=G.nodes[n] nf.append([d.get("capacity",500)/1000, d.get("reliability",0.9), d.get("inventory",100)/500, d.get("lead_time",10)/30, G.in_degree(n)/N, G.out_degree(n)/N]) for u,v,d in G.edges(data=True): adj[u,v]=1 er[u,v]=[d.get("weight",0.5), d.get("transport_cost",50)/100, d.get("transit_time",7)/14, d.get("reliability",0.9)] return (torch.FloatTensor(nf).to(device), torch.FloatTensor(adj).to(device), torch.FloatTensor(er).to(device), torch.FloatTensor([global_risk]).to(device)) # ============================================================ # MODULE 4 — BOOTSTRAPPED DISRUPTION GENERATOR (NOVEL) # ============================================================ @dataclass class DisruptionScenario: disruption_type: str affected_nodes: List[int] severity: float duration: int delay_factor: float cost_impact: float risk_score: float features: np.ndarray = field(default_factory=lambda: np.array([])) text_description:str = "" class GHOSTDisruptionGenerator: def __init__(self, n_nodes=20, device=DEVICE): self.n_nodes=n_nodes; self.device=device self.generated: List[DisruptionScenario] = [] def _feats(self, dtype, severity): p=DISRUPTION_PARAMS[dtype] base=np.array([100*(1-p["delay"]), 95*p["delay"], np.random.uniform(20,100), p["dur"], 50*p["cost"], p["delay"]*0.9, 1-p["delay"], p["cost"]]) return np.clip(base+np.random.normal(0,0.05,8)*base, 0, None).astype(np.float32) def _score(self, sc, risk_model=None): p=DISRUPTION_PARAMS[sc.disruption_type] s=(sc.severity*0.4+(1-p["delay"])*0.3+min(p["cost"]/4,1)*0.3) if risk_model is not None: try: feat=torch.FloatTensor(sc.features[np.newaxis,np.newaxis,:] .repeat(30,axis=1)).to(self.device) with torch.no_grad(): pred,_=risk_model(feat) s=0.6*s+0.4*pred.item() except: pass return float(np.clip(s,0,1)) @staticmethod def _diverse(cand, selected, thr=0.85): for s in selected: sim=float(np.dot(cand.features,s.features)/ (np.linalg.norm(cand.features)*np.linalg.norm(s.features)+1e-8)) if sim>thr: return False return True def generate(self, n=16, risk_model=None, top_k=8): cands=[] for _ in range(n): dtype=np.random.choice(list(DISRUPTION_PARAMS.keys())) p=DISRUPTION_PARAMS[dtype] naff=np.random.randint(1,max(2,self.n_nodes//4)) aff=np.random.choice(self.n_nodes,naff,replace=False).tolist() sev=p["sev"]*np.random.uniform(0.8,1.2) feats=self._feats(dtype,sev) sc=DisruptionScenario(dtype,aff,float(np.clip(sev,0,1)), int(p["dur"]*np.random.uniform(0.8,1.2)), float(1-p["delay"]*np.random.uniform(0.9,1.1)), float(p["cost"]*np.random.uniform(0.9,1.1)), 0.0, feats, TEXT_MAP[dtype]) sc.risk_score=self._score(sc,risk_model); cands.append(sc) cands.sort(key=lambda s:s.risk_score,reverse=True) sel=[] for sc in cands: if self._diverse(sc,sel): sel.append(sc) if len(sel)>=top_k: break if len(sel)=top_k: break self.generated.extend(sel); return sel def to_training(self, scenarios, seq_len=30): X,y=[],[] for sc in scenarios: seq=np.tile(sc.features,(seq_len,1)) X.append((seq+np.random.normal(0,0.02,seq.shape)).astype(np.float32)) y.append(sc.risk_score) return np.array(X,dtype=np.float32), np.array(y,dtype=np.float32) def zero_shot(self, text): KMAP={"strike":"port_strike","port":"port_strike","dock":"port_strike", "shortage":"raw_material_shortage","material":"raw_material_shortage", "weather":"transportation_delay","flood":"transportation_delay", "storm":"transportation_delay","war":"geopolitical_conflict", "sanction":"geopolitical_conflict","conflict":"geopolitical_conflict", "earthquake":"natural_disaster","hurricane":"natural_disaster", "disaster":"natural_disaster","fire":"natural_disaster", "demand":"demand_surge","surge":"demand_surge","spike":"demand_surge", "bankrupt":"supplier_bankruptcy","closure":"supplier_bankruptcy", "cyber":"cyber_attack","hack":"cyber_attack","ransomware":"cyber_attack"} tl=text.lower(); scores={t:0.0 for t in DISRUPTION_PARAMS} for kw,dt in KMAP.items(): if kw in tl: scores[dt]+=1.0 for dt,desc in TEXT_MAP.items(): scores[dt]+=len(set(desc.lower().split())&set(tl.split()))*0.3 best=max(scores,key=scores.get) conf=min(scores[best]/5.0,1.0) if scores[best]>0 else 0.1 p=DISRUPTION_PARAMS[best] naff=np.random.randint(1,max(2,self.n_nodes//4)) aff=np.random.choice(self.n_nodes,naff,replace=False).tolist() sev=p["sev"]*(0.5+0.5*conf) feats=self._feats(best,sev) sc=DisruptionScenario(best,aff,float(np.clip(sev,0,1)), int(p["dur"]*(0.8+0.4*conf)), float(1-p["delay"]*conf), float(p["cost"]*(0.5+0.5*conf)), float(sev),feats,text) print(f" [Zero-Shot] '{text[:55]}' → {best} (conf={conf:.2f}) risk={sc.risk_score:.3f}") return sc # ============================================================ # MODULE 5 — BOOTSTRAP SELF-DISTILLATION LOOP (NOVEL) # ============================================================ class GHOSTBootstrapLoop: def __init__(self, trainer, generator): self.trainer=trainer; self.gen=generator; self.history=[] def _blend(self, Xr, yr, Xs, ys, alpha=0.55): nr=int(len(Xr)*alpha); ns=min(len(Xs),max(1,len(Xr)-nr)) ir=np.random.choice(len(Xr),nr,replace=False) is_=np.random.choice(len(Xs),ns,replace=False) return np.concatenate([Xr[ir],Xs[is_]]), np.concatenate([yr[ir],ys[is_]]) def _mse(self, Xv, yv): p,_=self.trainer.predict(Xv); return float(np.mean((p-yv)**2)) def run(self, Xtr, ytr, Xva, yva, n_iter=3, n_sc=16, top_k=8, epochs=15): print("\n"+"="*50+"\n GHOST Bootstrap Self-Distillation\n"+"="*50) base=self._mse(Xva,yva); best=base best_state=copy.deepcopy(self.trainer.model.state_dict()) print(f" Baseline MSE: {base:.6f}") for it in range(n_iter): print(f"\n Iteration {it+1}/{n_iter}") scs=self.gen.generate(n_sc,self.trainer.model,top_k) print(f" Scenarios: {len(scs)} | scores: {[round(s.risk_score,3) for s in scs]}") sX,sy=self.gen.to_training(scs) Xb,yb=self._blend(Xtr,ytr,sX,sy) # mild ratio=2 keeps disruption rate ~30%, avoids distribution shift Xb_os,yb_os=oversample_disruptions(Xb,yb,ratio=2) sp=int(0.85*len(Xb_os)) self.trainer.train(Xb_os[:sp],yb_os[:sp],Xb_os[sp:],yb_os[sp:],epochs=epochs) mse=self._mse(Xva,yva); imp=(best-mse)/best*100 print(f" MSE={mse:.6f} Δ={imp:+.2f}%") self.history.append({"iter":it+1,"mse":mse,"imp":imp}) if mse=steps: break Xb,yb=Xb.to(self.device),yb.to(self.device) p,_=self.model(Xb); l=self.loss(p,yb) self.opt.zero_grad(); l.backward() nn.utils.clip_grad_norm_(self.model.parameters(),1.0); self.opt.step() step+=1 def correct(self, Xp, yp, sc, rounds=2): print("\n [Drift Correction]") for r in range(rounds): # use drifted embedding on round 1 to demonstrate correction if r == 0: emb = self._inject_drift(Xp, sc) print(f" Round {r+1}: [simulated distribution shift applied]") else: emb = self._embed(Xp, sc) d=self._drift(emb) print(f" Round {r+1}: drift={d:.6f} (thr={self.THRESHOLD})") self.history.append(d) if d>self.THRESHOLD: self.ref=0.65*self.ref+0.35*emb self.ref/=(np.linalg.norm(self.ref)+1e-8) self._finetune(Xp,yp,sc); print(" ✓ Corrected") else: print(" ✓ OK") # ============================================================ # MODULE 7 — DECISION SUPPORT # ============================================================ RISK_LEVELS = [(0.0,0.2,"LOW","🟢"),(0.2,0.4,"MODERATE","🟡"), (0.4,0.6,"HIGH","🟠"),(0.6,0.8,"SEVERE","🔴"),(0.8,1.1,"CRITICAL","🚨")] MITIGATIONS = { "port_strike": ["Reroute via alternative ports","Increase air freight"], "raw_material_shortage": ["Activate secondary suppliers","Increase safety stock"], "transportation_delay": ["Switch transport modes","Expedite critical shipments"], "geopolitical_conflict": ["Diversify supplier geography","Stockpile components"], "natural_disaster": ["Activate disaster recovery","Deploy emergency reserves"], "demand_surge": ["Increase production capacity","Implement demand rationing"], "supplier_bankruptcy": ["Qualify backup suppliers immediately","Audit single-source deps"], "cyber_attack": ["Isolate affected systems","Switch to manual backup processes"], } def risk_level(score): for lo,hi,lv,ic in RISK_LEVELS: if lo<=score4} {'Type':<15} {'Impact':>8} {'Centrality':>11} {'Score':>7}") print(" "+"-"*47) for n in r["critical_nodes"]: print(f" {n['id']:>4} {n['type']:<15} {n['impact']:>8.3f} {n['cent']:>11.3f} {n['score']:>7.3f}") print("\n DISRUPTION SCENARIOS:") for sc in r["scenarios"]: print(f"\n {sc['ic']} {sc['type'].upper().replace('_',' ')} | risk={sc['risk']:.3f} dur={sc['dur']}d") for st in sc["strats"][:2]: print(f" → {st}") print("\n RECOMMENDED ACTIONS:") for i,rec in enumerate(r["recommendations"],1): print(f" {i:2d}. {rec}") print("="*58) # ============================================================ # MODULE 8 — VISUALISATION # ============================================================ def visualize(G, impact_np, trainer, preds, y, split, boot_res, corrector, report): fig, axes = plt.subplots(2, 3, figsize=(18, 11)) fig.suptitle("GHOST — Supply Chain Threat Detection Dashboard", fontsize=15, fontweight="bold") # 1. network graph ax=axes[0,0] pos=nx.spring_layout(G,seed=42) nc=[impact_np[n] if n0.6 else "#f57c00" if s>0.4 else "#fbc02d" if s>0.2 else "#388e3c" for s in scores] ax.barh(names,scores,color=colors) ax.set_xlim(0,1); ax.set_title("Scenario Risk Scores"); ax.set_xlabel("Risk Score") ax.axvline(0.6,color="red",linestyle="--",alpha=0.5,label="Severe") ax.axvline(0.4,color="orange",linestyle="--",alpha=0.5,label="High") ax.legend(fontsize=8); ax.grid(True,alpha=0.3,axis="x") plt.tight_layout() path="/content/GHOST_outputs/ghost_dashboard.png" plt.savefig(path,dpi=150,bbox_inches="tight"); plt.show() print(f"\n Dashboard saved → {path}") # ============================================================ # REAL DATASET ADAPTER — DataCo Smart Supply Chain # No disruption labels needed — risk inferred on-the-fly # Dataset: https://www.kaggle.com/datasets/shashwatwork/dataco-smart-supply-chain-for-big-data-analysis # ============================================================ def load_dataco(csv_path: str, n_nodes: int = 20, seq_len: int = 30): """ Load the DataCo Smart Supply Chain dataset and convert it into GHOST-compatible features with on-the-fly risk inference. No disruption labels required — risk is derived from anomaly detection. """ from scipy.stats import zscore as sp_zscore print(f" Loading real dataset: {csv_path}") df = pd.read_csv(csv_path, encoding="latin-1") print(f" Raw shape: {df.shape} | Columns: {list(df.columns[:8])}...") # ── column mapping (DataCo specific) ───────────────────── col_map = { "Order Item Quantity": "demand", "Order Item Product Price": "supply", "Order Item Total": "transport_cost", "Days for shipping (real)": "lead_time", "Days for shipment (scheduled)":"scheduled_lead", "Order Item Discount Rate": "discount", "Benefit per order": "benefit", "Sales per customer": "sales", } df = df.rename(columns={k:v for k,v in col_map.items() if k in df.columns}) # ── ensure required columns exist ──────────────────────── required = ["demand","supply","transport_cost","lead_time"] for col in required: if col not in df.columns: df[col] = np.random.uniform(50, 150, len(df)) # fill missing for col in ["demand","supply","transport_cost","lead_time", "discount","benefit","sales","scheduled_lead"]: if col in df.columns: df[col] = pd.to_numeric(df[col], errors="coerce").fillna(df[col].median() if col in df.columns else 0) else: df[col] = np.random.uniform(0, 1, len(df)) # ── on-the-fly risk inference (no labels needed) ───────── # Risk = anomaly score from z-score across key operational metrics risk_cols = ["demand","supply","transport_cost","lead_time"] z_scores = np.abs(sp_zscore(df[risk_cols].fillna(0).values, axis=0)) raw_risk = z_scores.mean(axis=1) # late delivery adds risk if "lead_time" in df.columns and "scheduled_lead" in df.columns: df["delay"] = (df["lead_time"] - df["scheduled_lead"]).clip(lower=0) raw_risk += df["delay"].fillna(0).values * 0.3 # normalize to [0,1] df["risk_score"] = (raw_risk - raw_risk.min()) / (raw_risk.max() - raw_risk.min() + 1e-8) df["risk_score"] = df["risk_score"].clip(0, 1) # ── assign node_ids (simulate supply chain nodes) ───────── df = df.sample(frac=1, random_state=SEED).reset_index(drop=True) df["node_id"] = np.tile(np.arange(n_nodes), int(np.ceil(len(df)/n_nodes)))[:len(df)] # ── on_time_delivery proxy ──────────────────────────────── if "lead_time" in df.columns and "scheduled_lead" in df.columns: df["on_time_delivery"] = (df["lead_time"] <= df["scheduled_lead"]).astype(float) else: df["on_time_delivery"] = np.random.uniform(0.7, 1.0, len(df)) # ── inventory_level proxy ───────────────────────────────── df["inventory_level"] = df["demand"].rolling(7, min_periods=1).mean().fillna(df["demand"].mean()) # ── delay_factor + cost_impact ──────────────────────────── df["delay_factor"] = (df["lead_time"] / (df["lead_time"].max() + 1e-8)).clip(0, 1) df["cost_impact"] = (df["transport_cost"] / (df["transport_cost"].max() + 1e-8)).clip(0, 1) print(f" Processed: {len(df)} rows | " f"Risk range: [{df['risk_score'].min():.3f}, {df['risk_score'].max():.3f}] | " f"Mean risk: {df['risk_score'].mean():.3f}") # ── build sequences ─────────────────────────────────────── fcols = ["demand","supply","inventory_level","lead_time", "transport_cost","on_time_delivery","delay_factor","cost_impact"] X, y = [], [] for nid in range(n_nodes): ndf = df[df["node_id"]==nid].reset_index(drop=True) if len(ndf) < seq_len + 1: continue feats = ndf[fcols].values.astype(np.float32) labels = ndf["risk_score"].values.astype(np.float32) for i in range(seq_len, len(feats)): X.append(feats[i-seq_len:i]); y.append(labels[i]) X = np.array(X, dtype=np.float32) y = np.array(y, dtype=np.float32) print(f" Sequences: X={X.shape} | y={y.shape} | " f"High-risk rate: {(y>0.5).mean():.2%}") return df, X, y def build_graph_from_dataco(df: pd.DataFrame, n_nodes: int = 20) -> nx.DiGraph: """Build a supply chain graph from DataCo order flow data.""" G = nx.DiGraph() for nid in range(n_nodes): ndf = df[df["node_id"]==nid] G.add_node(nid, type=NODE_TYPES[nid % len(NODE_TYPES)], capacity=float(ndf["demand"].mean()) if len(ndf)>0 else 500, reliability=float(ndf["on_time_delivery"].mean()) if len(ndf)>0 else 0.9, inventory=float(ndf["inventory_level"].mean()) if len(ndf)>0 else 100, lead_time=float(ndf["lead_time"].mean()) if len(ndf)>0 else 10, criticality=float(ndf["risk_score"].mean()) if len(ndf)>0 else 0.5) # connect nodes in supply chain order tn = {t:[n for n,d in G.nodes(data=True) if d["type"]==t] for t in NODE_TYPES} for i in range(len(NODE_TYPES)-1): for src in tn[NODE_TYPES[i]]: tgts = random.sample(tn[NODE_TYPES[i+1]], min(np.random.randint(1,3), len(tn[NODE_TYPES[i+1]]))) for dst in tgts: G.add_edge(src, dst, weight=np.random.uniform(0.5,1.0), transport_cost=np.random.uniform(10,100), transit_time=np.random.randint(1,14), reliability=np.random.uniform(0.8,1.0)) return G # ============================================================ # MAIN PIPELINE — runs all 7 stages # ============================================================ def main(real_csv_path: str = None): """ real_csv_path: path to DataCo CSV (optional). If None, uses synthetic data. Download from: https://www.kaggle.com/datasets/shashwatwork/dataco-smart-supply-chain-for-big-data-analysis File: DataCoSupplyChainDataset.csv """ N_NODES = 20 USE_REAL = real_csv_path is not None and os.path.exists(str(real_csv_path)) # ── Stage 1: Data ──────────────────────────────────────── if USE_REAL: print("\n[Stage 1] Loading REAL DataCo Supply Chain Dataset...") df, X, y = load_dataco(real_csv_path, N_NODES) G = build_graph_from_dataco(df, N_NODES) split = int(0.8 * len(X)) print(f" Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges") print(f" Mode: REAL DATA ✅") else: print("\n[Stage 1] Generating synthetic supply chain data...") print(" (To use real data: main('/content/DataCoSupplyChainDataset.csv'))") G = generate_supply_chain_graph(N_NODES) df = generate_time_series(365, N_NODES, disruption_prob=0.05) X, y = create_features(df, seq_len=30) split = int(0.8 * len(X)) print(f" Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges") print(f" Dataset: {len(df)} rows | X: {X.shape} | Disruption rate: {df['disruption'].mean():.2%}") print(f" Mode: SYNTHETIC DATA") # ── Stage 2: LSTM ──────────────────────────────────────── print("\n[Stage 2] Training LSTM Risk Predictor...") # adaptive ratio: lower for real data with rare disruptions ratio = 3 if USE_REAL else 8 X_bal, y_bal = oversample_disruptions(X, y, ratio=ratio) split_bal = int(0.8 * len(X_bal)) model = GHOSTLSTMPredictor(input_dim=X.shape[2]) trainer = GHOSTTrainer(model, device=DEVICE) trainer.train(X_bal[:split_bal], y_bal[:split_bal], X_bal[split_bal:], y_bal[split_bal:], epochs=40) preds, _ = trainer.predict(X[split:]) print(f" Initial MSE: {np.mean((preds - y[split:])**2):.6f}") print(f" Pred range : [{preds.min():.3f}, {preds.max():.3f}]") # ── Stage 3: Bootstrap ─────────────────────────────────── print("\n[Stage 3] Bootstrap Self-Distillation Loop...") gen = GHOSTDisruptionGenerator(N_NODES, DEVICE) loop = GHOSTBootstrapLoop(trainer, gen) boot_res = loop.run(X_bal[:split_bal], y_bal[:split_bal], X[split:], y[split:], n_iter=3, n_sc=16, top_k=8, epochs=20) # ── Stage 4: Drift Correction ──────────────────────────── print("\n[Stage 4] Probe-Based Drift Correction...") dc = GHOSTDriftCorrector(trainer.model, DEVICE) dc.set_ref(X[split:][:50], trainer.sc) dc.correct(X[split:][:50], y[split:][:50], trainer.sc, rounds=2) # ── Stage 5: GNN Simulation ────────────────────────────── print("\n[Stage 5] GNN Dual-Level Risk Injection Simulation...") final_preds, _ = trainer.predict(X[split:]) global_risk = float(np.mean(final_preds)) nf, ad, er, gr = graph_to_tensors(G, global_risk, DEVICE) gnn = GHOSTGNNSimulator().to(DEVICE) with torch.no_grad(): impact, _ = gnn(nf, ad, er, gr) impact_np = impact.cpu().numpy() print(f" Global risk: {global_risk:.3f} | Mean impact: {impact_np.mean():.3f} | Max: {impact_np.max():.3f}") # ── Stage 6: Zero-Shot ─────────────────────────────────── print("\n[Stage 6] Zero-Shot Disruption Scenario Generation...") test_texts = [ "dock workers have walked off the job demanding higher wages", "a massive earthquake has struck the manufacturing region", "hackers have encrypted all warehouse management systems", "unprecedented consumer demand has emptied all retail shelves", "the government has imposed new trade sanctions on key suppliers", ] zs_scenarios = [gen.zero_shot(t) for t in test_texts] boot_scenarios = gen.generate(n=16, risk_model=trainer.model, top_k=8) all_scenarios = zs_scenarios + boot_scenarios # ── Stage 7: Decision Support ──────────────────────────── print("\n[Stage 7] Decision Support Report...") report = generate_report(all_scenarios, impact_np, G) print_report(report) # ── Visualise ──────────────────────────────────────────── visualize(G, impact_np, trainer, final_preds, y, split, boot_res, dc, report) # ── Metrics Summary ────────────────────────────────────── dis_mask = y[split:] > 0.05 if dis_mask.sum() > 0: dis_preds = final_preds[dis_mask] print(f"\n Disruption detection:") print(f" Samples with disruption : {dis_mask.sum()}") print(f" Mean predicted risk : {dis_preds.mean():.3f}") print(f" Detected (pred > 0.1) : {(dis_preds > 0.1).sum()} / {dis_mask.sum()}") print("\n✅ GHOST pipeline complete.") return report # ── RUN ────────────────────────────────────────────────────── # Option A — synthetic data (default, no download needed): report = main() # Option B — real DataCo dataset (recommended for paper): # 1. Download from Kaggle: # https://www.kaggle.com/datasets/shashwatwork/dataco-smart-supply-chain-for-big-data-analysis # 2. Upload DataCoSupplyChainDataset.csv to Colab # 3. Run: # report = main('/content/DataCoSupplyChainDataset.csv')