"""Extract CheMeleon embeddings (2048-dim frozen) for train + test. Reads: data/raw/pxr-challenge_TRAIN.csv, pxr-challenge_TEST_BLINDED.csv Writes: data/processed/features/chemeleon_train.npy, chemeleon_test.npy """ from __future__ import annotations import sys, time from pathlib import Path import numpy as np import pandas as pd EXP_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(EXP_ROOT / "code")) OUT_DIR = EXP_ROOT / "data" / "processed" / "features" OUT_DIR.mkdir(parents=True, exist_ok=True) def main(): from chemeleon_fingerprint import CheMeleonFingerprint p_tr = OUT_DIR / "chemeleon_train.npy" p_te = OUT_DIR / "chemeleon_test.npy" if p_tr.exists() and p_te.exists(): print(f" cache hit: {p_tr.name}, {p_te.name}") a = np.load(p_tr); b = np.load(p_te) print(f" shapes: train={a.shape} test={b.shape}") return 0 train = pd.read_csv(EXP_ROOT / "data" / "raw" / "pxr-challenge_TRAIN.csv") test = pd.read_csv(EXP_ROOT / "data" / "raw" / "pxr-challenge_TEST_BLINDED.csv") print(f"loading CheMeleon...") fp = CheMeleonFingerprint() print(f"computing train embeddings ({len(train)})...") t0 = time.time() # batch in chunks of 256 to avoid OOM BATCH = 256 train_embs = [] for i in range(0, len(train), BATCH): chunk = train["SMILES"].iloc[i:i+BATCH].tolist() train_embs.append(fp(chunk)) print(f" {i+len(chunk)}/{len(train)} ({(time.time()-t0):.1f}s)") train_emb = np.vstack(train_embs) np.save(p_tr, train_emb) print(f"computing test embeddings ({len(test)})...") test_embs = [] for i in range(0, len(test), BATCH): chunk = test["SMILES"].iloc[i:i+BATCH].tolist() test_embs.append(fp(chunk)) test_emb = np.vstack(test_embs) np.save(p_te, test_emb) print(f"saved train={train_emb.shape} test={test_emb.shape} in {time.time()-t0:.1f}s") return 0 if __name__ == "__main__": sys.exit(main())