"""Extract frozen 2048-dim CheMeleon embeddings for the train and test sets.

Reads:  data/raw/pxr-challenge_TRAIN.csv
        data/raw/pxr-challenge_TEST_BLINDED.csv
Writes: data/processed/features/chemeleon_train.npy
        data/processed/features/chemeleon_test.npy
"""
| from __future__ import annotations |
| import sys, time |
| from pathlib import Path |
| import numpy as np |
| import pandas as pd |
|
|
# Experiment root: the directory two levels above this script file.
EXP_ROOT = Path(__file__).resolve().parent.parent

# Put <root>/code at the front of the import path so that modules living
# there can be imported later in this script.
sys.path.insert(0, str(EXP_ROOT.joinpath("code")))

# Destination directory for the embedding arrays; created at import time
# (including any missing parents) so later saves cannot fail on a missing dir.
OUT_DIR = EXP_ROOT.joinpath("data", "processed", "features")
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
def _embed_in_batches(fp, smiles, batch_size=256, label="input"):
    """Run ``fp`` over ``smiles`` in fixed-size chunks and stack the results.

    Args:
        fp: Callable that takes a list of SMILES strings and returns something
            ``np.vstack`` can stack (presumably a 2-D array with one row per
            SMILES -- TODO confirm against ``CheMeleonFingerprint``).
        smiles: List of SMILES strings to embed.
        batch_size: Maximum number of SMILES passed to ``fp`` per call.
        label: Name of the split, used only in the empty-input error message.

    Returns:
        The per-chunk outputs stacked vertically with ``np.vstack``.

    Raises:
        ValueError: If ``smiles`` is empty. ``np.vstack`` would raise the same
            exception type for an empty list, but with an opaque message.
    """
    total = len(smiles)
    if total == 0:
        raise ValueError(f"no SMILES to embed for {label}")

    start = time.time()
    parts = []
    for lo in range(0, total, batch_size):
        chunk = smiles[lo:lo + batch_size]
        parts.append(fp(chunk))
        print(f" {lo + len(chunk)}/{total} ({(time.time() - start):.1f}s)")
    return np.vstack(parts)


def _save_npy_atomic(path, arr):
    """Save ``arr`` to ``path`` via a temporary sibling file and a rename.

    Writing in place would leave a truncated ``.npy`` behind if the process is
    interrupted mid-save, and the cache check in ``main`` only tests for file
    existence. Passing an open file handle to ``np.save`` also prevents it from
    appending ``.npy`` to the temporary name.
    """
    tmp = path.with_name(path.name + ".tmp")
    with open(tmp, "wb") as fh:
        np.save(fh, arr)
    tmp.replace(path)


def main():
    """Compute and cache CheMeleon embeddings for the train and test CSVs.

    If both output ``.npy`` files already exist under ``OUT_DIR``, only their
    shapes are reported and nothing is recomputed. Otherwise the ``SMILES``
    column of each raw CSV is embedded in batches of 256 and both arrays are
    saved.

    Returns:
        0 on success, intended as the process exit status.
    """
    p_tr = OUT_DIR / "chemeleon_train.npy"
    p_te = OUT_DIR / "chemeleon_test.npy"
    if p_tr.exists() and p_te.exists():
        print(f" cache hit: {p_tr.name}, {p_te.name}")
        # Memory-map instead of loading: only the header is needed for .shape.
        a = np.load(p_tr, mmap_mode="r")
        b = np.load(p_te, mmap_mode="r")
        print(f" shapes: train={a.shape} test={b.shape}")
        return 0

    # Project-local import, resolved through the sys.path entry added at module
    # level; deferred until after the cache check so a cache hit does not need it.
    from chemeleon_fingerprint import CheMeleonFingerprint

    raw_dir = EXP_ROOT / "data" / "raw"
    train = pd.read_csv(raw_dir / "pxr-challenge_TRAIN.csv")
    test = pd.read_csv(raw_dir / "pxr-challenge_TEST_BLINDED.csv")
    print("loading CheMeleon...")
    fp = CheMeleonFingerprint()

    batch = 256

    print(f"computing train embeddings ({len(train)})...")
    t0 = time.time()
    train_emb = _embed_in_batches(fp, train["SMILES"].tolist(), batch, "train")
    _save_npy_atomic(p_tr, train_emb)

    print(f"computing test embeddings ({len(test)})...")
    test_emb = _embed_in_batches(fp, test["SMILES"].tolist(), batch, "test")
    _save_npy_atomic(p_te, test_emb)

    print(f"saved train={train_emb.shape} test={test_emb.shape} in {time.time()-t0:.1f}s")
    return 0
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|