# openadmet-pxr-challenge-2026/code/featurization/featurize_chemeleon.py
# RyeCatcher: "Add files using upload-large-folder tool" (1757924, verified)
"""Extract CheMeleon embeddings (2048-dim frozen) for train + test.
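
Reads (relative to EXP_ROOT; molecules come from the `SMILES` column):
    data/raw/pxr-challenge_TRAIN.csv, data/raw/pxr-challenge_TEST_BLINDED.csv
Writes (relative to EXP_ROOT):
    data/processed/features/chemeleon_train.npy, chemeleon_test.npy
If both output files already exist they are reused and nothing is recomputed.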
"""
from __future__ import annotations
import sys, time
from pathlib import Path
import numpy as np
import pandas as pd
# Experiment root: two directory levels above this file.
# NOTE(review): if this script sits in <root>/code/featurization/, two levels
# up is <root>/code rather than <root> -- confirm the intended layout.
EXP_ROOT = Path(__file__).resolve().parents[1]

# Put <EXP_ROOT>/code first on the import path (presumably where the
# chemeleon_fingerprint module imported by main() lives -- verify).
sys.path.insert(0, str(EXP_ROOT.joinpath("code")))

# Destination directory for the saved feature arrays; created at import time
# (including missing parents) so later writes cannot fail on a missing folder.
OUT_DIR = EXP_ROOT.joinpath("data", "processed", "features")
OUT_DIR.mkdir(parents=True, exist_ok=True)
def _read_smiles(csv_path):
    """Return the ``SMILES`` column of the CSV at ``csv_path`` as a list.

    Raises:
        KeyError: If the CSV has no ``SMILES`` column.
    """
    frame = pd.read_csv(csv_path)
    if "SMILES" not in frame.columns:
        raise KeyError(f"'SMILES' column not found in {csv_path}")
    return frame["SMILES"].tolist()


def _embed_in_batches(featurizer, smiles, batch_size, label, t0):
    """Embed ``smiles`` chunk by chunk and stack the chunk outputs row-wise.

    Args:
        featurizer: Callable that takes a list of SMILES strings and returns
            an array-like of embeddings for that list.
        smiles: List of SMILES strings to embed.
        batch_size: Maximum number of SMILES passed to ``featurizer`` per call.
        label: Split name used in error messages (e.g. ``"train"``).
        t0: ``time.time()`` reference for the elapsed-time progress output.

    Returns:
        The ``np.vstack`` of all per-chunk outputs.

    Raises:
        ValueError: If ``smiles`` is empty, or if the stacked array does not
            have exactly one row per input SMILES.
    """
    total = len(smiles)
    if total == 0:
        # np.vstack([]) would fail with an opaque message; be explicit instead.
        raise ValueError(f"no SMILES to embed for the {label} split")

    parts = []
    for start in range(0, total, batch_size):
        chunk = smiles[start:start + batch_size]
        parts.append(featurizer(chunk))
        print(f" {start + len(chunk)}/{total} ({(time.time() - t0):.1f}s)")

    emb = np.vstack(parts)
    if emb.shape[0] != total:
        # A row-count mismatch would silently misalign features with the CSV rows.
        raise ValueError(
            f"{label} embeddings have {emb.shape[0]} rows for {total} SMILES"
        )
    return emb


def _save_atomic(path, array):
    """Save ``array`` to ``path`` through a temporary sibling file.

    Writing to ``<stem>.tmp.npy`` and renaming afterwards means an interrupted
    run cannot leave a truncated file at ``path`` that a later run would treat
    as a valid cache entry.
    """
    tmp = path.with_suffix(".tmp.npy")
    np.save(tmp, array)
    tmp.replace(path)


def main():
    """Compute and save CheMeleon embeddings for the train and test CSVs.

    If both output ``.npy`` files already exist in ``OUT_DIR``, their shapes
    are printed and nothing is recomputed. Otherwise the ``SMILES`` column of
    each CSV under ``EXP_ROOT / "data" / "raw"`` is embedded in chunks of 256
    and the stacked arrays are written to ``OUT_DIR``.

    Returns:
        0 on success (used as the process exit status).

    Raises:
        KeyError: If either CSV lacks a ``SMILES`` column.
        ValueError: If a split is empty or its embedding row count does not
            match its number of SMILES.
    """
    p_tr = OUT_DIR / "chemeleon_train.npy"
    p_te = OUT_DIR / "chemeleon_test.npy"
    if p_tr.exists() and p_te.exists():
        print(f" cache hit: {p_tr.name}, {p_te.name}")
        a = np.load(p_tr)
        b = np.load(p_te)
        print(f" shapes: train={a.shape} test={b.shape}")
        return 0

    # Imported only after the cache check so a cache hit does not require the
    # project-local featurizer module (or whatever it imports) to be available.
    from chemeleon_fingerprint import CheMeleonFingerprint

    # Read and validate both inputs before the model is constructed, so a bad
    # CSV fails fast.
    raw_dir = EXP_ROOT / "data" / "raw"
    train_smiles = _read_smiles(raw_dir / "pxr-challenge_TRAIN.csv")
    test_smiles = _read_smiles(raw_dir / "pxr-challenge_TEST_BLINDED.csv")

    print("loading CheMeleon...")
    fp = CheMeleonFingerprint()

    # batch in chunks of 256 to avoid OOM
    batch_size = 256

    print(f"computing train embeddings ({len(train_smiles)})...")
    t0 = time.time()
    train_emb = _embed_in_batches(fp, train_smiles, batch_size, "train", t0)
    _save_atomic(p_tr, train_emb)

    print(f"computing test embeddings ({len(test_smiles)})...")
    test_emb = _embed_in_batches(fp, test_smiles, batch_size, "test", t0)
    _save_atomic(p_te, test_emb)

    print(f"saved train={train_emb.shape} test={test_emb.shape} in {time.time()-t0:.1f}s")
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())