"""Compute Mordred 2D descriptors for train + test. Reads: data/raw/pxr-challenge_TRAIN.csv, pxr-challenge_TEST_BLINDED.csv Writes: data/processed/features/mordred_train.parquet, mordred_test.parquet """ from __future__ import annotations import sys, time from pathlib import Path import numpy as np import pandas as pd from rdkit import Chem, RDLogger from mordred import Calculator, descriptors RDLogger.DisableLog("rdApp.*") EXP_ROOT = Path(__file__).resolve().parent.parent OUT_DIR = EXP_ROOT / "data" / "processed" / "features" OUT_DIR.mkdir(parents=True, exist_ok=True) def compute_mordred(smiles_list: list[str]) -> pd.DataFrame: calc = Calculator(descriptors, ignore_3D=True) mols = [Chem.MolFromSmiles(s) for s in smiles_list] # mordred handles parallelism internally if nproc set df = calc.pandas(mols, nproc=8, quiet=True) return df def main(): p_tr = OUT_DIR / "mordred_train.parquet" p_te = OUT_DIR / "mordred_test.parquet" if p_tr.exists() and p_te.exists(): a = pd.read_parquet(p_tr) b = pd.read_parquet(p_te) print(f" cache hit: {p_tr.name} ({a.shape}), {p_te.name} ({b.shape})") return 0 train = pd.read_csv(EXP_ROOT / "data" / "raw" / "pxr-challenge_TRAIN.csv") test = pd.read_csv(EXP_ROOT / "data" / "raw" / "pxr-challenge_TEST_BLINDED.csv") t0 = time.time() print(f"computing Mordred on {len(train)} train compounds...") df_tr = compute_mordred(train["SMILES"].tolist()) print(f" train done in {time.time()-t0:.1f}s, shape={df_tr.shape}") t1 = time.time() print(f"computing Mordred on {len(test)} test compounds...") df_te = compute_mordred(test["SMILES"].tolist()) print(f" test done in {time.time()-t1:.1f}s, shape={df_te.shape}") # Cast object columns (failed descriptors stored as Mordred Missing) to NaN→float for df in (df_tr, df_te): for c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce") df_tr.to_parquet(p_tr, index=False) df_te.to_parquet(p_te, index=False) print(f"saved. total {time.time()-t0:.1f}s") return 0 if __name__ == "__main__": sys.exit(main())