"""Compute Mordred 2D descriptors for train + test.

Reads:  data/raw/pxr-challenge_TRAIN.csv, pxr-challenge_TEST_BLINDED.csv
Writes: data/processed/features/mordred_train.parquet, mordred_test.parquet

If both output files already exist, they are reused and nothing is recomputed.
"""
| from __future__ import annotations |
| import sys, time |
| from pathlib import Path |
| import numpy as np |
| import pandas as pd |
| from rdkit import Chem, RDLogger |
| from mordred import Calculator, descriptors |
|
|
# Turn off every RDKit "rdApp.*" log channel for the whole process.
RDLogger.DisableLog("rdApp.*")

# The experiment root is the directory one level above this script's folder.
_SCRIPT_DIR = Path(__file__).resolve().parent
EXP_ROOT = _SCRIPT_DIR.parent

# Feature files are written here; the directory is created at import time.
OUT_DIR = EXP_ROOT.joinpath("data", "processed", "features")
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
def compute_mordred(smiles_list: list[str], nproc: int = 8) -> pd.DataFrame:
    """Compute Mordred 2D descriptors for a list of SMILES strings.

    The result has exactly one row per input entry, in input order, so it can
    be aligned positionally with the table the SMILES came from. Entries that
    RDKit cannot parse (``Chem.MolFromSmiles`` returns ``None``) or that are
    not strings at all (e.g. NaN from an empty CSV cell) produce an all-NaN
    row instead of aborting the whole computation.

    Args:
        smiles_list: SMILES strings to featurize.
        nproc: Number of worker processes passed to ``Calculator.pandas``.

    Returns:
        A DataFrame with one column per registered Mordred descriptor
        (3D descriptors are excluded) and ``len(smiles_list)`` rows.
    """
    calc = Calculator(descriptors, ignore_3D=True)

    # Non-string entries are treated like unparseable SMILES rather than
    # being handed to RDKit, which would raise on them.
    mols = [
        Chem.MolFromSmiles(s) if isinstance(s, str) else None
        for s in smiles_list
    ]
    valid_idx = [i for i, mol in enumerate(mols) if mol is not None]
    n_bad = len(mols) - len(valid_idx)
    if n_bad:
        print(
            f"warning: {n_bad} of {len(mols)} SMILES could not be parsed; "
            "their descriptor rows are NaN",
            file=sys.stderr,
        )

    if not valid_idx:
        # Nothing to compute (empty input or no parseable molecule): return a
        # correctly shaped frame without starting any worker processes.
        return pd.DataFrame(
            np.nan,
            index=range(len(mols)),
            columns=[str(d) for d in calc.descriptors],
        )

    df = calc.pandas([mols[i] for i in valid_idx], nproc=nproc, quiet=True)
    if n_bad:
        # Put each computed row back at its original position; the positions
        # of the skipped entries are filled with NaN by reindex().
        df.index = valid_idx
        df = df.reindex(range(len(mols)))
    return df
|
|
|
|
def _coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with every column converted to numeric; bad cells become NaN."""
    return df.apply(pd.to_numeric, errors="coerce")


def _write_parquet_atomic(df: pd.DataFrame, path: Path) -> None:
    """Write *df* to *path* via a temporary sibling file and a rename."""
    tmp = path.with_name(path.name + ".tmp")
    df.to_parquet(tmp, index=False)
    # The final path only ever appears once the file is completely written,
    # so an interrupted run cannot leave a truncated file behind that the
    # exists()-based cache check would later accept.
    tmp.replace(path)


def main() -> int:
    """Compute and store Mordred descriptors for the train and test sets.

    If both output parquet files already exist they are loaded only to report
    their shapes and nothing is recomputed. Otherwise the ``SMILES`` column of
    each raw CSV is featurized with ``compute_mordred``, every column is
    coerced to numeric (non-numeric cells become NaN) and the two tables are
    written to ``OUT_DIR``.

    Returns:
        ``0`` in all cases; intended to be used as the process exit code.
    """
    p_tr = OUT_DIR / "mordred_train.parquet"
    p_te = OUT_DIR / "mordred_test.parquet"
    if p_tr.exists() and p_te.exists():
        a = pd.read_parquet(p_tr)
        b = pd.read_parquet(p_te)
        print(f" cache hit: {p_tr.name} ({a.shape}), {p_te.name} ({b.shape})")
        return 0

    raw_dir = EXP_ROOT / "data" / "raw"
    train = pd.read_csv(raw_dir / "pxr-challenge_TRAIN.csv")
    test = pd.read_csv(raw_dir / "pxr-challenge_TEST_BLINDED.csv")

    # perf_counter() is monotonic, so the reported durations cannot be skewed
    # by wall-clock adjustments during a long run.
    t0 = time.perf_counter()
    print(f"computing Mordred on {len(train)} train compounds...")
    df_tr = compute_mordred(train["SMILES"].tolist())
    print(f" train done in {time.perf_counter()-t0:.1f}s, shape={df_tr.shape}")

    t1 = time.perf_counter()
    print(f"computing Mordred on {len(test)} test compounds...")
    df_te = compute_mordred(test["SMILES"].tolist())
    print(f" test done in {time.perf_counter()-t1:.1f}s, shape={df_te.shape}")

    # Cells that are not plain numbers are turned into NaN so every column
    # can be stored with a numeric dtype.
    df_tr = _coerce_numeric(df_tr)
    df_te = _coerce_numeric(df_te)

    _write_parquet_atomic(df_tr, p_tr)
    _write_parquet_atomic(df_te, p_te)
    print(f"saved. total {time.perf_counter()-t0:.1f}s")
    return 0
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|