# RyeCatcher's picture
# Add files using upload-large-folder tool
# 1757924 verified
"""Compute Mordred 2D descriptors for train + test.
Reads: data/raw/pxr-challenge_TRAIN.csv, pxr-challenge_TEST_BLINDED.csv
Writes: data/processed/features/mordred_train.parquet, mordred_test.parquet
"""
from __future__ import annotations
import sys, time
from pathlib import Path
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from mordred import Calculator, descriptors
# Experiment root: the parent of the directory that contains this file.
EXP_ROOT = Path(__file__).resolve().parent.parent

# Feature tables are written here; the directory is created on import.
OUT_DIR = EXP_ROOT.joinpath("data", "processed", "features")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Turn off RDKit logging for every channel matching "rdApp.*".
RDLogger.DisableLog("rdApp.*")
def compute_mordred(smiles_list: list[str], nproc: int = 8) -> pd.DataFrame:
    """Compute Mordred 2D descriptors for a list of SMILES strings.

    Args:
        smiles_list: SMILES strings, one per compound.
        nproc: Number of worker processes handed to ``Calculator.pandas``
            (Mordred does the parallelisation itself).

    Returns:
        A DataFrame with one row per entry of ``smiles_list``, in input
        order, and one column per descriptor. Entries that are not strings
        or that RDKit cannot parse produce an all-NaN row so the output
        stays positionally aligned with the input.
    """
    calc = Calculator(descriptors, ignore_3D=True)

    # MolFromSmiles yields None for unparsable SMILES; non-string cells
    # (e.g. NaN read from an empty CSV field) are treated the same way
    # instead of being passed to RDKit.
    mols = [
        Chem.MolFromSmiles(s) if isinstance(s, str) else None
        for s in smiles_list
    ]
    valid_idx = [i for i, mol in enumerate(mols) if mol is not None]

    # Fast path: everything parsed, behave exactly as before.
    if len(valid_idx) == len(mols):
        return calc.pandas(mols, nproc=nproc, quiet=True)

    n_bad = len(mols) - len(valid_idx)
    print(
        f"warning: {n_bad} of {len(mols)} SMILES could not be parsed; "
        "their descriptor rows are filled with NaN",
        file=sys.stderr,
    )

    # Only feed real molecules to Mordred, then put each result row back at
    # its original position; reindex inserts NaN rows for the skipped ones.
    df = calc.pandas([mols[i] for i in valid_idx], nproc=nproc, quiet=True)
    df.index = valid_idx
    return df.reindex(range(len(mols)))
def main():
    """Build, or reuse, the Mordred feature tables for train and test.

    If both output parquet files already exist they are read back only to
    report their shapes, and nothing is recomputed. Otherwise the ``SMILES``
    column of each raw CSV is passed to ``compute_mordred``, every resulting
    column is coerced to a numeric dtype, and both tables are written to
    ``OUT_DIR``.

    Returns:
        0, both on a cache hit and after a fresh computation.
    """
    p_tr = OUT_DIR / "mordred_train.parquet"
    p_te = OUT_DIR / "mordred_test.parquet"

    # Both outputs on disk -> nothing to do beyond reporting their shapes.
    if all(p.exists() for p in (p_tr, p_te)):
        a = pd.read_parquet(p_tr)
        b = pd.read_parquet(p_te)
        print(f" cache hit: {p_tr.name} ({a.shape}), {p_te.name} ({b.shape})")
        return 0

    raw_dir = EXP_ROOT / "data" / "raw"
    train = pd.read_csv(raw_dir / "pxr-challenge_TRAIN.csv")
    test = pd.read_csv(raw_dir / "pxr-challenge_TEST_BLINDED.csv")

    # t0 also serves as the start of the overall timing reported at the end.
    t0 = time.time()
    print(f"computing Mordred on {len(train)} train compounds...")
    train_smiles = train["SMILES"].tolist()
    df_tr = compute_mordred(train_smiles)
    print(f" train done in {time.time()-t0:.1f}s, shape={df_tr.shape}")

    t1 = time.time()
    print(f"computing Mordred on {len(test)} test compounds...")
    test_smiles = test["SMILES"].tolist()
    df_te = compute_mordred(test_smiles)
    print(f" test done in {time.time()-t1:.1f}s, shape={df_te.shape}")

    # Failed descriptors come back as non-numeric Mordred "Missing" objects;
    # coercing column by column turns them into NaN in a numeric column.
    for frame in (df_tr, df_te):
        for col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors="coerce")

    for frame, dest in ((df_tr, p_tr), (df_te, p_te)):
        frame.to_parquet(dest, index=False)

    print(f"saved. total {time.time()-t0:.1f}s")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())