ai-talent-finder-backend / scripts /build_feature_store.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
3.88 kB
#!/usr/bin/env python3
"""Build feature store: BOW / TF-IDF + optional SVD from extracted JSONL.
Usage:
PYTHONPATH=backend python backend/scripts/build_feature_store.py --input data/extracted_full.jsonl --out models/feature_meta_tfidf.joblib --method tfidf --max-features 10000 --svd-components 100
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import List
import joblib
try:
from app.services.cv_extractor import CVExtractionService
except Exception:
CVExtractionService = None
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
def read_file_paths(jsonl_path: Path) -> List[Path]:
    """Collect the ``file`` entries of a JSONL manifest as paths.

    Each line of *jsonl_path* is parsed as one JSON document.  A line is
    skipped (never fatal) when it is blank, is not valid JSON, is not a
    JSON object, or when its ``file`` value is missing, empty or not a
    string.

    Args:
        jsonl_path: UTF-8 encoded JSONL file to read.

    Returns:
        The ``file`` values wrapped in ``Path``, in the order they appear.

    Raises:
        OSError: If *jsonl_path* cannot be opened.
    """
    files: List[Path] = []
    with jsonl_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                # One malformed record must not abort the whole build.
                continue
            if not isinstance(rec, dict):
                # Valid JSON but not an object (list, string, number, ...).
                continue
            f = rec.get("file")
            if isinstance(f, str) and f:
                files.append(Path(f))
    return files
def extract_texts(file_paths: List[Path]) -> List[str]:
    """Return the text of every file, one entry per input path.

    ``.txt`` files are read directly as UTF-8, ignoring undecodable bytes.
    Any other file is passed to ``CVExtractionService.extract_from_pdf``
    when that service was importable; when it was not, the file is read as
    UTF-8 text instead.

    A file that cannot be processed contributes an empty string, so the
    result always has the same length and order as *file_paths*.  Each
    such failure is reported on stdout rather than dropped silently.

    Args:
        file_paths: Files to extract text from.

    Returns:
        One text per entry of *file_paths* (``""`` for failed files).
    """
    service = CVExtractionService() if CVExtractionService is not None else None
    texts: List[str] = []
    for p in file_paths:
        try:
            if p.suffix.lower() == ".txt" or service is None:
                text = p.read_text(encoding="utf-8", errors="ignore")
            else:
                res = service.extract_from_pdf(str(p))
                text = res.raw_text or ""
        except Exception as exc:
            # Deliberately best-effort: keep the placeholder so positions
            # stay aligned with file_paths, but make the failure visible.
            print(f"Failed to extract text from {p}: {exc}")
            text = ""
        texts.append(text)
    return texts
def main(argv=None):
    """Build the feature store and save it with joblib.

    Reads file paths from the ``--input`` JSONL, extracts their texts, fits
    the vectorizer(s) selected by ``--method`` and dumps a dict to ``--out``
    with the keys ``method`` and ``n_files`` plus, depending on the method,
    ``tfidf``, ``svd`` (``None`` when SVD was skipped or failed) and ``bow``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``2`` when there are no input files or no text
        could be extracted from any of them.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="extracted JSONL with file paths")
    parser.add_argument("--out", required=True, help="output joblib path for feature_meta")
    parser.add_argument("--method", choices=["tfidf", "bow", "both"], default="tfidf")
    parser.add_argument("--max-features", type=int, default=10000)
    parser.add_argument("--svd-components", type=int, default=100)
    parser.add_argument("--limit", type=int, default=0, help="limit number of files (0 = all)")
    args = parser.parse_args(argv)

    files = read_file_paths(Path(args.input))
    if args.limit > 0:
        files = files[: args.limit]
    if not files:
        print("No files found to build feature store")
        return 2

    print(f"Found {len(files)} files, extracting texts...")
    texts = extract_texts(files)
    if not any(texts):
        # Every extraction came back empty; fitting a vectorizer on this
        # would only fail later with an unrelated "empty vocabulary" error.
        print("No text could be extracted from the input files")
        return 2

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    meta = {"method": args.method, "n_files": len(files)}

    if args.method in ("tfidf", "both"):
        print("Fitting TfidfVectorizer...")
        tfidf = TfidfVectorizer(max_features=args.max_features, ngram_range=(1, 2))
        # fit_transform learns the vocabulary and builds the document-term
        # matrix in a single pass over the corpus.
        X = tfidf.fit_transform(texts)
        meta["tfidf"] = tfidf

        # Keep the component count strictly below the vocabulary size; a
        # non-positive result (tiny vocabulary or --svd-components <= 0)
        # means SVD is skipped.
        n_comp = min(args.svd_components, X.shape[1] - 1)
        meta["svd"] = None
        if n_comp >= 1:
            try:
                print(f"Fitting TruncatedSVD with n_components={n_comp}...")
                svd = TruncatedSVD(n_components=n_comp)
                svd.fit(X)
                meta["svd"] = svd
            except Exception as exc:
                # SVD is optional: keep the fitted vectorizer and carry on.
                print(f"SVD fit failed: {exc}")
        else:
            print("Skipping SVD (no components requested or vocabulary too small)")

    if args.method in ("bow", "both"):
        print("Fitting CountVectorizer (BOW)...")
        bow = CountVectorizer(max_features=args.max_features)
        bow.fit(texts)
        meta["bow"] = bow

    joblib.dump(meta, out_path)
    print(f"Saved feature meta to {out_path}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)