| |
| """Build feature store: BOW / TF-IDF + optional SVD from extracted JSONL. |
| |
| Usage: |
| PYTHONPATH=backend python backend/scripts/build_feature_store.py --input data/extracted_full.jsonl --out models/feature_meta_tfidf.joblib --method tfidf --max-features 10000 --svd-components 100 |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| from pathlib import Path |
| from typing import List |
|
|
| import joblib |
|
|
| try: |
| from app.services.cv_extractor import CVExtractionService |
| except Exception: |
| CVExtractionService = None |
|
|
| from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
| from sklearn.decomposition import TruncatedSVD |
|
|
|
|
def read_file_paths(jsonl_path: Path) -> List[Path]:
    """Collect the ``"file"`` entries of a JSONL file as ``Path`` objects.

    Reading is best-effort: blank lines, lines that are not valid JSON,
    records that are not JSON objects, and records whose ``"file"`` value is
    missing, empty, or not a string are skipped silently.

    Args:
        jsonl_path: Path to a JSON Lines file with one record per line.

    Returns:
        The paths in file order, one per usable record.

    Raises:
        OSError: If ``jsonl_path`` cannot be opened.
    """
    files: List[Path] = []
    # "utf-8-sig" strips a leading byte-order mark when present and behaves
    # like plain UTF-8 otherwise. With plain "utf-8" the BOM stays glued to
    # the first line, json.loads rejects it, and the first record is lost.
    with jsonl_path.open("r", encoding="utf-8-sig") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except (ValueError, RecursionError):
                # Malformed (or pathologically nested) JSON: skip the line.
                continue
            if not isinstance(rec, dict):
                continue
            f = rec.get("file")
            # JSON can only yield str for a usable path; numbers, lists, etc.
            # are not valid Path arguments and are skipped.
            if isinstance(f, str) and f:
                files.append(Path(f))
    return files
|
|
|
|
def extract_texts(file_paths: List[Path]):
    """Return one text string per input path, in the same order.

    ``.txt`` files (suffix compared case-insensitively) are read directly as
    UTF-8 with undecodable bytes ignored. Any other file goes through
    ``CVExtractionService.extract_from_pdf`` when that service was importable,
    using the result's ``raw_text`` (or ``""`` when it is falsy); without the
    service the file is read as text exactly like a ``.txt`` file.

    Any exception raised while handling a file yields ``""`` for that file, so
    the returned list always has the same length as ``file_paths``.
    """
    extractor = None if CVExtractionService is None else CVExtractionService()

    def _load(path):
        # Plain read for .txt files, and as the fallback when no extraction
        # service is available.
        is_plain_text = path.suffix.lower() == ".txt"
        if is_plain_text or extractor is None:
            return path.read_text(encoding="utf-8", errors="ignore")
        extracted = extractor.extract_from_pdf(str(path))
        return extracted.raw_text or ""

    collected = []
    for path in file_paths:
        try:
            content = _load(path)
        except Exception:
            # Best-effort: keep the output aligned with the input list.
            content = ""
        collected.append(content)
    return collected
|
|
|
|
def main(argv=None):
    """Command-line entry point: fit text vectorizers and save them with joblib.

    Reads file paths from the ``--input`` JSONL, extracts one text per file,
    fits the vectorizer(s) selected by ``--method`` and dumps a metadata dict
    to ``--out``. The dict always holds ``"method"`` and ``"n_files"``; it
    holds ``"tfidf"`` and ``"svd"`` (a fitted ``TruncatedSVD`` or ``None``)
    for the ``tfidf``/``both`` methods and ``"bow"`` for ``bow``/``both``.

    Args:
        argv: Argument list passed to ``argparse``; ``None`` makes argparse
            use ``sys.argv``.

    Returns:
        ``0`` on success, ``2`` when there is nothing to build from (no file
        paths in the input, or no text extracted from any file).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="extracted JSONL with file paths")
    parser.add_argument("--out", required=True, help="output joblib path for feature_meta")
    parser.add_argument("--method", choices=["tfidf", "bow", "both"], default="tfidf")
    parser.add_argument("--max-features", type=int, default=10000)
    parser.add_argument("--svd-components", type=int, default=100)
    parser.add_argument("--limit", type=int, default=0, help="limit number of files (0 = all)")
    args = parser.parse_args(argv)

    jsonl = Path(args.input)
    files = read_file_paths(jsonl)
    if args.limit and args.limit > 0:
        files = files[: args.limit]
    if not files:
        print("No files found to build feature store")
        return 2

    print(f"Found {len(files)} files, extracting texts...")
    texts = extract_texts(files)

    # extract_texts substitutes "" for every file it could not read. Surface
    # that here: an all-empty corpus would otherwise make the vectorizer fail
    # with an opaque "empty vocabulary" error.
    n_empty = sum(1 for t in texts if not t.strip())
    if n_empty == len(texts):
        print("No text could be extracted from any file; nothing to fit")
        return 2
    if n_empty:
        print(f"Warning: {n_empty} of {len(texts)} files produced no text")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    meta = {"method": args.method, "n_files": len(files)}

    if args.method in ("tfidf", "both"):
        print("Fitting TfidfVectorizer...")
        tfidf = TfidfVectorizer(max_features=args.max_features, ngram_range=(1, 2))
        # fit_transform gives the same matrix as fit() followed by
        # transform() but tokenizes the corpus only once.
        X = tfidf.fit_transform(texts)
        meta["tfidf"] = tfidf

        # SVD is optional: any failure is reported and stored as None so the
        # fitted vectorizer(s) are still saved.
        try:
            # Request at most --svd-components, bounded by
            # max(2, vocabulary size - 1).
            n_comp = min(args.svd_components, max(2, X.shape[1] - 1))
            if n_comp > 0:
                print(f"Fitting TruncatedSVD with n_components={n_comp}...")
                svd = TruncatedSVD(n_components=n_comp)
                svd.fit(X)
                meta["svd"] = svd
            else:
                meta["svd"] = None
        except Exception as exc:
            print(f"SVD fit failed: {exc}")
            meta["svd"] = None

    if args.method in ("bow", "both"):
        print("Fitting CountVectorizer (BOW)...")
        bow = CountVectorizer(max_features=args.max_features)
        bow.fit(texts)
        meta["bow"] = bow

    joblib.dump(meta, out_path)
    print(f"Saved feature meta to {out_path}")
    return 0
|
|
|
|
| if __name__ == "__main__": |
| raise SystemExit(main()) |
|
|