File size: 3,876 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
"""Build feature store: BOW / TF-IDF + optional SVD from extracted JSONL.

Usage:
  PYTHONPATH=backend python backend/scripts/build_feature_store.py --input data/extracted_full.jsonl --out models/feature_meta_tfidf.joblib --method tfidf --max-features 10000 --svd-components 100
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import List

import joblib

try:
    from app.services.cv_extractor import CVExtractionService
except Exception:
    CVExtractionService = None

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD


def read_file_paths(jsonl_path: Path) -> List[Path]:
    """Collect the ``file`` entries of a JSONL manifest as ``Path`` objects.

    Each line of *jsonl_path* is parsed as one JSON record. Records whose
    ``file`` value is truthy contribute one path, in input order.

    Lines that cannot be used (invalid JSON, a record without a ``.get``
    method, a ``file`` value that ``Path`` rejects) are skipped silently.

    Args:
        jsonl_path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The collected paths; empty when no line yields one.
    """
    collected: List[Path] = []
    with jsonl_path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            try:
                entry = json.loads(raw_line).get("file")
                candidate = Path(entry) if entry else None
            except Exception:
                # Best-effort reader: an unusable line must not abort the run.
                continue
            if candidate is not None:
                collected.append(candidate)
    return collected


def extract_texts(file_paths: List[Path]):
    """Return one text string per entry of *file_paths*, in the same order.

    ``.txt`` files (suffix compared case-insensitively) are read directly as
    UTF-8 with undecodable bytes ignored. Any other file goes through
    ``CVExtractionService.extract_from_pdf`` when that service was importable,
    using the result's ``raw_text`` (or ``""`` if it is falsy). Without the
    service the file is read as text the same way as a ``.txt`` file.

    A file whose processing raises any ``Exception`` contributes ``""``, so
    the returned list always has the same length as *file_paths*.
    """
    extractor = None if CVExtractionService is None else CVExtractionService()

    def _load(path):
        # Suffix is checked first, matching the order the fallback depends on.
        if path.suffix.lower() == ".txt" or extractor is None:
            return path.read_text(encoding="utf-8", errors="ignore")
        return extractor.extract_from_pdf(str(path)).raw_text or ""

    collected = []
    for path in file_paths:
        try:
            content = _load(path)
        except Exception:
            # Best-effort: keep positional alignment with file_paths.
            content = ""
        collected.append(content)
    return collected


def main(argv=None):
    """Fit the requested vectorizer(s) and persist them as feature metadata.

    Parses the command-line options, reads the file list from the ``--input``
    JSONL, extracts one text per file, fits a TF-IDF and/or bag-of-words
    vectorizer (plus a TruncatedSVD on the TF-IDF matrix), and dumps the
    resulting dict to ``--out`` with joblib.

    The saved dict holds ``method`` and ``n_files``, plus ``tfidf`` and ``svd``
    (``None`` when SVD is skipped or fails) for the TF-IDF path and ``bow``
    for the bag-of-words path.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use
            ``sys.argv[1:]``.

    Returns:
        0 on success; 2 when there is nothing to build from (no file paths
        in the input, or every extracted text is blank).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="extracted JSONL with file paths")
    parser.add_argument("--out", required=True, help="output joblib path for feature_meta")
    parser.add_argument("--method", choices=["tfidf", "bow", "both"], default="tfidf")
    parser.add_argument("--max-features", type=int, default=10000)
    parser.add_argument("--svd-components", type=int, default=100)
    parser.add_argument("--limit", type=int, default=0, help="limit number of files (0 = all)")
    args = parser.parse_args(argv)

    jsonl = Path(args.input)
    files = read_file_paths(jsonl)
    if args.limit and args.limit > 0:
        files = files[: args.limit]
    if not files:
        print("No files found to build feature store")
        return 2

    print(f"Found {len(files)} files, extracting texts...")
    texts = extract_texts(files)

    # Extraction failures come back as "", so an all-blank corpus would only
    # surface later as an opaque "empty vocabulary" error from the vectorizer.
    n_blank = sum(1 for text in texts if not text or not text.strip())
    if n_blank == len(texts):
        print("No text could be extracted from any file; nothing to fit")
        return 2
    if n_blank:
        print(f"Warning: {n_blank} of {len(texts)} files yielded no text")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    meta = {"method": args.method, "n_files": len(files)}

    if args.method in ("tfidf", "both"):
        print("Fitting TfidfVectorizer...")
        tfidf = TfidfVectorizer(max_features=args.max_features, ngram_range=(1, 2))
        # fit_transform tokenizes the corpus once; the matrix feeds the SVD.
        X = tfidf.fit_transform(texts)
        meta["tfidf"] = tfidf
        # Fit SVD on the TF-IDF matrix; a failure here is non-fatal and is
        # recorded as svd=None so the vectorizer is still saved.
        try:
            n_comp = min(args.svd_components, max(2, X.shape[1] - 1))
            if n_comp > 0:
                print(f"Fitting TruncatedSVD with n_components={n_comp}...")
                svd = TruncatedSVD(n_components=n_comp)
                svd.fit(X)
                meta["svd"] = svd
            else:
                # --svd-components <= 0 disables the reduction step.
                meta["svd"] = None
        except Exception as exc:
            print(f"SVD fit failed: {exc}")
            meta["svd"] = None

    if args.method in ("bow", "both"):
        print("Fitting CountVectorizer (BOW)...")
        bow = CountVectorizer(max_features=args.max_features)
        bow.fit(texts)
        meta["bow"] = bow

    joblib.dump(meta, out_path)
    print(f"Saved feature meta to {out_path}")
    return 0


if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    raise SystemExit(main())