File size: 7,636 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c53f53f
 
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c53f53f
 
 
 
 
 
 
 
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c53f53f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from __future__ import annotations

import importlib.util
import logging
import os
import shutil
from datetime import datetime
from typing import Dict, List, Optional

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Feature names enforced by assert_required_features() when strict mode is
# enabled but AI_FEATURES_REQUIRED does not list any feature.
_DEFAULT_REQUIRED_FEATURES = [
    "cv_text_extraction",
    "semantic_matching",
]

# Last report produced by detect_capabilities(); stays None until
# get_capabilities() fills it on first use (or on a forced refresh).
_CAPABILITIES_CACHE: Optional[Dict[str, object]] = None


def _has_module(module_name: str) -> bool:
    """Return True when the import system can locate *module_name*.

    ``importlib.util.find_spec`` does not always answer with ``None`` for a
    module it cannot resolve: it raises ``ModuleNotFoundError`` when the
    parent package of a dotted name is missing, and ``ValueError`` when the
    module is already imported but carries no usable ``__spec__``. Both are
    reported as "not available" so a single odd module can never abort the
    whole capability probe.

    Args:
        module_name: Absolute module name, e.g. ``"torch"`` or ``"PIL"``.

    Returns:
        True if a module spec was found, False otherwise.
    """
    try:
        return importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # ImportError covers ModuleNotFoundError (missing parent package).
        return False


def _env_set(name: str) -> bool:
    """Return True when environment variable *name* holds a non-blank value.

    A value made only of whitespace is treated like an unset variable. This
    keeps the presence flags in line with the other environment reads in this
    module, which strip the raw value before testing it.

    Args:
        name: Name of the environment variable to inspect.

    Returns:
        True if the variable exists and is not empty after stripping.
    """
    return bool(os.getenv(name, "").strip())


def _env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean switch.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is not defined at all.

    Returns:
        ``default`` if the variable is absent; otherwise True only when the
        trimmed, lower-cased value is one of ``1``, ``true``, ``yes``, ``on``.
    """
    value = os.getenv(name)
    if value is not None:
        normalized = str(value).strip().lower()
        return normalized in ("1", "true", "yes", "on")
    return default


def _resolve_tesseract_path() -> Optional[str]:
    """Locate the Tesseract executable.

    When ``TESSERACT_CMD`` is set to a non-blank value, only that command is
    looked up; there is no fallback to the default name if it cannot be
    found. Otherwise ``tesseract`` is searched on ``PATH``.

    Returns:
        The resolved executable path, or None when nothing was found.
    """
    configured = os.getenv("TESSERACT_CMD", "").strip()
    candidate = configured if configured else "tesseract"
    return shutil.which(candidate)


def _parse_required_features() -> List[str]:
    """Read the comma-separated feature list from ``AI_FEATURES_REQUIRED``.

    Returns:
        Feature names in the order given, with surrounding whitespace removed
        and empty entries dropped; an empty list when the variable is unset
        or empty.
    """
    raw_value = os.getenv("AI_FEATURES_REQUIRED", "")
    trimmed = map(str.strip, raw_value.split(","))
    # filter(None, ...) discards the empty strings left by stray commas.
    return list(filter(None, trimmed))


def _feature_status(
    required: Dict[str, bool],
    optional: Optional[Dict[str, bool]] = None,
    notes: str | None = None,
) -> Dict[str, object]:
    """Summarize one feature from the presence flags of its dependencies.

    Args:
        required: Dependency name -> whether it is present; any False entry
            makes the feature unavailable.
        optional: Dependency name -> whether it is present; a False entry
            only downgrades an otherwise available feature.
        notes: Free-text remark stored in the result.

    Returns:
        A dict with ``available`` (bool), ``status`` (``"ok"``,
        ``"degraded"`` or ``"missing"``), ``required_missing``,
        ``optional_missing`` (lists of names) and ``notes`` (``""`` when no
        note was given).
    """

    def _absent(checks: Dict[str, bool]) -> List[str]:
        # Names whose presence flag is falsy, in mapping order.
        return [dep for dep, present in checks.items() if not present]

    required_gaps = _absent(required)
    optional_gaps = _absent(optional) if optional else []

    if required_gaps:
        state = "missing"
    elif optional_gaps:
        state = "degraded"
    else:
        state = "ok"

    return {
        "available": not required_gaps,
        "status": state,
        "required_missing": required_gaps,
        "optional_missing": optional_gaps,
        "notes": notes if notes else "",
    }


def detect_capabilities() -> Dict[str, object]:
    """Probe installed modules, binaries and environment for AI features.

    Nothing is cached here; every call re-runs all checks.

    Returns:
        A report dict with these keys:

        * ``timestamp``: naive-UTC ISO 8601 string with a trailing ``Z``.
        * ``strict``: whether ``AI_FEATURES_STRICT`` enables strict mode.
        * ``required_features``: names parsed from ``AI_FEATURES_REQUIRED``.
        * ``dependencies``: dependency name -> bool presence flag.
        * ``api_keys``: variable name -> bool presence flag (the values
          themselves are never stored).
        * ``flags``: resolved boolean switches.
        * ``features``: feature name -> status dict from ``_feature_status``.
        * ``tesseract_path``: resolved Tesseract executable or None.
        * ``tesseract_cmd``: stripped ``TESSERACT_CMD`` value or None.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    use_ai_profile = _env_bool("USE_AI_PROFILE_GENERATOR", default=False)
    local_llm_enabled = bool(os.getenv("LOCAL_LLM_BASE_URL", "").strip())

    # Strict mode used to be "variable is non-empty", which made
    # AI_FEATURES_STRICT=0 / false switch it ON. Explicit "off" spellings are
    # now honoured; any other non-blank value still enables strict mode.
    strict_raw = os.getenv("AI_FEATURES_STRICT", "").strip().lower()
    strict = strict_raw not in {"", "0", "false", "no", "off"}

    deps = {
        "fitz": _has_module("fitz"),
        "pdfplumber": _has_module("pdfplumber"),
        "pytesseract": _has_module("pytesseract"),
        "pillow": _has_module("PIL"),
        "transformers": _has_module("transformers"),
        "torch": _has_module("torch"),
        "sentence_transformers": _has_module("sentence_transformers"),
        "faiss": _has_module("faiss"),
        "numpy": _has_module("numpy"),
        "openpyxl": _has_module("openpyxl"),
        "reportlab": _has_module("reportlab"),
        "anthropic": _has_module("anthropic"),
    }

    tesseract_path = _resolve_tesseract_path()
    deps["tesseract_binary"] = bool(tesseract_path)

    # Presence flags only; the secret values never enter the report.
    api_keys = {
        "ANTHROPIC_API_KEY": _env_set("ANTHROPIC_API_KEY"),
        "OPENAI_API_KEY": _env_set("OPENAI_API_KEY"),
        "HUGGINGFACE_API_KEY": _env_set("HUGGINGFACE_API_KEY"),
        "HF_TOKEN_CHATBOT": _env_set("HF_TOKEN_CHATBOT"),
        "LOCAL_LLM_BASE_URL": _env_set("LOCAL_LLM_BASE_URL"),
    }

    features = {
        "cv_text_extraction": _feature_status(
            required={"fitz": deps["fitz"]},
            optional={"pdfplumber": deps["pdfplumber"]},
            notes="PyMuPDF is required for PDF text extraction.",
        ),
        "cv_ocr": _feature_status(
            required={
                "fitz": deps["fitz"],
                "pytesseract": deps["pytesseract"],
                "pillow": deps["pillow"],
                "tesseract_binary": deps["tesseract_binary"],
            },
            notes="OCR requires the Tesseract binary and PIL.",
        ),
        "ner_hf": _feature_status(
            required={"transformers": deps["transformers"], "torch": deps["torch"]},
            notes="If missing, regex-based NER is still available.",
        ),
        "semantic_matching": _feature_status(
            required={
                "sentence_transformers": deps["sentence_transformers"],
                "numpy": deps["numpy"],
                "torch": deps["torch"],
            },
            optional={"faiss": deps["faiss"]},
            notes="If missing, matching falls back to heuristic scoring.",
        ),
        "export": _feature_status(
            required={"openpyxl": deps["openpyxl"], "reportlab": deps["reportlab"]},
            notes="If missing, export endpoints are disabled.",
        ),
        # Any single configured provider is enough for chat.
        "chat_llm": _feature_status(
            required={
                "llm_provider": (
                    api_keys["ANTHROPIC_API_KEY"]
                    or api_keys["HF_TOKEN_CHATBOT"]
                    or api_keys["LOCAL_LLM_BASE_URL"]
                )
            },
            notes="Disponible si un provider LLM est configure (Anthropic, HF Inference, ou LLM local). Sinon, reponses deterministes.",
        ),
        # With the AI generator switched off there are no requirements, so
        # the feature reports "ok".
        "profile_generator": _feature_status(
            required={"transformers": deps["transformers"], "torch": deps["torch"]}
            if use_ai_profile
            else {},
            notes="If disabled or missing deps, rule-based profile generation is used.",
        ),
    }

    # datetime.utcnow() is deprecated since Python 3.12; dropping tzinfo from
    # an aware "now" yields the exact same naive-UTC string as before.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    return {
        "timestamp": now_utc.isoformat() + "Z",
        "strict": strict,
        "required_features": _parse_required_features(),
        "dependencies": deps,
        "api_keys": api_keys,
        "flags": {
            "USE_AI_PROFILE_GENERATOR": use_ai_profile,
            "LOCAL_LLM_ENABLED": local_llm_enabled,
        },
        "features": features,
        "tesseract_path": tesseract_path,
        "tesseract_cmd": os.getenv("TESSERACT_CMD", "").strip() or None,
    }


def get_capabilities(force_refresh: bool = False) -> Dict[str, object]:
    """Return the capability report, running detection only when needed.

    Args:
        force_refresh: When True, re-run detection even if a report is
            already cached.

    Returns:
        The module-level cached report; the same dict object is handed out
        on every call until the next refresh.
    """
    global _CAPABILITIES_CACHE
    needs_detection = force_refresh or _CAPABILITIES_CACHE is None
    if needs_detection:
        _CAPABILITIES_CACHE = detect_capabilities()
    return _CAPABILITIES_CACHE


def log_capabilities_summary(capabilities: Optional[Dict[str, object]] = None) -> Dict[str, object]:
    """Log a one-line status tally plus a warning per non-ok feature.

    Args:
        capabilities: Report to summarize; when omitted or empty the cached
            report from ``get_capabilities()`` is used.

    Returns:
        The report that was summarized.
    """
    cap = capabilities if capabilities else get_capabilities()
    features = cap.get("features", {})

    # Seed the three known statuses so each always has a count to log.
    tally = dict.fromkeys(("ok", "degraded", "missing"), 0)
    for info in features.values():
        label = str(info.get("status", "missing"))
        tally[label] = tally.get(label, 0) + 1

    logger.info(
        "AI capabilities: ok=%s degraded=%s missing=%s",
        tally["ok"],
        tally["degraded"],
        tally["missing"],
    )

    # Feature names are unique, so sorting the keys gives the same order as
    # sorting the (name, detail) pairs.
    for feature_name in sorted(features):
        info = features[feature_name]
        state = info.get("status")
        if state != "ok":
            logger.warning(
                "AI capability %s: %s (required_missing=%s optional_missing=%s)",
                feature_name,
                state,
                info.get("required_missing"),
                info.get("optional_missing"),
            )

    return cap


def assert_required_features(capabilities: Optional[Dict[str, object]] = None) -> None:
    """Fail fast when strict mode is on and a required feature is missing.

    Does nothing unless the report's ``strict`` flag is truthy. If no
    required features are listed in the report, the module defaults are
    enforced and a warning is logged.

    Args:
        capabilities: Report to check; when omitted or empty the cached
            report from ``get_capabilities()`` is used.

    Raises:
        RuntimeError: At least one required feature is not available.
    """
    cap = capabilities if capabilities else get_capabilities()
    if not cap.get("strict"):
        return

    wanted = list(cap.get("required_features") or [])
    if len(wanted) == 0:
        wanted = list(_DEFAULT_REQUIRED_FEATURES)
        logger.warning(
            "AI_FEATURES_STRICT is enabled without AI_FEATURES_REQUIRED; using defaults: %s",
            ", ".join(wanted),
        )

    known = cap.get("features", {})
    unavailable = []
    for feature_name in wanted:
        # Names absent from the report count as unavailable too.
        if not known.get(feature_name, {}).get("available"):
            unavailable.append(feature_name)

    if unavailable:
        logger.error("Missing required AI features: %s", ", ".join(unavailable))
        raise RuntimeError(f"Missing required AI features: {', '.join(unavailable)}")