Spaces:
Running
Running
SHAFI committed on
Commit ·
8ab5f69
1
Parent(s): 0132d3b
improved text extraction and chunking
Browse files
- backend.py +129 -19
- chunking_engine.py +135 -0
- evaluator_api.py +10 -0
- file_handlers/universal_parser.py +28 -3
backend.py
CHANGED
|
@@ -196,36 +196,52 @@ class RegexClassifier:
|
|
| 196 |
"""
|
| 197 |
Run selected models on text IN PARALLEL using a thread pool.
|
| 198 |
Each model gets its own timeout. Timed-out / failed models return
|
| 199 |
-
{"detections": [], "error": "...", "timed_out": True}
|
| 200 |
-
caller always gets partial results rather than a total failure.
|
| 201 |
-
|
| 202 |
-
Returns:
|
| 203 |
-
Dict[model_key -> {"detections": List[dict], "error": str|None, "timed_out": bool}]
|
| 204 |
"""
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
| 206 |
"regex": lambda t: self.regex_scanner.scan(t),
|
| 207 |
"nltk": lambda t: self.scan_with_nltk(t),
|
| 208 |
"spacy": lambda t: self.spacy_analyzer.scan(t),
|
| 209 |
"presidio": lambda t: self.presidio_analyzer.scan(t),
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
}
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
def _run_one(key: str):
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
| 216 |
try:
|
| 217 |
-
if key in
|
| 218 |
-
detections =
|
|
|
|
|
|
|
| 219 |
else:
|
| 220 |
model = self._get_lazy_model(key)
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
| 222 |
return key, {"detections": detections, "error": None, "timed_out": False}
|
| 223 |
except Exception as e:
|
| 224 |
print(f"[scan_with_models] Error in '{key}': {e}")
|
| 225 |
return key, {"detections": [], "error": str(e), "timed_out": False}
|
| 226 |
|
| 227 |
results: Dict[str, Any] = {}
|
| 228 |
-
# Use min(len(model_keys), 4) workers β HF Spaces free tier has ~2 vCPUs
|
| 229 |
max_workers = min(len(model_keys), 4)
|
| 230 |
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
|
| 231 |
future_map = {pool.submit(_run_one, key): key for key in model_keys}
|
|
@@ -295,20 +311,22 @@ class RegexClassifier:
|
|
| 295 |
|
| 296 |
all_matches = []
|
| 297 |
|
|
|
|
|
|
|
| 298 |
# --- Always-On models (no lazy loading needed) ---
|
| 299 |
if "regex" in selected_models: all_matches.extend(self.regex_scanner.scan(text))
|
| 300 |
if "nltk" in selected_models: all_matches.extend(self.scan_with_nltk(text))
|
| 301 |
if "spacy" in selected_models: all_matches.extend(self.spacy_analyzer.scan(text))
|
| 302 |
if "presidio" in selected_models: all_matches.extend(self.presidio_analyzer.scan(text))
|
| 303 |
-
if "gliner" in selected_models: all_matches.extend(self.gliner_analyzer.scan
|
| 304 |
-
if "deberta" in selected_models: all_matches.extend(self.deberta_analyzer.scan
|
| 305 |
|
| 306 |
# --- Lazy-loaded models (instantiated on first use) ---
|
| 307 |
for lazy_key in ["pasteproof", "piiranha", "nvidia_gliner", "mmbert"]:
|
| 308 |
if lazy_key in selected_models:
|
| 309 |
model = self._get_lazy_model(lazy_key)
|
| 310 |
if model:
|
| 311 |
-
all_matches.extend(model.scan
|
| 312 |
|
| 313 |
# Sort and Deduplicate by span overlap (keep longest match)
|
| 314 |
all_matches.sort(key=lambda x: x['start'])
|
|
@@ -334,24 +352,116 @@ class RegexClassifier:
|
|
| 334 |
if not selected_models:
|
| 335 |
selected_models = list(self._ALWAYS_ON)
|
| 336 |
|
|
|
|
|
|
|
| 337 |
model_results: Dict[str, list] = {}
|
| 338 |
|
| 339 |
if "regex" in selected_models: model_results["π οΈ Regex"] = self.regex_scanner.scan(text)
|
| 340 |
if "nltk" in selected_models: model_results["π§ NLTK"] = self.scan_with_nltk(text)
|
| 341 |
if "spacy" in selected_models: model_results["π€ SpaCy"] = self.spacy_analyzer.scan(text)
|
| 342 |
if "presidio" in selected_models: model_results["π‘οΈ Presidio"] = self.presidio_analyzer.scan(text)
|
| 343 |
-
if "gliner" in selected_models: model_results["π¦
GLiNER"] = self.gliner_analyzer.scan
|
| 344 |
-
if "deberta" in selected_models: model_results["π DeBERTa"] = self.deberta_analyzer.scan
|
| 345 |
|
| 346 |
for lazy_key, label in [("pasteproof", "π Pasteproof"), ("piiranha", "π Piiranha"),
|
| 347 |
("nvidia_gliner", "β‘ NVIDIA-GLiNER"), ("mmbert", "π mmbert32k")]:
|
| 348 |
if lazy_key in selected_models:
|
| 349 |
m = self._get_lazy_model(lazy_key)
|
| 350 |
if m:
|
| 351 |
-
model_results[label] = m.scan
|
| 352 |
|
| 353 |
return self.inspector.compare_models_dynamic(model_results)
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
# --- WRAPPERS FOR UI ---
|
| 356 |
def get_json_data(self, file_obj) -> pd.DataFrame:
|
| 357 |
return self.json_handler.read_file(file_obj)
|
|
|
|
| 196 |
"""
|
| 197 |
Run selected models on text IN PARALLEL using a thread pool.
|
| 198 |
Each model gets its own timeout. Timed-out / failed models return
|
| 199 |
+
{"detections": [], "error": "...", "timed_out": True}.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
"""
|
| 201 |
+
from chunking_engine import run_model_with_chunking
|
| 202 |
+
|
| 203 |
+
# Models that do not need chunking (they process the entire string at once)
|
| 204 |
+
always_on_no_chunk = {
|
| 205 |
"regex": lambda t: self.regex_scanner.scan(t),
|
| 206 |
"nltk": lambda t: self.scan_with_nltk(t),
|
| 207 |
"spacy": lambda t: self.spacy_analyzer.scan(t),
|
| 208 |
"presidio": lambda t: self.presidio_analyzer.scan(t),
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
# Models that DO need chunking due to token limits
|
| 212 |
+
always_on_chunked = {
|
| 213 |
+
"gliner": lambda t: run_model_with_chunking(self.gliner_analyzer.scan, t),
|
| 214 |
+
"deberta": lambda t: run_model_with_chunking(self.deberta_analyzer.scan, t),
|
| 215 |
}
|
| 216 |
|
| 217 |
+
# Handle the special 'ensemble' key
|
| 218 |
+
if "ensemble" in model_keys:
|
| 219 |
+
# We don't parallelize ensemble within the pool because it internally calls other models
|
| 220 |
+
# We will handle it separately below or let it run synchronously if it's the only one.
|
| 221 |
+
pass
|
| 222 |
+
|
| 223 |
def _run_one(key: str):
|
| 224 |
+
if key == "ensemble":
|
| 225 |
+
return key, {"detections": self.run_weighted_ensemble(text), "error": None, "timed_out": False}
|
| 226 |
+
|
| 227 |
+
timeout = self._ALWAYS_ON_TIMEOUT if (key in always_on_no_chunk or key in always_on_chunked) else self._LAZY_MODEL_TIMEOUT
|
| 228 |
try:
|
| 229 |
+
if key in always_on_no_chunk:
|
| 230 |
+
detections = always_on_no_chunk[key](text)
|
| 231 |
+
elif key in always_on_chunked:
|
| 232 |
+
detections = always_on_chunked[key](text)
|
| 233 |
else:
|
| 234 |
model = self._get_lazy_model(key)
|
| 235 |
+
if model:
|
| 236 |
+
detections = run_model_with_chunking(model.scan, text)
|
| 237 |
+
else:
|
| 238 |
+
detections = []
|
| 239 |
return key, {"detections": detections, "error": None, "timed_out": False}
|
| 240 |
except Exception as e:
|
| 241 |
print(f"[scan_with_models] Error in '{key}': {e}")
|
| 242 |
return key, {"detections": [], "error": str(e), "timed_out": False}
|
| 243 |
|
| 244 |
results: Dict[str, Any] = {}
|
|
|
|
| 245 |
max_workers = min(len(model_keys), 4)
|
| 246 |
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
|
| 247 |
future_map = {pool.submit(_run_one, key): key for key in model_keys}
|
|
|
|
| 311 |
|
| 312 |
all_matches = []
|
| 313 |
|
| 314 |
+
from chunking_engine import run_model_with_chunking
|
| 315 |
+
|
| 316 |
# --- Always-On models (no lazy loading needed) ---
|
| 317 |
if "regex" in selected_models: all_matches.extend(self.regex_scanner.scan(text))
|
| 318 |
if "nltk" in selected_models: all_matches.extend(self.scan_with_nltk(text))
|
| 319 |
if "spacy" in selected_models: all_matches.extend(self.spacy_analyzer.scan(text))
|
| 320 |
if "presidio" in selected_models: all_matches.extend(self.presidio_analyzer.scan(text))
|
| 321 |
+
if "gliner" in selected_models: all_matches.extend(run_model_with_chunking(self.gliner_analyzer.scan, text))
|
| 322 |
+
if "deberta" in selected_models: all_matches.extend(run_model_with_chunking(self.deberta_analyzer.scan, text))
|
| 323 |
|
| 324 |
# --- Lazy-loaded models (instantiated on first use) ---
|
| 325 |
for lazy_key in ["pasteproof", "piiranha", "nvidia_gliner", "mmbert"]:
|
| 326 |
if lazy_key in selected_models:
|
| 327 |
model = self._get_lazy_model(lazy_key)
|
| 328 |
if model:
|
| 329 |
+
all_matches.extend(run_model_with_chunking(model.scan, text))
|
| 330 |
|
| 331 |
# Sort and Deduplicate by span overlap (keep longest match)
|
| 332 |
all_matches.sort(key=lambda x: x['start'])
|
|
|
|
| 352 |
if not selected_models:
|
| 353 |
selected_models = list(self._ALWAYS_ON)
|
| 354 |
|
| 355 |
+
from chunking_engine import run_model_with_chunking
|
| 356 |
+
|
| 357 |
model_results: Dict[str, list] = {}
|
| 358 |
|
| 359 |
if "regex" in selected_models: model_results["π οΈ Regex"] = self.regex_scanner.scan(text)
|
| 360 |
if "nltk" in selected_models: model_results["π§ NLTK"] = self.scan_with_nltk(text)
|
| 361 |
if "spacy" in selected_models: model_results["π€ SpaCy"] = self.spacy_analyzer.scan(text)
|
| 362 |
if "presidio" in selected_models: model_results["π‘οΈ Presidio"] = self.presidio_analyzer.scan(text)
|
| 363 |
+
if "gliner" in selected_models: model_results["π¦
GLiNER"] = run_model_with_chunking(self.gliner_analyzer.scan, text)
|
| 364 |
+
if "deberta" in selected_models: model_results["π DeBERTa"] = run_model_with_chunking(self.deberta_analyzer.scan, text)
|
| 365 |
|
| 366 |
for lazy_key, label in [("pasteproof", "π Pasteproof"), ("piiranha", "π Piiranha"),
|
| 367 |
("nvidia_gliner", "β‘ NVIDIA-GLiNER"), ("mmbert", "π mmbert32k")]:
|
| 368 |
if lazy_key in selected_models:
|
| 369 |
m = self._get_lazy_model(lazy_key)
|
| 370 |
if m:
|
| 371 |
+
model_results[label] = run_model_with_chunking(m.scan, text)
|
| 372 |
|
| 373 |
return self.inspector.compare_models_dynamic(model_results)
|
| 374 |
|
| 375 |
+
def run_weighted_ensemble(self, text: str) -> List[dict]:
    """
    Run the weighted ensemble over ``text`` and return one detection per conflict.

    Five scanners are run (Regex, Presidio, DeBERTa, GLiNER, SpaCy); the two
    deep-learning scanners go through ``run_model_with_chunking``.  Every
    detection is tagged with a fixed trust ``weight`` and an ``"Ensemble (...)"``
    ``source``.  Detections whose character spans overlap are clustered; within
    a cluster the label with the largest summed weight wins, and the surviving
    detection is stretched to cover every span carrying that label.

    Args:
        text: The document to scan.

    Returns:
        Detection dicts (at least ``start``, ``end``, ``label``, ``weight``,
        ``source``) after a final IoU de-duplication pass; ``[]`` when no
        scanner reports anything.

    Raises:
        Whatever an un-chunked scanner (regex / presidio / spacy) raises;
        those calls are not guarded here.
    """
    from chunking_engine import run_model_with_chunking, deduplicate_overlapping_entities

    # (source tag, trust weight, runner) in the order the models are executed.
    # The order matters: the sort below is stable, so it breaks ties on "start".
    weighted_models = [
        ("Ensemble (Regex)", 1.0, lambda: self.regex_scanner.scan(text)),
        ("Ensemble (Presidio)", 0.95, lambda: self.presidio_analyzer.scan(text)),
        ("Ensemble (DeBERTa)", 0.85,
         lambda: run_model_with_chunking(self.deberta_analyzer.scan, text)),
        ("Ensemble (GLiNER)", 0.75,
         lambda: run_model_with_chunking(self.gliner_analyzer.scan, text)),
        ("Ensemble (SpaCy)", 0.5, lambda: self.spacy_analyzer.scan(text)),
    ]

    # 1. Collect detections. Each one is copied before tagging so the dicts
    #    handed back by the scanners are never modified behind their back.
    raw_detections = []
    for source, weight, run_model in weighted_models:
        for det in run_model():
            tagged = dict(det)
            tagged["weight"] = weight
            tagged["source"] = source
            raw_detections.append(tagged)

    if not raw_detections:
        return []

    # 2. Cluster overlapping detections with a single sweep over start offsets.
    raw_detections.sort(key=lambda d: d["start"])

    clusters = []
    current_cluster = [raw_detections[0]]
    cluster_end = raw_detections[0]["end"]  # running max "end" of the cluster
    for det in raw_detections[1:]:
        # Spans are half-open (the text is sliced as text[start:end] below),
        # so a span that merely touches the cluster's end does not overlap it.
        if det["start"] < cluster_end:
            current_cluster.append(det)
            cluster_end = max(cluster_end, det["end"])
        else:
            clusters.append(current_cluster)
            current_cluster = [det]
            cluster_end = det["end"]
    clusters.append(current_cluster)

    # 3. Resolve conflicts within each cluster.
    final_detections = []
    for cluster in clusters:
        if len(cluster) == 1:
            final_detections.append(cluster[0])
            continue

        # Sum the trust weights per label; the heaviest label wins.
        label_weights = {}
        for det in cluster:
            label_weights[det["label"]] = label_weights.get(det["label"], 0) + det["weight"]
        winning_label = max(label_weights, key=label_weights.get)

        # The winning label was taken from this cluster, so this is never empty.
        candidates = [det for det in cluster if det["label"] == winning_label]
        best_det = max(candidates, key=lambda d: d["weight"])

        # Expand the winner to the union of all spans that share its label.
        best_det["start"] = min(det["start"] for det in candidates)
        best_det["end"] = max(det["end"] for det in candidates)
        best_det["text"] = text[best_det["start"]:best_det["end"]]

        final_detections.append(best_det)

    # 4. Final IoU de-duplication to clean up any remaining sloppy edges.
    # NOTE(review): the helper ranks by the "score" key, not by the "weight"
    # assigned above -- confirm that is the intended tie-breaker.
    return deduplicate_overlapping_entities(final_detections, iou_threshold=0.3)
|
| 463 |
+
|
| 464 |
+
|
| 465 |
# --- WRAPPERS FOR UI ---
|
| 466 |
def get_json_data(self, file_obj) -> pd.DataFrame:
|
| 467 |
return self.json_handler.read_file(file_obj)
|
chunking_engine.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Segmento Chunker
|
| 3 |
+
Implements Presidio's sliding-window chunking logic for deep learning NLP models.
|
| 4 |
+
Prevents the 512-token limit truncation and deduplicates overlapping entities using IoU.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import List, Dict, Any, Callable, Tuple
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger("segmento.chunker")
|
| 11 |
+
|
| 12 |
+
class TextChunk:
    """A slice of a document together with its absolute character offsets."""

    def __init__(self, text: str, start: int, end: int):
        self.text = text    # the chunk's characters, i.e. document[start:end]
        self.start = start  # inclusive offset into the original document
        self.end = end      # exclusive offset into the original document

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(start={self.start}, end={self.end}, "
            f"text={self.text!r})"
        )


class SegmentoChunker:
    """Character-based text chunker with word boundary preservation."""

    def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 200):
        """
        Args:
            chunk_size: Target chunk length in characters; must be positive.
            chunk_overlap: Characters shared between consecutive chunks;
                must not be negative.

        Raises:
            ValueError: If ``chunk_size <= 0`` or ``chunk_overlap < 0``.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap < 0:
            # A negative overlap would silently skip text between chunks.
            raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.boundary_chars = (" ", "\n", "\t")

    def chunk(self, text: str) -> List[TextChunk]:
        """
        Split ``text`` into overlapping chunks, extending each chunk to the
        next word boundary so a word is never cut at the chunk's right edge.

        Returns:
            Chunks in document order; ``[]`` for empty input.  Each chunk
            satisfies ``chunk.text == text[chunk.start:chunk.end]``.
        """
        if not text:
            return []

        text_len = len(text)
        chunks = []
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # Extend to the end of the current word.
            while end < text_len and text[end] not in self.boundary_chars:
                end += 1

            chunks.append(TextChunk(text=text[start:end], start=start, end=end))

            if end >= text_len:
                break

            # Slide the window back by the overlap.  When the overlap is as
            # large as the chunk just produced, that would not move forward
            # (and the loop would never end), so continue without overlap.
            next_start = end - self.chunk_overlap
            start = next_start if next_start > start else end

        return chunks
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _calculate_iou(start1: int, end1: int, start2: int, end2: int) -> float:
|
| 54 |
+
"""Calculate Intersection-over-Union for two text spans."""
|
| 55 |
+
intersection = max(0, min(end1, end2) - max(start1, start2))
|
| 56 |
+
if intersection == 0:
|
| 57 |
+
return 0.0
|
| 58 |
+
union = (end1 - start1) + (end2 - start2) - intersection
|
| 59 |
+
return intersection / union if union > 0 else 0.0
|
| 60 |
+
|
| 61 |
+
def deduplicate_overlapping_entities(entities: List[Dict[str, Any]], iou_threshold: float = 0.5) -> List[Dict[str, Any]]:
|
| 62 |
+
"""
|
| 63 |
+
Remove duplicate entities that overlap across chunks.
|
| 64 |
+
Keeps the entity with the highest score.
|
| 65 |
+
"""
|
| 66 |
+
if not entities:
|
| 67 |
+
return []
|
| 68 |
+
|
| 69 |
+
# Sort by score descending so we keep the highest confidence matches
|
| 70 |
+
# If score is missing, default to 1.0
|
| 71 |
+
entities = sorted(entities, key=lambda x: x.get('score', 1.0), reverse=True)
|
| 72 |
+
|
| 73 |
+
unique_entities = []
|
| 74 |
+
|
| 75 |
+
for entity in entities:
|
| 76 |
+
is_duplicate = False
|
| 77 |
+
for unique_ent in unique_entities:
|
| 78 |
+
# If same label and high IoU overlap
|
| 79 |
+
if entity.get("label") == unique_ent.get("label"):
|
| 80 |
+
iou = _calculate_iou(
|
| 81 |
+
entity["start"], entity["end"],
|
| 82 |
+
unique_ent["start"], unique_ent["end"]
|
| 83 |
+
)
|
| 84 |
+
if iou >= iou_threshold:
|
| 85 |
+
is_duplicate = True
|
| 86 |
+
break
|
| 87 |
+
|
| 88 |
+
if not is_duplicate:
|
| 89 |
+
unique_entities.append(entity)
|
| 90 |
+
|
| 91 |
+
# Sort back by original text position
|
| 92 |
+
unique_entities = sorted(unique_entities, key=lambda x: x["start"])
|
| 93 |
+
return unique_entities
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def run_model_with_chunking(
    scan_func: Callable[[str], List[Dict[str, Any]]],
    text: str,
    chunk_size: int = 2000,
    overlap: int = 200
) -> List[Dict[str, Any]]:
    """
    Pass large text through a scanner chunk by chunk.

    The text is split with ``SegmentoChunker``, ``scan_func`` is run on every
    chunk, the chunk-local ``start`` / ``end`` offsets of each result are
    shifted to absolute document offsets, and duplicates produced by the
    overlapping windows are removed.

    Args:
        scan_func: Callable taking a string and returning detection dicts
            that contain ``start`` and ``end``.
        text: The full document.
        chunk_size: Target chunk length in characters.
        overlap: Characters shared by consecutive chunks.

    Returns:
        De-duplicated detections with absolute offsets; ``[]`` for empty text.
        A chunk whose scan fails is logged and skipped, so the result may be
        partial rather than raising.
    """
    if not text:
        return []

    chunker = SegmentoChunker(chunk_size=chunk_size, chunk_overlap=overlap)

    all_detections = []
    for chunk in chunker.chunk(text):
        try:
            for res in scan_func(chunk.text):
                # Copy so the scanner's own dict is untouched, then map the
                # chunk-local offsets to absolute document offsets.
                mapped_res = res.copy()
                mapped_res["start"] = res["start"] + chunk.start
                mapped_res["end"] = res["end"] + chunk.start
                all_detections.append(mapped_res)
        except Exception:
            # Best effort: keep what the other chunks produced.  The traceback
            # and the chunk's location are logged so the failure is traceable.
            logger.exception(
                "Error scanning chunk [%d:%d]", chunk.start, chunk.end
            )
            continue

    # Deduplicate overlapping entities caused by the sliding window.
    return deduplicate_overlapping_entities(all_detections, iou_threshold=0.5)
|
evaluator_api.py
CHANGED
|
@@ -39,6 +39,16 @@ def setup(classifier_instance):
|
|
| 39 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
|
| 41 |
MODEL_CATALOGUE = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
{
|
| 43 |
"key": "regex",
|
| 44 |
"label": "π οΈ Regex Engine",
|
|
|
|
| 39 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
|
| 41 |
MODEL_CATALOGUE = [
|
| 42 |
+
{
|
| 43 |
+
"key": "ensemble",
|
| 44 |
+
"label": "π Ensemble (God Algorithm)",
|
| 45 |
+
"hf_id": "hybrid",
|
| 46 |
+
"type": "God Algorithm",
|
| 47 |
+
"params": "IoU",
|
| 48 |
+
"f1_benchmark": 0.99,
|
| 49 |
+
"lazy": False,
|
| 50 |
+
"description": "Aggregates all models using sliding windows and weighted IoU deduplication.",
|
| 51 |
+
},
|
| 52 |
{
|
| 53 |
"key": "regex",
|
| 54 |
"label": "π οΈ Regex Engine",
|
file_handlers/universal_parser.py
CHANGED
|
@@ -59,6 +59,29 @@ CATEGORY_FILE_TYPES = {
|
|
| 59 |
],
|
| 60 |
}
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
def parse_file(file_bytes: bytes, file_type: str) -> str:
|
| 64 |
"""
|
|
@@ -77,11 +100,13 @@ def parse_file(file_bytes: bytes, file_type: str) -> str:
|
|
| 77 |
if parser_fn is None:
|
| 78 |
# Best-effort: try UTF-8 text decode
|
| 79 |
try:
|
| 80 |
-
|
| 81 |
except Exception:
|
| 82 |
return f"[No parser for file type: {file_type}]"
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
|
| 87 |
def get_all_categories() -> dict:
|
|
|
|
| 59 |
],
|
| 60 |
}
|
| 61 |
|
| 62 |
+
import re
|
| 63 |
+
|
| 64 |
+
def normalize_text(text: str) -> str:
    """
    Clean extracted text to improve NLP processing accuracy.

    - Removes zero-width characters and invisible control characters
      (newline, tab and carriage return are kept at this stage).
    - Collapses runs of spaces/tabs into a single space (newlines are kept).
    - Repairs sentences broken by PDF layout extraction: a newline that
      directly follows a lowercase letter and directly precedes a letter is
      replaced by a space.

    Args:
        text: Raw extracted text; may be empty or ``None``-like.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or ``""``
        for falsy input.
    """
    if not text:
        return ""

    # 1. Remove zero-width characters and non-printable control chars (excluding \n, \t)
    text = re.sub(r'[\u200b\u200c\u200d\uFEFF\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    # 2. Collapse multiple spaces/tabs into a single space
    text = re.sub(r'[ \t]+', ' ', text)

    # 3. Repair broken PDF lines.  Lookarounds are used so the neighbouring
    #    letters are not consumed by the match; otherwise the letter after one
    #    newline could not also serve as the letter before the next one, and
    #    "a\nb\nc" would only be half repaired.
    text = re.sub(r'(?<=[a-z])\n(?=[A-Za-z])', ' ', text)

    return text.strip()
|
| 85 |
|
| 86 |
def parse_file(file_bytes: bytes, file_type: str) -> str:
|
| 87 |
"""
|
|
|
|
| 100 |
if parser_fn is None:
|
| 101 |
# Best-effort: try UTF-8 text decode
|
| 102 |
try:
|
| 103 |
+
raw_text = file_bytes.decode("utf-8", errors="replace")
|
| 104 |
except Exception:
|
| 105 |
return f"[No parser for file type: {file_type}]"
|
| 106 |
+
else:
|
| 107 |
+
raw_text = parser_fn(file_bytes)
|
| 108 |
+
|
| 109 |
+
return normalize_text(raw_text)
|
| 110 |
|
| 111 |
|
| 112 |
def get_all_categories() -> dict:
|