SHAFI commited on
Commit
8ab5f69
Β·
1 Parent(s): 0132d3b

improved text extraction and chunking

Browse files
backend.py CHANGED
@@ -196,36 +196,52 @@ class RegexClassifier:
196
  """
197
  Run selected models on text IN PARALLEL using a thread pool.
198
  Each model gets its own timeout. Timed-out / failed models return
199
- {"detections": [], "error": "...", "timed_out": True} so the
200
- caller always gets partial results rather than a total failure.
201
-
202
- Returns:
203
- Dict[model_key -> {"detections": List[dict], "error": str|None, "timed_out": bool}]
204
  """
205
- always_on = {
 
 
 
206
  "regex": lambda t: self.regex_scanner.scan(t),
207
  "nltk": lambda t: self.scan_with_nltk(t),
208
  "spacy": lambda t: self.spacy_analyzer.scan(t),
209
  "presidio": lambda t: self.presidio_analyzer.scan(t),
210
- "gliner": lambda t: self.gliner_analyzer.scan(t),
211
- "deberta": lambda t: self.deberta_analyzer.scan(t),
 
 
 
 
212
  }
213
 
 
 
 
 
 
 
214
  def _run_one(key: str):
215
- timeout = self._ALWAYS_ON_TIMEOUT if key in always_on else self._LAZY_MODEL_TIMEOUT
 
 
 
216
  try:
217
- if key in always_on:
218
- detections = always_on[key](text)
 
 
219
  else:
220
  model = self._get_lazy_model(key)
221
- detections = model.scan(text) if model else []
 
 
 
222
  return key, {"detections": detections, "error": None, "timed_out": False}
223
  except Exception as e:
224
  print(f"[scan_with_models] Error in '{key}': {e}")
225
  return key, {"detections": [], "error": str(e), "timed_out": False}
226
 
227
  results: Dict[str, Any] = {}
228
- # Use min(len(model_keys), 4) workers β€” HF Spaces free tier has ~2 vCPUs
229
  max_workers = min(len(model_keys), 4)
230
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
231
  future_map = {pool.submit(_run_one, key): key for key in model_keys}
@@ -295,20 +311,22 @@ class RegexClassifier:
295
 
296
  all_matches = []
297
 
 
 
298
  # --- Always-On models (no lazy loading needed) ---
299
  if "regex" in selected_models: all_matches.extend(self.regex_scanner.scan(text))
300
  if "nltk" in selected_models: all_matches.extend(self.scan_with_nltk(text))
301
  if "spacy" in selected_models: all_matches.extend(self.spacy_analyzer.scan(text))
302
  if "presidio" in selected_models: all_matches.extend(self.presidio_analyzer.scan(text))
303
- if "gliner" in selected_models: all_matches.extend(self.gliner_analyzer.scan(text))
304
- if "deberta" in selected_models: all_matches.extend(self.deberta_analyzer.scan(text))
305
 
306
  # --- Lazy-loaded models (instantiated on first use) ---
307
  for lazy_key in ["pasteproof", "piiranha", "nvidia_gliner", "mmbert"]:
308
  if lazy_key in selected_models:
309
  model = self._get_lazy_model(lazy_key)
310
  if model:
311
- all_matches.extend(model.scan(text))
312
 
313
  # Sort and Deduplicate by span overlap (keep longest match)
314
  all_matches.sort(key=lambda x: x['start'])
@@ -334,24 +352,116 @@ class RegexClassifier:
334
  if not selected_models:
335
  selected_models = list(self._ALWAYS_ON)
336
 
 
 
337
  model_results: Dict[str, list] = {}
338
 
339
  if "regex" in selected_models: model_results["πŸ› οΈ Regex"] = self.regex_scanner.scan(text)
340
  if "nltk" in selected_models: model_results["🧠 NLTK"] = self.scan_with_nltk(text)
341
  if "spacy" in selected_models: model_results["πŸ€– SpaCy"] = self.spacy_analyzer.scan(text)
342
  if "presidio" in selected_models: model_results["πŸ›‘οΈ Presidio"] = self.presidio_analyzer.scan(text)
343
- if "gliner" in selected_models: model_results["πŸ¦… GLiNER"] = self.gliner_analyzer.scan(text)
344
- if "deberta" in selected_models: model_results["πŸš€ DeBERTa"] = self.deberta_analyzer.scan(text)
345
 
346
  for lazy_key, label in [("pasteproof", "πŸ“‹ Pasteproof"), ("piiranha", "🐟 Piiranha"),
347
  ("nvidia_gliner", "⚑ NVIDIA-GLiNER"), ("mmbert", "🌐 mmbert32k")]:
348
  if lazy_key in selected_models:
349
  m = self._get_lazy_model(lazy_key)
350
  if m:
351
- model_results[label] = m.scan(text)
352
 
353
  return self.inspector.compare_models_dynamic(model_results)
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  # --- WRAPPERS FOR UI ---
356
  def get_json_data(self, file_obj) -> pd.DataFrame:
357
  return self.json_handler.read_file(file_obj)
 
196
  """
197
  Run selected models on text IN PARALLEL using a thread pool.
198
  Each model gets its own timeout. Timed-out / failed models return
199
+ {"detections": [], "error": "...", "timed_out": True}.
 
 
 
 
200
  """
201
+ from chunking_engine import run_model_with_chunking
202
+
203
+ # Models that do not need chunking (they process the entire string at once)
204
+ always_on_no_chunk = {
205
  "regex": lambda t: self.regex_scanner.scan(t),
206
  "nltk": lambda t: self.scan_with_nltk(t),
207
  "spacy": lambda t: self.spacy_analyzer.scan(t),
208
  "presidio": lambda t: self.presidio_analyzer.scan(t),
209
+ }
210
+
211
+ # Models that DO need chunking due to token limits
212
+ always_on_chunked = {
213
+ "gliner": lambda t: run_model_with_chunking(self.gliner_analyzer.scan, t),
214
+ "deberta": lambda t: run_model_with_chunking(self.deberta_analyzer.scan, t),
215
  }
216
 
217
+ # Handle the special 'ensemble' key
218
+ if "ensemble" in model_keys:
219
+ # We don't parallelize ensemble within the pool because it internally calls other models
220
+ # We will handle it separately below or let it run synchronously if it's the only one.
221
+ pass
222
+
223
  def _run_one(key: str):
224
+ if key == "ensemble":
225
+ return key, {"detections": self.run_weighted_ensemble(text), "error": None, "timed_out": False}
226
+
227
+ timeout = self._ALWAYS_ON_TIMEOUT if (key in always_on_no_chunk or key in always_on_chunked) else self._LAZY_MODEL_TIMEOUT
228
  try:
229
+ if key in always_on_no_chunk:
230
+ detections = always_on_no_chunk[key](text)
231
+ elif key in always_on_chunked:
232
+ detections = always_on_chunked[key](text)
233
  else:
234
  model = self._get_lazy_model(key)
235
+ if model:
236
+ detections = run_model_with_chunking(model.scan, text)
237
+ else:
238
+ detections = []
239
  return key, {"detections": detections, "error": None, "timed_out": False}
240
  except Exception as e:
241
  print(f"[scan_with_models] Error in '{key}': {e}")
242
  return key, {"detections": [], "error": str(e), "timed_out": False}
243
 
244
  results: Dict[str, Any] = {}
 
245
  max_workers = min(len(model_keys), 4)
246
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
247
  future_map = {pool.submit(_run_one, key): key for key in model_keys}
 
311
 
312
  all_matches = []
313
 
314
+ from chunking_engine import run_model_with_chunking
315
+
316
  # --- Always-On models (no lazy loading needed) ---
317
  if "regex" in selected_models: all_matches.extend(self.regex_scanner.scan(text))
318
  if "nltk" in selected_models: all_matches.extend(self.scan_with_nltk(text))
319
  if "spacy" in selected_models: all_matches.extend(self.spacy_analyzer.scan(text))
320
  if "presidio" in selected_models: all_matches.extend(self.presidio_analyzer.scan(text))
321
+ if "gliner" in selected_models: all_matches.extend(run_model_with_chunking(self.gliner_analyzer.scan, text))
322
+ if "deberta" in selected_models: all_matches.extend(run_model_with_chunking(self.deberta_analyzer.scan, text))
323
 
324
  # --- Lazy-loaded models (instantiated on first use) ---
325
  for lazy_key in ["pasteproof", "piiranha", "nvidia_gliner", "mmbert"]:
326
  if lazy_key in selected_models:
327
  model = self._get_lazy_model(lazy_key)
328
  if model:
329
+ all_matches.extend(run_model_with_chunking(model.scan, text))
330
 
331
  # Sort and Deduplicate by span overlap (keep longest match)
332
  all_matches.sort(key=lambda x: x['start'])
 
352
  if not selected_models:
353
  selected_models = list(self._ALWAYS_ON)
354
 
355
+ from chunking_engine import run_model_with_chunking
356
+
357
  model_results: Dict[str, list] = {}
358
 
359
  if "regex" in selected_models: model_results["πŸ› οΈ Regex"] = self.regex_scanner.scan(text)
360
  if "nltk" in selected_models: model_results["🧠 NLTK"] = self.scan_with_nltk(text)
361
  if "spacy" in selected_models: model_results["πŸ€– SpaCy"] = self.spacy_analyzer.scan(text)
362
  if "presidio" in selected_models: model_results["πŸ›‘οΈ Presidio"] = self.presidio_analyzer.scan(text)
363
+ if "gliner" in selected_models: model_results["πŸ¦… GLiNER"] = run_model_with_chunking(self.gliner_analyzer.scan, text)
364
+ if "deberta" in selected_models: model_results["πŸš€ DeBERTa"] = run_model_with_chunking(self.deberta_analyzer.scan, text)
365
 
366
  for lazy_key, label in [("pasteproof", "πŸ“‹ Pasteproof"), ("piiranha", "🐟 Piiranha"),
367
  ("nvidia_gliner", "⚑ NVIDIA-GLiNER"), ("mmbert", "🌐 mmbert32k")]:
368
  if lazy_key in selected_models:
369
  m = self._get_lazy_model(lazy_key)
370
  if m:
371
+ model_results[label] = run_model_with_chunking(m.scan, text)
372
 
373
  return self.inspector.compare_models_dynamic(model_results)
374
 
375
+ def run_weighted_ensemble(self, text: str) -> List[dict]:
376
+ """
377
+ Runs the 'God Algorithm' Weighted Ensemble.
378
+ Combines rule-based (Regex/Presidio) and contextual (DeBERTa/GLiNER) detections,
379
+ weights them, and groups by Intersection-over-Union (IoU) to resolve conflicts.
380
+ """
381
+ from chunking_engine import run_model_with_chunking, deduplicate_overlapping_entities
382
+
383
+ raw_detections = []
384
+
385
+ # 1. Run all models and assign trust weights based on their architecture
386
+
387
+ # Highly trusted deterministic / rule-based models
388
+ for m in self.regex_scanner.scan(text):
389
+ m["weight"] = 1.0; m["source"] = "Ensemble (Regex)"
390
+ raw_detections.append(m)
391
+ for m in self.presidio_analyzer.scan(text):
392
+ m["weight"] = 0.95; m["source"] = "Ensemble (Presidio)"
393
+ raw_detections.append(m)
394
+
395
+ # Context-aware deep learning models (chunked)
396
+ for m in run_model_with_chunking(self.deberta_analyzer.scan, text):
397
+ m["weight"] = 0.85; m["source"] = "Ensemble (DeBERTa)"
398
+ raw_detections.append(m)
399
+ for m in run_model_with_chunking(self.gliner_analyzer.scan, text):
400
+ m["weight"] = 0.75; m["source"] = "Ensemble (GLiNER)"
401
+ raw_detections.append(m)
402
+
403
+ # Baseline statistical models
404
+ for m in self.spacy_analyzer.scan(text):
405
+ m["weight"] = 0.5; m["source"] = "Ensemble (SpaCy)"
406
+ raw_detections.append(m)
407
+
408
+ if not raw_detections:
409
+ return []
410
+
411
+ # 2. Cluster overlapping detections
412
+ # Sort by start coordinate to make grouping easier
413
+ raw_detections.sort(key=lambda x: x["start"])
414
+
415
+ clusters = []
416
+ current_cluster = [raw_detections[0]]
417
+
418
+ for det in raw_detections[1:]:
419
+ # If the current detection overlaps with the active cluster
420
+ # (i.e. start is before the end of the last item in the cluster)
421
+ if det["start"] <= max(x["end"] for x in current_cluster):
422
+ current_cluster.append(det)
423
+ else:
424
+ clusters.append(current_cluster)
425
+ current_cluster = [det]
426
+ clusters.append(current_cluster)
427
+
428
+ # 3. Resolve conflicts within each cluster
429
+ final_detections = []
430
+ for cluster in clusters:
431
+ if len(cluster) == 1:
432
+ final_detections.append(cluster[0])
433
+ continue
434
+
435
+ # Aggregate weights by label
436
+ label_weights = {}
437
+ for det in cluster:
438
+ lbl = det["label"]
439
+ w = det["weight"]
440
+ label_weights[lbl] = label_weights.get(lbl, 0) + w
441
+
442
+ # Pick the winning label
443
+ winning_label = max(label_weights.items(), key=lambda x: x[1])[0]
444
+
445
+ # Find the detection in this cluster that has the winning label and highest individual weight
446
+ candidates = [c for c in cluster if c["label"] == winning_label]
447
+ if not candidates:
448
+ candidates = cluster # fallback, shouldn't happen
449
+
450
+ best_det = max(candidates, key=lambda x: x["weight"])
451
+
452
+ # Optionally, expand boundaries to encompass the maximum matched area
453
+ min_start = min(c["start"] for c in cluster if c["label"] == winning_label)
454
+ max_end = max(c["end"] for c in cluster if c["label"] == winning_label)
455
+ best_det["start"] = min_start
456
+ best_det["end"] = max_end
457
+ best_det["text"] = text[min_start:max_end]
458
+
459
+ final_detections.append(best_det)
460
+
461
+ # 4. Final IoU Deduplication to clean up any remaining sloppy edges
462
+ return deduplicate_overlapping_entities(final_detections, iou_threshold=0.3)
463
+
464
+
465
  # --- WRAPPERS FOR UI ---
466
  def get_json_data(self, file_obj) -> pd.DataFrame:
467
  return self.json_handler.read_file(file_obj)
chunking_engine.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Segmento Chunker
3
+ Implements Presidio's sliding-window chunking logic for deep learning NLP models.
4
+ Prevents the 512-token limit truncation and deduplicates overlapping entities using IoU.
5
+ """
6
+
7
+ from typing import List, Dict, Any, Callable, Tuple
8
+ import logging
9
+
10
+ logger = logging.getLogger("segmento.chunker")
11
+
12
+ class TextChunk:
13
+ def __init__(self, text: str, start: int, end: int):
14
+ self.text = text
15
+ self.start = start
16
+ self.end = end
17
+
18
+ class SegmentoChunker:
19
+ """Character-based text chunker with word boundary preservation."""
20
+
21
+ def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 200):
22
+ self.chunk_size = chunk_size
23
+ self.chunk_overlap = chunk_overlap
24
+ self.boundary_chars = (" ", "\n", "\t")
25
+
26
+ def chunk(self, text: str) -> List[TextChunk]:
27
+ """Split text into overlapping chunks, extending to the nearest word boundary."""
28
+ if not text:
29
+ return []
30
+
31
+ chunks = []
32
+ start = 0
33
+
34
+ while start < len(text):
35
+ end = min(start + self.chunk_size, len(text))
36
+
37
+ # Extend to complete word boundary
38
+ while end < len(text) and text[end] not in self.boundary_chars:
39
+ end += 1
40
+
41
+ chunks.append(TextChunk(text=text[start:end], start=start, end=end))
42
+
43
+ if end >= len(text):
44
+ break
45
+
46
+ # Slide window forward by (size - overlap)
47
+ # Actually, standard Presidio does: start = end - chunk_overlap
48
+ start = end - self.chunk_overlap
49
+
50
+ return chunks
51
+
52
+
53
+ def _calculate_iou(start1: int, end1: int, start2: int, end2: int) -> float:
54
+ """Calculate Intersection-over-Union for two text spans."""
55
+ intersection = max(0, min(end1, end2) - max(start1, start2))
56
+ if intersection == 0:
57
+ return 0.0
58
+ union = (end1 - start1) + (end2 - start2) - intersection
59
+ return intersection / union if union > 0 else 0.0
60
+
61
+ def deduplicate_overlapping_entities(entities: List[Dict[str, Any]], iou_threshold: float = 0.5) -> List[Dict[str, Any]]:
62
+ """
63
+ Remove duplicate entities that overlap across chunks.
64
+ Keeps the entity with the highest score.
65
+ """
66
+ if not entities:
67
+ return []
68
+
69
+ # Sort by score descending so we keep the highest confidence matches
70
+ # If score is missing, default to 1.0
71
+ entities = sorted(entities, key=lambda x: x.get('score', 1.0), reverse=True)
72
+
73
+ unique_entities = []
74
+
75
+ for entity in entities:
76
+ is_duplicate = False
77
+ for unique_ent in unique_entities:
78
+ # If same label and high IoU overlap
79
+ if entity.get("label") == unique_ent.get("label"):
80
+ iou = _calculate_iou(
81
+ entity["start"], entity["end"],
82
+ unique_ent["start"], unique_ent["end"]
83
+ )
84
+ if iou >= iou_threshold:
85
+ is_duplicate = True
86
+ break
87
+
88
+ if not is_duplicate:
89
+ unique_entities.append(entity)
90
+
91
+ # Sort back by original text position
92
+ unique_entities = sorted(unique_entities, key=lambda x: x["start"])
93
+ return unique_entities
94
+
95
+
96
+ def run_model_with_chunking(
97
+ scan_func: Callable[[str], List[Dict[str, Any]]],
98
+ text: str,
99
+ chunk_size: int = 2000,
100
+ overlap: int = 200
101
+ ) -> List[Dict[str, Any]]:
102
+ """
103
+ Wrapper function to pass large text through a Deep Learning model safely.
104
+ Splits text, runs the scan_func on each chunk, maps coordinates back, and deduplicates.
105
+ """
106
+ if not text:
107
+ return []
108
+
109
+ chunker = SegmentoChunker(chunk_size=chunk_size, chunk_overlap=overlap)
110
+ chunks = chunker.chunk(text)
111
+
112
+ all_detections = []
113
+
114
+ for chunk in chunks:
115
+ # Run inference on the chunk text
116
+ try:
117
+ chunk_results = scan_func(chunk.text)
118
+
119
+ for res in chunk_results:
120
+ # Map local coordinates to absolute document coordinates
121
+ absolute_start = res["start"] + chunk.start
122
+ absolute_end = res["end"] + chunk.start
123
+
124
+ mapped_res = res.copy()
125
+ mapped_res["start"] = absolute_start
126
+ mapped_res["end"] = absolute_end
127
+
128
+ all_detections.append(mapped_res)
129
+ except Exception as e:
130
+ logger.error(f"Error scanning chunk: {e}")
131
+ continue
132
+
133
+ # Deduplicate overlapping entities caused by the sliding window
134
+ deduplicated = deduplicate_overlapping_entities(all_detections, iou_threshold=0.5)
135
+ return deduplicated
evaluator_api.py CHANGED
@@ -39,6 +39,16 @@ def setup(classifier_instance):
39
  # ─────────────────────────────────────────────
40
 
41
  MODEL_CATALOGUE = [
 
 
 
 
 
 
 
 
 
 
42
  {
43
  "key": "regex",
44
  "label": "πŸ› οΈ Regex Engine",
 
39
  # ─────────────────────────────────────────────
40
 
41
  MODEL_CATALOGUE = [
42
+ {
43
+ "key": "ensemble",
44
+ "label": "πŸ‘‘ Ensemble (God Algorithm)",
45
+ "hf_id": "hybrid",
46
+ "type": "God Algorithm",
47
+ "params": "IoU",
48
+ "f1_benchmark": 0.99,
49
+ "lazy": False,
50
+ "description": "Aggregates all models using sliding windows and weighted IoU deduplication.",
51
+ },
52
  {
53
  "key": "regex",
54
  "label": "πŸ› οΈ Regex Engine",
file_handlers/universal_parser.py CHANGED
@@ -59,6 +59,29 @@ CATEGORY_FILE_TYPES = {
59
  ],
60
  }
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  def parse_file(file_bytes: bytes, file_type: str) -> str:
64
  """
@@ -77,11 +100,13 @@ def parse_file(file_bytes: bytes, file_type: str) -> str:
77
  if parser_fn is None:
78
  # Best-effort: try UTF-8 text decode
79
  try:
80
- return file_bytes.decode("utf-8", errors="replace")
81
  except Exception:
82
  return f"[No parser for file type: {file_type}]"
83
-
84
- return parser_fn(file_bytes)
 
 
85
 
86
 
87
  def get_all_categories() -> dict:
 
59
  ],
60
  }
61
 
62
+ import re
63
+
64
+ def normalize_text(text: str) -> str:
65
+ """
66
+ Cleans extracted text to improve NLP processing accuracy.
67
+ - Removes zero-width characters and invisible control characters.
68
+ - Normalizes repeated whitespaces (except newlines).
69
+ - Repairs broken sentences caused by PDF layout extraction.
70
+ """
71
+ if not text:
72
+ return ""
73
+
74
+ # 1. Remove zero-width characters and non-printable control chars (excluding \n, \t)
75
+ text = re.sub(r'[\u200b\u200c\u200d\uFEFF\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
76
+
77
+ # 2. Collapse multiple spaces/tabs into a single space
78
+ text = re.sub(r'[ \t]+', ' ', text)
79
+
80
+ # 3. Repair broken PDF lines: If a line does not end with punctuation or is short, join it.
81
+ # We replace a newline that is preceded by a lowercase letter and followed by a letter with a space.
82
+ text = re.sub(r'([a-z])\n([A-Za-z])', r'\1 \2', text)
83
+
84
+ return text.strip()
85
 
86
  def parse_file(file_bytes: bytes, file_type: str) -> str:
87
  """
 
100
  if parser_fn is None:
101
  # Best-effort: try UTF-8 text decode
102
  try:
103
+ raw_text = file_bytes.decode("utf-8", errors="replace")
104
  except Exception:
105
  return f"[No parser for file type: {file_type}]"
106
+ else:
107
+ raw_text = parser_fn(file_bytes)
108
+
109
+ return normalize_text(raw_text)
110
 
111
 
112
  def get_all_categories() -> dict: