Spaces:

SHAFISK17
/

sense-backend

Running

App Files Files Community

SHAFI committed 13 days ago

Commit

8ab5f69

1 Parent(s): 0132d3b

improved text extraction and chunking

Browse files

Files changed (4) hide show

backend.py +129 -19
chunking_engine.py +135 -0
evaluator_api.py +10 -0
file_handlers/universal_parser.py +28 -3

backend.py CHANGED Viewed

@@ -196,36 +196,52 @@ class RegexClassifier:
         """
         Run selected models on text IN PARALLEL using a thread pool.
         Each model gets its own timeout. Timed-out / failed models return
-        {"detections": [], "error": "...", "timed_out": True} so the
-        caller always gets partial results rather than a total failure.
-        Returns:
-            Dict[model_key -> {"detections": List[dict], "error": str|None, "timed_out": bool}]
         """
-        always_on = {
             "regex":    lambda t: self.regex_scanner.scan(t),
             "nltk":     lambda t: self.scan_with_nltk(t),
             "spacy":    lambda t: self.spacy_analyzer.scan(t),
             "presidio": lambda t: self.presidio_analyzer.scan(t),
-            "gliner":   lambda t: self.gliner_analyzer.scan(t),
-            "deberta":  lambda t: self.deberta_analyzer.scan(t),
         }
         def _run_one(key: str):
-            timeout = self._ALWAYS_ON_TIMEOUT if key in always_on else self._LAZY_MODEL_TIMEOUT
             try:
-                if key in always_on:
-                    detections = always_on[key](text)
                 else:
                     model = self._get_lazy_model(key)
-                    detections = model.scan(text) if model else []
                 return key, {"detections": detections, "error": None, "timed_out": False}
             except Exception as e:
                 print(f"[scan_with_models] Error in '{key}': {e}")
                 return key, {"detections": [], "error": str(e), "timed_out": False}
         results: Dict[str, Any] = {}
-        # Use min(len(model_keys), 4) workers — HF Spaces free tier has ~2 vCPUs
         max_workers = min(len(model_keys), 4)
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
             future_map = {pool.submit(_run_one, key): key for key in model_keys}
@@ -295,20 +311,22 @@ class RegexClassifier:
         all_matches = []
         # --- Always-On models (no lazy loading needed) ---
         if "regex"    in selected_models: all_matches.extend(self.regex_scanner.scan(text))
         if "nltk"     in selected_models: all_matches.extend(self.scan_with_nltk(text))
         if "spacy"    in selected_models: all_matches.extend(self.spacy_analyzer.scan(text))
         if "presidio" in selected_models: all_matches.extend(self.presidio_analyzer.scan(text))
-        if "gliner"   in selected_models: all_matches.extend(self.gliner_analyzer.scan(text))
-        if "deberta"  in selected_models: all_matches.extend(self.deberta_analyzer.scan(text))
         # --- Lazy-loaded models (instantiated on first use) ---
         for lazy_key in ["pasteproof", "piiranha", "nvidia_gliner", "mmbert"]:
             if lazy_key in selected_models:
                 model = self._get_lazy_model(lazy_key)
                 if model:
-                    all_matches.extend(model.scan(text))
         # Sort and Deduplicate by span overlap (keep longest match)
         all_matches.sort(key=lambda x: x['start'])
@@ -334,24 +352,116 @@ class RegexClassifier:
         if not selected_models:
             selected_models = list(self._ALWAYS_ON)
         model_results: Dict[str, list] = {}
         if "regex"    in selected_models: model_results["🛠️ Regex"]    = self.regex_scanner.scan(text)
         if "nltk"     in selected_models: model_results["🧠 NLTK"]     = self.scan_with_nltk(text)
         if "spacy"    in selected_models: model_results["🤖 SpaCy"]    = self.spacy_analyzer.scan(text)
         if "presidio" in selected_models: model_results["🛡️ Presidio"] = self.presidio_analyzer.scan(text)
-        if "gliner"   in selected_models: model_results["🦅 GLiNER"]   = self.gliner_analyzer.scan(text)
-        if "deberta"  in selected_models: model_results["🚀 DeBERTa"]  = self.deberta_analyzer.scan(text)
         for lazy_key, label in [("pasteproof", "📋 Pasteproof"), ("piiranha", "🐟 Piiranha"),
                                  ("nvidia_gliner", "⚡ NVIDIA-GLiNER"), ("mmbert", "🌐 mmbert32k")]:
             if lazy_key in selected_models:
                 m = self._get_lazy_model(lazy_key)
                 if m:
-                    model_results[label] = m.scan(text)
         return self.inspector.compare_models_dynamic(model_results)
     # --- WRAPPERS FOR UI ---
     def get_json_data(self, file_obj) -> pd.DataFrame:
         return self.json_handler.read_file(file_obj)

         """
         Run selected models on text IN PARALLEL using a thread pool.
         Each model gets its own timeout. Timed-out / failed models return
+        {"detections": [], "error": "...", "timed_out": True}.
         """
+        from chunking_engine import run_model_with_chunking
+        # Models that do not need chunking (they process the entire string at once)
+        always_on_no_chunk = {
             "regex":    lambda t: self.regex_scanner.scan(t),
             "nltk":     lambda t: self.scan_with_nltk(t),
             "spacy":    lambda t: self.spacy_analyzer.scan(t),
             "presidio": lambda t: self.presidio_analyzer.scan(t),
+        }
+        # Models that DO need chunking due to token limits
+        always_on_chunked = {
+            "gliner":   lambda t: run_model_with_chunking(self.gliner_analyzer.scan, t),
+            "deberta":  lambda t: run_model_with_chunking(self.deberta_analyzer.scan, t),
         }
+        # Handle the special 'ensemble' key
+        if "ensemble" in model_keys:
+            # We don't parallelize ensemble within the pool because it internally calls other models
+            # We will handle it separately below or let it run synchronously if it's the only one.
+            pass
         def _run_one(key: str):
+            if key == "ensemble":
+                return key, {"detections": self.run_weighted_ensemble(text), "error": None, "timed_out": False}
+            timeout = self._ALWAYS_ON_TIMEOUT if (key in always_on_no_chunk or key in always_on_chunked) else self._LAZY_MODEL_TIMEOUT
             try:
+                if key in always_on_no_chunk:
+                    detections = always_on_no_chunk[key](text)
+                elif key in always_on_chunked:
+                    detections = always_on_chunked[key](text)
                 else:
                     model = self._get_lazy_model(key)
+                    if model:
+                        detections = run_model_with_chunking(model.scan, text)
+                    else:
+                        detections = []
                 return key, {"detections": detections, "error": None, "timed_out": False}
             except Exception as e:
                 print(f"[scan_with_models] Error in '{key}': {e}")
                 return key, {"detections": [], "error": str(e), "timed_out": False}
         results: Dict[str, Any] = {}
         max_workers = min(len(model_keys), 4)
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
             future_map = {pool.submit(_run_one, key): key for key in model_keys}
         all_matches = []
+        from chunking_engine import run_model_with_chunking
         # --- Always-On models (no lazy loading needed) ---
         if "regex"    in selected_models: all_matches.extend(self.regex_scanner.scan(text))
         if "nltk"     in selected_models: all_matches.extend(self.scan_with_nltk(text))
         if "spacy"    in selected_models: all_matches.extend(self.spacy_analyzer.scan(text))
         if "presidio" in selected_models: all_matches.extend(self.presidio_analyzer.scan(text))
+        if "gliner"   in selected_models: all_matches.extend(run_model_with_chunking(self.gliner_analyzer.scan, text))
+        if "deberta"  in selected_models: all_matches.extend(run_model_with_chunking(self.deberta_analyzer.scan, text))
         # --- Lazy-loaded models (instantiated on first use) ---
         for lazy_key in ["pasteproof", "piiranha", "nvidia_gliner", "mmbert"]:
             if lazy_key in selected_models:
                 model = self._get_lazy_model(lazy_key)
                 if model:
+                    all_matches.extend(run_model_with_chunking(model.scan, text))
         # Sort and Deduplicate by span overlap (keep longest match)
         all_matches.sort(key=lambda x: x['start'])
         if not selected_models:
             selected_models = list(self._ALWAYS_ON)
+        from chunking_engine import run_model_with_chunking
         model_results: Dict[str, list] = {}
         if "regex"    in selected_models: model_results["🛠️ Regex"]    = self.regex_scanner.scan(text)
         if "nltk"     in selected_models: model_results["🧠 NLTK"]     = self.scan_with_nltk(text)
         if "spacy"    in selected_models: model_results["🤖 SpaCy"]    = self.spacy_analyzer.scan(text)
         if "presidio" in selected_models: model_results["🛡️ Presidio"] = self.presidio_analyzer.scan(text)
+        if "gliner"   in selected_models: model_results["🦅 GLiNER"]   = run_model_with_chunking(self.gliner_analyzer.scan, text)
+        if "deberta"  in selected_models: model_results["🚀 DeBERTa"]  = run_model_with_chunking(self.deberta_analyzer.scan, text)
         for lazy_key, label in [("pasteproof", "📋 Pasteproof"), ("piiranha", "🐟 Piiranha"),
                                  ("nvidia_gliner", "⚡ NVIDIA-GLiNER"), ("mmbert", "🌐 mmbert32k")]:
             if lazy_key in selected_models:
                 m = self._get_lazy_model(lazy_key)
                 if m:
+                    model_results[label] = run_model_with_chunking(m.scan, text)
         return self.inspector.compare_models_dynamic(model_results)
+    def run_weighted_ensemble(self, text: str) -> List[dict]:
+        """
+        Runs the 'God Algorithm' Weighted Ensemble.
+        Combines rule-based (Regex/Presidio) and contextual (DeBERTa/GLiNER) detections,
+        weights them, and groups by Intersection-over-Union (IoU) to resolve conflicts.
+        """
+        from chunking_engine import run_model_with_chunking, deduplicate_overlapping_entities
+        raw_detections = []
+        # 1. Run all models and assign trust weights based on their architecture
+        # Highly trusted deterministic / rule-based models
+        for m in self.regex_scanner.scan(text):
+            m["weight"] = 1.0; m["source"] = "Ensemble (Regex)"
+            raw_detections.append(m)
+        for m in self.presidio_analyzer.scan(text):
+            m["weight"] = 0.95; m["source"] = "Ensemble (Presidio)"
+            raw_detections.append(m)
+        # Context-aware deep learning models (chunked)
+        for m in run_model_with_chunking(self.deberta_analyzer.scan, text):
+            m["weight"] = 0.85; m["source"] = "Ensemble (DeBERTa)"
+            raw_detections.append(m)
+        for m in run_model_with_chunking(self.gliner_analyzer.scan, text):
+            m["weight"] = 0.75; m["source"] = "Ensemble (GLiNER)"
+            raw_detections.append(m)
+        # Baseline statistical models
+        for m in self.spacy_analyzer.scan(text):
+            m["weight"] = 0.5; m["source"] = "Ensemble (SpaCy)"
+            raw_detections.append(m)
+        if not raw_detections:
+            return []
+        # 2. Cluster overlapping detections
+        # Sort by start coordinate to make grouping easier
+        raw_detections.sort(key=lambda x: x["start"])
+        clusters = []
+        current_cluster = [raw_detections[0]]
+        for det in raw_detections[1:]:
+            # If the current detection overlaps with the active cluster
+            # (i.e. start is before the end of the last item in the cluster)
+            if det["start"] <= max(x["end"] for x in current_cluster):
+                current_cluster.append(det)
+            else:
+                clusters.append(current_cluster)
+                current_cluster = [det]
+        clusters.append(current_cluster)
+        # 3. Resolve conflicts within each cluster
+        final_detections = []
+        for cluster in clusters:
+            if len(cluster) == 1:
+                final_detections.append(cluster[0])
+                continue
+            # Aggregate weights by label
+            label_weights = {}
+            for det in cluster:
+                lbl = det["label"]
+                w = det["weight"]
+                label_weights[lbl] = label_weights.get(lbl, 0) + w
+            # Pick the winning label
+            winning_label = max(label_weights.items(), key=lambda x: x[1])[0]
+            # Find the detection in this cluster that has the winning label and highest individual weight
+            candidates = [c for c in cluster if c["label"] == winning_label]
+            if not candidates:
+                candidates = cluster # fallback, shouldn't happen
+            best_det = max(candidates, key=lambda x: x["weight"])
+            # Optionally, expand boundaries to encompass the maximum matched area
+            min_start = min(c["start"] for c in cluster if c["label"] == winning_label)
+            max_end = max(c["end"] for c in cluster if c["label"] == winning_label)
+            best_det["start"] = min_start
+            best_det["end"] = max_end
+            best_det["text"] = text[min_start:max_end]
+            final_detections.append(best_det)
+        # 4. Final IoU Deduplication to clean up any remaining sloppy edges
+        return deduplicate_overlapping_entities(final_detections, iou_threshold=0.3)
     # --- WRAPPERS FOR UI ---
     def get_json_data(self, file_obj) -> pd.DataFrame:
         return self.json_handler.read_file(file_obj)

chunking_engine.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+Segmento Chunker
+Implements Presidio's sliding-window chunking logic for deep learning NLP models.
+Prevents the 512-token limit truncation and deduplicates overlapping entities using IoU.
+"""
+from typing import List, Dict, Any, Callable, Tuple
+import logging
+logger = logging.getLogger("segmento.chunker")
class TextChunk:
    """A contiguous slice of a document plus its absolute character offsets.

    ``start`` and ``end`` are the half-open ``[start, end)`` offsets of
    ``text`` inside the original document, i.e. ``document[start:end] == text``.
    """

    def __init__(self, text: str, start: int, end: int):
        self.text = text
        self.start = start
        self.end = end

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(start={self.start}, end={self.end}, "
            f"text={self.text!r})"
        )


class SegmentoChunker:
    """Character-based text chunker with word boundary preservation."""

    def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 200):
        """
        Args:
            chunk_size: Minimum number of characters per chunk (a chunk may be
                longer because it is extended to the next word boundary).
            chunk_overlap: Number of characters shared between consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
                is negative or not smaller than ``chunk_size``. Such settings
                cannot move the window forward and used to hang ``chunk()``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.boundary_chars = (" ", "\n", "\t")

    def chunk(self, text: str) -> List[TextChunk]:
        """Split text into overlapping chunks, extending to the nearest word boundary.

        Args:
            text: The document to split.

        Returns:
            Chunks in document order; empty list for empty input. Each chunk's
            ``start``/``end`` are offsets into ``text``.
        """
        if not text:
            return []

        total = len(text)
        chunks: List[TextChunk] = []
        start = 0
        while start < total:
            end = min(start + self.chunk_size, total)
            # Extend to complete word boundary so no token is cut in half.
            while end < total and text[end] not in self.boundary_chars:
                end += 1
            chunks.append(TextChunk(text=text[start:end], start=start, end=end))
            if end >= total:
                break
            # Next window starts `chunk_overlap` characters before this one
            # ended. The max() guarantees forward progress even if the
            # attributes were changed to an invalid combination after
            # construction, so the loop can never spin in place.
            start = max(end - self.chunk_overlap, start + 1)
        return chunks
+def _calculate_iou(start1: int, end1: int, start2: int, end2: int) -> float:
+    """Calculate Intersection-over-Union for two text spans."""
+    intersection = max(0, min(end1, end2) - max(start1, start2))
+    if intersection == 0:
+        return 0.0
+    union = (end1 - start1) + (end2 - start2) - intersection
+    return intersection / union if union > 0 else 0.0
+def deduplicate_overlapping_entities(entities: List[Dict[str, Any]], iou_threshold: float = 0.5) -> List[Dict[str, Any]]:
+    """
+    Remove duplicate entities that overlap across chunks.
+    Keeps the entity with the highest score.
+    """
+    if not entities:
+        return []
+    # Sort by score descending so we keep the highest confidence matches
+    # If score is missing, default to 1.0
+    entities = sorted(entities, key=lambda x: x.get('score', 1.0), reverse=True)
+    unique_entities = []
+    for entity in entities:
+        is_duplicate = False
+        for unique_ent in unique_entities:
+            # If same label and high IoU overlap
+            if entity.get("label") == unique_ent.get("label"):
+                iou = _calculate_iou(
+                    entity["start"], entity["end"],
+                    unique_ent["start"], unique_ent["end"]
+                )
+                if iou >= iou_threshold:
+                    is_duplicate = True
+                    break
+        if not is_duplicate:
+            unique_entities.append(entity)
+    # Sort back by original text position
+    unique_entities = sorted(unique_entities, key=lambda x: x["start"])
+    return unique_entities
def run_model_with_chunking(
    scan_func: Callable[[str], List[Dict[str, Any]]],
    text: str,
    chunk_size: int = 2000,
    overlap: int = 200,
    iou_threshold: float = 0.5,
) -> List[Dict[str, Any]]:
    """
    Pass large text through a model chunk by chunk and merge the results.

    The text is split into overlapping chunks, ``scan_func`` is run on each
    chunk, chunk-local offsets are shifted back to document offsets, and
    duplicates produced by the overlap regions are removed.

    Args:
        scan_func: Callable taking a string and returning detections as dicts
            with chunk-local ``start`` and ``end`` keys.
        text: The full document.
        chunk_size: Target chunk length in characters.
        overlap: Number of characters shared by consecutive chunks.
        iou_threshold: IoU at or above which two same-label spans are treated
            as duplicates (previously fixed at 0.5).

    Returns:
        Detections in document coordinates, sorted by ``start``. The dicts are
        shallow copies, so ``scan_func``'s own results are not modified.
        Empty list for empty text.

    Note:
        This is best-effort: a chunk whose scan (or result mapping) raises is
        logged and skipped, so the caller may receive partial results.
    """
    if not text:
        return []

    chunker = SegmentoChunker(chunk_size=chunk_size, chunk_overlap=overlap)

    all_detections: List[Dict[str, Any]] = []
    for chunk in chunker.chunk(text):
        try:
            for res in scan_func(chunk.text):
                # Map local coordinates to absolute document coordinates.
                mapped_res = res.copy()
                mapped_res["start"] = res["start"] + chunk.start
                mapped_res["end"] = res["end"] + chunk.start
                all_detections.append(mapped_res)
        except Exception as e:
            # Keep going with the remaining chunks; logger.exception records
            # the traceback so the failure is still diagnosable.
            logger.exception("Error scanning chunk: %s", e)
            continue

    # Deduplicate overlapping entities caused by the sliding window.
    return deduplicate_overlapping_entities(all_detections, iou_threshold=iou_threshold)

evaluator_api.py CHANGED Viewed

@@ -39,6 +39,16 @@ def setup(classifier_instance):
 # ─────────────────────────────────────────────
 MODEL_CATALOGUE = [
     {
         "key": "regex",
         "label": "🛠️ Regex Engine",

 # ─────────────────────────────────────────────
 MODEL_CATALOGUE = [
+    {
+        "key": "ensemble",
+        "label": "👑 Ensemble (God Algorithm)",
+        "hf_id": "hybrid",
+        "type": "God Algorithm",
+        "params": "IoU",
+        "f1_benchmark": 0.99,
+        "lazy": False,
+        "description": "Aggregates all models using sliding windows and weighted IoU deduplication.",
+    },
     {
         "key": "regex",
         "label": "🛠️ Regex Engine",

file_handlers/universal_parser.py CHANGED Viewed

@@ -59,6 +59,29 @@ CATEGORY_FILE_TYPES = {
     ],
 }
 def parse_file(file_bytes: bytes, file_type: str) -> str:
     """
@@ -77,11 +100,13 @@ def parse_file(file_bytes: bytes, file_type: str) -> str:
     if parser_fn is None:
         # Best-effort: try UTF-8 text decode
         try:
-            return file_bytes.decode("utf-8", errors="replace")
         except Exception:
             return f"[No parser for file type: {file_type}]"
-    return parser_fn(file_bytes)
 def get_all_categories() -> dict:

     ],
 }
+import re
def normalize_text(text: str) -> str:
    """
    Clean extracted text to improve NLP processing accuracy.

    - Converts CRLF / CR line endings to LF.
    - Removes zero-width characters and invisible control characters
      (newlines and tabs are kept).
    - Collapses runs of spaces/tabs into a single space (newlines untouched).
    - Repairs lines broken mid-sentence by layout extraction: a newline that
      sits between a lowercase letter and a letter becomes a space.

    Args:
        text: Raw extracted text; may be empty or ``None``.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or ``""``
        for falsy input.
    """
    if not text:
        return ""
    # 0. Unify line endings first; a stray '\r' before '\n' would otherwise
    #    stop the line-repair step from ever matching on CRLF documents.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # 1. Remove zero-width characters and non-printable control chars (excluding \n, \t)
    text = re.sub(r'[\u200b\u200c\u200d\uFEFF\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    # 2. Collapse multiple spaces/tabs into a single space
    text = re.sub(r'[ \t]+', ' ', text)
    # 3. Repair broken lines: a newline preceded by a lowercase letter and
    #    followed by a letter becomes a space. Lookarounds leave the
    #    neighbouring letters unconsumed, so back-to-back short lines
    #    ("a\nb\nc") are all joined.
    text = re.sub(r'(?<=[a-z])\n(?=[A-Za-z])', ' ', text)
    return text.strip()
 def parse_file(file_bytes: bytes, file_type: str) -> str:
     """
     if parser_fn is None:
         # Best-effort: try UTF-8 text decode
         try:
+            raw_text = file_bytes.decode("utf-8", errors="replace")
         except Exception:
             return f"[No parser for file type: {file_type}]"
+    else:
+        raw_text = parser_fn(file_bytes)
+    return normalize_text(raw_text)
 def get_all_categories() -> dict: