Spaces:

HardikJha
/

extraction-arena

Sleeping

App Files Files Community

Hardikjha09 committed on Apr 25

Commit

b0854a3

0 Parent(s):

Add gradio app

Browse files

Files changed (7) hide show

app.py +171 -0
data/__init__.py +1 -0
data/corpus.py +37 -0
env/__init__.py +1 -0
env/adversary.py +151 -0
env/models.py +45 -0
requirements.txt +2 -0

app.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""
+Gradio Space app for Adversarial Structured-Extraction Arena.
+This is a self-contained demo bundle intended for Hugging Face Spaces.
+It uses the repo's adversary edit executor and schema-driven extraction UI.
+If `data/corpus.json` is not present (common on Spaces), it falls back to
+two built-in sample documents.
+"""
+import json
+import gradio as gr
+from env.adversary import AdversaryEditExecutor
+from env.models import AdversaryEdit
def _fallback_docs():
    """Return the built-in sample documents used when no corpus is available.

    Each entry is a dict with two keys:
      - "text":   the raw document, one newline-terminated line per field.
      - "schema": a JSON-Schema-style object describing the fields to extract.

    A fresh list (with fresh nested dicts) is built on every call.
    """

    def as_text(*lines):
        # Every line, including the last one, ends with a newline.
        return "".join(line + "\n" for line in lines)

    def object_schema(field_types, required):
        # Insertion order matters: it is what json.dumps() later shows in the UI.
        return {
            "type": "object",
            "properties": {name: {"type": kind} for name, kind in field_types},
            "required": list(required),
            "additionalProperties": False,
        }

    invoice = {
        "text": as_text(
            "TAX INVOICE",
            "Supplier: ABC Traders",
            "GSTIN: 29ABCDE1234F1Z5",
            "Invoice No: INV-1029",
            "Invoice Date: 12/03/2025",
            "Bill To: Rahul Sharma",
            "Phone: 9876543210",
            "Total Amount: ₹ 12,450.00",
        ),
        "schema": object_schema(
            [
                ("supplier_name", "string"),
                ("gstin", "string"),
                ("invoice_number", "string"),
                ("invoice_date", "string"),
                ("customer_name", "string"),
                ("phone", "string"),
                ("total_amount", "number"),
            ],
            ["gstin", "invoice_number", "invoice_date", "total_amount"],
        ),
    }

    bank_statement = {
        "text": as_text(
            "BANK STATEMENT",
            "Account Holder: Priya Verma",
            "Account No: 001234567890",
            "IFSC: HDFC0001234",
            "Period: 01/01/2025 - 31/01/2025",
            "Closing Balance: INR 54,210.75",
        ),
        "schema": object_schema(
            [
                ("account_holder", "string"),
                ("account_number", "string"),
                ("ifsc", "string"),
                ("period_start", "string"),
                ("period_end", "string"),
                ("closing_balance", "number"),
            ],
            ["account_number", "ifsc", "closing_balance"],
        ),
    }

    return [invoice, bank_statement]
def _load_corpus_if_available():
    """Return the "holdout" DocumentCorpus, or None when it cannot be used.

    Any failure counts as "not available": the import failing, the corpus
    constructor raising, or the trial ``sample()`` call raising. The caller
    then falls back to the built-in documents.
    """
    try:
        from data.corpus import DocumentCorpus

        holdout = DocumentCorpus(split="holdout")
        # Smoke check: draw one document so an unusable corpus is rejected now
        # rather than on the first button click.
        holdout.sample()
    except Exception:
        return None
    return holdout
# Module-level demo state, created once at import time and shared by every
# UI callback below.
executor = AdversaryEditExecutor()
corpus = _load_corpus_if_available()
fallback_docs = _fallback_docs()
fallback_idx = 0  # position of the next built-in document to hand out


def load_random_doc():
    """Return ``(document_text, schema_json)`` for the "Load Random Document" button.

    With a corpus available, a document is drawn via ``corpus.sample()``.
    Otherwise the built-in documents are handed out in rotation (round-robin,
    not random), advancing the module-level ``fallback_idx`` on each call.
    The schema is returned pretty-printed as JSON text.
    """
    global fallback_idx
    if corpus is None:
        picked = fallback_docs[fallback_idx % len(fallback_docs)]
        fallback_idx += 1
    else:
        picked = corpus.sample()
    return picked["text"], json.dumps(picked["schema"], indent=2)
def apply_perturbation(doc_text, schema_text, edit_intensity):
    """Apply a single ``ocr_noise`` adversary edit to the document.

    Args:
        doc_text: Document text from the UI; ``None`` is treated as "".
        schema_text: Schema as JSON text. Text that does not parse, or that
            parses to something other than a JSON object, is replaced by an
            empty schema instead of failing the callback.
        edit_intensity: Noise intensity, converted with ``float()`` and passed
            to the edit as ``params["intensity"]``.

    Returns:
        ``(perturbed_document, schema_json)`` where ``schema_json`` is the
        schema returned by the executor, pretty-printed.
    """
    try:
        schema = json.loads(schema_text)
    except (TypeError, ValueError, RecursionError):
        # TypeError: non-string input; ValueError: malformed JSON
        # (json.JSONDecodeError subclasses it); RecursionError: absurd nesting.
        schema = {}
    if not isinstance(schema, dict):
        # "null", "[]", "42" are valid JSON but not schemas; the executor
        # expects a dict and would otherwise fail on ``schema.copy()``.
        schema = {}

    edit = AdversaryEdit(
        edit_type="ocr_noise",
        params={"intensity": float(edit_intensity)},
        token_cost=10,  # placeholder: the model validator overwrites it with the canonical cost
    )
    mod_doc, mod_schema = executor.apply_edits(doc_text or "", schema, [edit])
    return mod_doc, json.dumps(mod_schema, indent=2)
def extract_data(doc_text, schema_text):
    """
    Placeholder extractor.

    Ignores ``doc_text`` and returns pretty-printed JSON that maps every key
    under the schema's "properties" to the string "[Extracted Value]".
    Returns "{}" when the schema text cannot be parsed, is not an object, or
    has no dict-valued "properties".

    For a fully aligned repo demo, replace this function with a call to:
    - your hosted extractor model (HF Inference), or
    - a local model in the Space (GPU Space).
    """
    placeholder = "[Extracted Value]"
    try:
        parsed = json.loads(schema_text)
        field_specs = parsed.get("properties") if isinstance(parsed, dict) else None
        if not isinstance(field_specs, dict):
            field_specs = {}
        # dict.fromkeys keeps the schema's property order.
        return json.dumps(dict.fromkeys(field_specs, placeholder), indent=2)
    except Exception:
        return "{}"
# UI layout: three side-by-side columns (source document, adversary controls,
# extractor output) followed by the button wiring.
with gr.Blocks(title="Adversarial Extraction Arena") as demo:
    gr.Markdown("# Adversarial Structured-Extraction Arena")
    gr.Markdown("Agent A perturbs documents. Agent E extracts structured data despite the noise.")

    with gr.Row():
        # Column 1: the original document and the schema to extract against.
        with gr.Column():
            gr.Markdown("### Original Document")
            source_doc_box = gr.TextArea(label="Document Text", lines=10)
            schema_box = gr.TextArea(label="Target Schema", lines=10)
            load_button = gr.Button("Load Random Document")

        # Column 2: adversary controls and the perturbed result.
        with gr.Column():
            gr.Markdown("### Adversary (Agent A)")
            noise_slider = gr.Slider(
                label="Noise Intensity",
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.2,
            )
            perturb_button = gr.Button("Apply Perturbation")
            perturbed_doc_box = gr.TextArea(label="Perturbed Document", lines=10)

        # Column 3: extractor trigger and its JSON output.
        with gr.Column():
            gr.Markdown("### Extractor (Agent E)")
            extract_button = gr.Button("Run Extractor")
            extraction_box = gr.TextArea(label="Extracted JSON", lines=10)

    load_button.click(
        fn=load_random_doc,
        inputs=[],
        outputs=[source_doc_box, schema_box],
    )
    # The (possibly modified) schema is written back into the schema box so the
    # extractor sees what the adversary produced.
    perturb_button.click(
        fn=apply_perturbation,
        inputs=[source_doc_box, schema_box, noise_slider],
        outputs=[perturbed_doc_box, schema_box],
    )
    # The extractor reads the perturbed document, not the original one.
    extract_button.click(
        fn=extract_data,
        inputs=[perturbed_doc_box, schema_box],
        outputs=[extraction_box],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)

data/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __all__ = []

data/corpus.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import json
+import random
+from typing import Any, Dict, List
class DocumentCorpus:
    """Holds the documents of one split of a JSON corpus file.

    The file is expected to contain a JSON array of objects; an object's
    "split" key selects the split it belongs to, defaulting to "train" when
    the key is absent.
    """

    def __init__(self, data_file: str = "data/corpus.json", split: str = "train"):
        self.data_file = data_file
        self.split = split
        # Loaded eagerly, so a missing file surfaces at construction time.
        self.documents = self._load_data()

    def _load_data(self) -> List[Dict[str, Any]]:
        """Read ``self.data_file`` and keep only the entries of ``self.split``.

        Prints a warning (and still returns an empty list) when no entry
        matches the split.

        Raises:
            FileNotFoundError: when the corpus file does not exist.
        """
        try:
            with open(self.data_file, "r", encoding="utf-8") as handle:
                entries = json.load(handle)
            selected = []
            for entry in entries:
                # Entries without an explicit "split" count as "train".
                if entry.get("split", "train") == self.split:
                    selected.append(entry)
            if len(selected) == 0:
                print(f"Warning: No documents found for split '{self.split}'.")
            return selected
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Corpus file {self.data_file} not found. Please run 'python data/generator.py' first."
            )

    def sample(self) -> Dict[str, Any]:
        """Return one document of this split, chosen uniformly at random.

        Raises:
            ValueError: when the split holds no documents.
        """
        if self.documents:
            return random.choice(self.documents)
        raise ValueError(f"Corpus is empty for split {self.split}.")

    def get_all(self) -> List[Dict[str, Any]]:
        """Return the list of all documents in this split (useful for eval)."""
        return self.documents

env/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __all__ = []

env/adversary.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import re
+import random
+from typing import Tuple
# OCR confusion map — realistic character substitutions.
# Keys are either a single character or a short character sequence
# (e.g. "rn" -> "m"); ocr_noise() tries the multi-character keys first.
OCR_MAP = {
    "0": "O",
    "O": "0",
    "1": "l",
    "l": "1",
    "I": "1",
    "5": "S",
    "S": "5",
    "8": "B",
    "B": "8",
    "6": "G",
    "rn": "m",
    "vv": "w",
    "cl": "d",
}


class AdversaryEditExecutor:
    """Applies structured edit programs to document text and schema.

    Every edit method takes ``(doc, schema, ...)`` and returns a
    ``(doc, schema)`` tuple. The individual edit methods modify the schema
    they are given in place; ``apply_edits`` is the entry point that protects
    the caller's schema by working on a deep copy.
    """

    def apply_edits(self, document: str, schema: dict, edits: list) -> Tuple[str, dict]:
        """Apply all edits in sequence. Returns (modified_doc, modified_schema).

        The caller's ``schema`` is never modified: edits such as ``swap_type``
        write into nested property dicts, so a deep copy is required (a
        shallow copy of "properties" would still share those dicts).
        ``None`` is accepted for ``schema`` and treated as an empty schema.
        """
        import copy  # function-scope import keeps this block self-contained

        doc = document
        sch = {} if schema is None else copy.deepcopy(schema)
        for edit in edits or []:
            doc, sch = self.apply_single_edit(doc, sch, edit)
        return doc, sch

    def apply_single_edit(self, doc: str, schema: dict, edit) -> Tuple[str, dict]:
        """Dispatch one edit to its handler based on ``edit.edit_type``.

        ``edit.edit_type`` may be an enum member (its ``.value`` is used) or a
        plain string. Unknown edit types and edits whose handler raises leave
        ``doc`` unchanged; this is deliberate, so one bad adversary action
        cannot abort the whole episode. Note that a handler which raises
        midway may already have modified ``schema``.
        """
        t = getattr(edit.edit_type, "value", edit.edit_type)
        p = edit.params
        try:
            if t == "rename_field":
                return self.rename_field(doc, schema, p.get("old_name", ""), p.get("new_name", ""))
            elif t == "swap_type":
                return self.swap_type(doc, schema, p.get("field", ""), p.get("new_type", "string"))
            elif t == "inject_distractor":
                return self.inject_distractor(doc, schema, p.get("content", ""))
            elif t == "mutate_format":
                return self.mutate_format(doc, schema, p.get("field", ""), p.get("pattern", ""))
            elif t == "add_required_field":
                return self.add_required_field(doc, schema, p.get("name", ""), p.get("value", ""))
            elif t == "ocr_noise":
                return self.ocr_noise(doc, schema, float(p.get("intensity", 0.3)))
            elif t == "swap_columns":
                return self.swap_columns(doc, schema, p.get("col_a", 0), p.get("col_b", 1))
        except Exception:
            # Best-effort by design: skip the bad action, but leave a trace so
            # genuine handler bugs are not completely invisible.
            import logging

            logging.getLogger(__name__).debug(
                "Adversary edit %r failed; skipping", t, exc_info=True
            )
        return doc, schema

    def rename_field(self, doc: str, schema: dict, old_name: str, new_name: str) -> Tuple[str, dict]:
        """Rename a field label in the text and the matching schema entries.

        No-op when either name is empty. The text replacement is
        case-insensitive and restricted to whole words; the schema lookup is
        an exact key match. In "required" the new name is appended at the end.
        """
        if not old_name or not new_name:
            return doc, schema
        # Replace old_name with new_name in doc text (case-insensitive, word-boundary safe)
        pattern = re.compile(r"\b" + re.escape(old_name) + r"\b", re.IGNORECASE)
        new_doc = pattern.sub(new_name, doc)
        # Also update schema: rename the property key
        new_schema = schema
        if "properties" in new_schema and old_name in new_schema["properties"]:
            prop = new_schema["properties"].pop(old_name)
            new_schema["properties"][new_name] = prop
        if "required" in new_schema and old_name in new_schema["required"]:
            new_schema["required"].remove(old_name)
            new_schema["required"].append(new_name)
        return new_doc, new_schema

    def swap_type(self, doc: str, schema: dict, field: str, new_type: str) -> Tuple[str, dict]:
        """Set the declared "type" of ``field`` to ``new_type``; text is untouched."""
        if "properties" in schema and field in schema["properties"]:
            schema["properties"][field]["type"] = new_type
        return doc, schema

    def ocr_noise(self, doc: str, schema: dict, intensity: float) -> Tuple[str, dict]:
        """Corrupt the text with OCR-style confusions.

        ``intensity`` is clamped to [0, 1] and used as the per-position
        probability of attempting a corruption. At a selected position:
          1. if a multi-character OCR_MAP key starts here, the whole sequence
             is replaced (e.g. "rn" -> "m");
          2. else if the character is an OCR_MAP key, it is substituted;
          3. else if it is a digit, with probability 0.1 it is either followed
             by a stray space or dropped (50/50).
        Otherwise the character is kept. The schema is returned unchanged.
        """
        intensity = max(0.0, min(1.0, intensity))
        # Longest sequences first so overlapping keys resolve deterministically.
        sequences = sorted((k for k in OCR_MAP if len(k) > 1), key=len, reverse=True)
        out = []
        i = 0
        n = len(doc)
        while i < n:
            char = doc[i]
            if random.random() < intensity:
                seq = next((k for k in sequences if doc.startswith(k, i)), None)
                if seq is not None:
                    out.append(OCR_MAP[seq])
                    i += len(seq)
                    continue
                if char in OCR_MAP:
                    char = OCR_MAP[char]
                elif char.isdigit() and random.random() < 0.1:
                    char = (char + " ") if random.random() > 0.5 else ""
            out.append(char)
            i += 1
        return "".join(out), schema

    def mutate_format(self, doc: str, schema: dict, field: str, pattern: str) -> Tuple[str, dict]:
        """Rewrite value formats in the text according to ``pattern``.

        Supported patterns:
          - "date_dmy_to_mdy":  DD/MM/YYYY -> MM-DD-YYYY
          - "date_dmy_to_iso":  DD/MM/YYYY -> YYYY-MM-DD
          - "currency_symbol_to_text": "₹" -> "INR "
          - "phone_compact_to_dashed": a standalone 10-digit run -> NNN-NNN-NNNN
        Any other pattern is a no-op. ``field`` is currently unused: the
        rewrite applies to every match in the document.
        """
        if pattern == "date_dmy_to_mdy":
            doc = re.sub(r"(\d{2})/(\d{2})/(\d{4})", r"\2-\1-\3", doc)
        elif pattern == "date_dmy_to_iso":
            doc = re.sub(r"(\d{2})/(\d{2})/(\d{4})", r"\3-\2-\1", doc)
        elif pattern == "currency_symbol_to_text":
            doc = doc.replace("₹", "INR ")
        elif pattern == "phone_compact_to_dashed":
            # Capture the three digit groups separately; slicing a "\g<1>"
            # template string does not slice the matched digits.
            doc = re.sub(r"(?<!\d)(\d{3})(\d{3})(\d{4})(?!\d)", r"\1-\2-\3", doc)
        return doc, schema

    def inject_distractor(self, doc: str, schema: dict, content: str) -> Tuple[str, dict]:
        """Insert one distractor line near (but not at) the end of the text.

        An empty ``content`` is replaced by a default distractor sentence.
        """
        if not content:
            content = "Random distractor line item that means nothing."
        lines = doc.split("\n")
        # Insert near the end but not at the very end
        idx = max(0, len(lines) - 2 - random.randint(0, 3))
        lines.insert(idx, content)
        return "\n".join(lines), schema

    def add_required_field(self, doc: str, schema: dict, name: str, value: str) -> Tuple[str, dict]:
        """Append "name: value" to the text and register ``name`` in the schema.

        No-op when ``name`` is empty. The property is (re)declared as a string
        when the schema has "properties"; the name is added to "required" when
        that list exists and does not already contain it.
        """
        if not name:
            return doc, schema
        # Append "name: value" to document
        doc += f"\n{name}: {value}"
        # AND add name to schema required fields
        if "properties" in schema:
            schema["properties"][name] = {"type": "string"}
        if "required" in schema and name not in schema["required"]:
            schema["required"].append(name)
        return doc, schema

    def swap_columns(self, doc: str, schema: dict, col_a: int, col_b: int) -> Tuple[str, dict]:
        """Placeholder: table column swapping is not implemented yet."""
        # Too complex for quick implementation, just return doc for now
        # Would parse tables and swap
        return doc, schema

    def is_document_parseable(self, original_doc: str, modified_doc: str) -> bool:
        """Return False when too many "key: value" patterns were destroyed.

        Counts regex matches of a loose "key: value" shape in both texts and
        requires the modified text to retain at least 60% of the original
        count. A text with no such patterns is always considered parseable.
        """
        key_value = r"[\w\s]+:\s*[\w\s₹/\-\.,]+"
        # Fail if more than 40% of key:value patterns from original are unrecognizable
        orig_matches = len(re.findall(key_value, original_doc))
        mod_matches = len(re.findall(key_value, modified_doc))
        if orig_matches == 0:
            return True  # Not applicable
        return (mod_matches / orig_matches) >= 0.6

    def validate_budget(self, edits: list, budget: int) -> bool:
        """Return True when the summed ``token_cost`` of ``edits`` fits ``budget``."""
        return sum(e.token_cost for e in edits) <= budget

env/models.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from enum import Enum
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field, model_validator
class EditType(str, Enum):
    """Closed set of edit kinds an adversary may request.

    Members are also ``str`` instances whose value equals the member name, so
    they compare equal to (and can be built from) the plain string.
    """

    # Schema-level edits
    rename_field = "rename_field"
    swap_type = "swap_type"
    # Text-level edits
    inject_distractor = "inject_distractor"
    mutate_format = "mutate_format"
    # Edits touching both text and schema
    add_required_field = "add_required_field"
    # Noise / layout edits
    ocr_noise = "ocr_noise"
    swap_columns = "swap_columns"
# Canonical token cost of each edit kind, keyed by the edit type's string value.
EDIT_TOKEN_COSTS = dict(
    rename_field=10,
    swap_type=15,
    inject_distractor=25,
    mutate_format=10,
    add_required_field=20,
    ocr_noise=5,
    swap_columns=15,
)
class AdversaryEdit(BaseModel):
    """One adversary action: an edit kind, its parameters and its token cost."""

    # Which edit to perform.
    edit_type: EditType
    # Edit-specific arguments; the expected keys depend on edit_type.
    params: Dict[str, Any]
    # Must be supplied, but is always normalised by validate_cost below.
    token_cost: int

    @model_validator(mode="after")
    def validate_cost(self):
        """Overwrite ``token_cost`` with the canonical cost for ``edit_type``."""
        canonical = EDIT_TOKEN_COSTS[self.edit_type.value]
        if self.token_cost == canonical:
            return self
        self.token_cost = canonical
        return self
class ExtractorAction(BaseModel):
    """Output of the extractor agent for one document."""

    # The extracted field values.
    extracted_json: Dict[str, Any]
    # Optional drift reports, shaped like [{"field": str, "reason": str}].
    drift_detected: Optional[List[Dict[str, str]]] = None
    # Self-reported confidence, constrained to the closed interval [0, 1].
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio>=4.20.0
2	+ pydantic>=2.0.0