Hardikjha09 committed on
Commit
b0854a3
·
0 Parent(s):

Add gradio app

Browse files
Files changed (7) hide show
  1. app.py +171 -0
  2. data/__init__.py +1 -0
  3. data/corpus.py +37 -0
  4. env/__init__.py +1 -0
  5. env/adversary.py +151 -0
  6. env/models.py +45 -0
  7. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Space app for Adversarial Structured-Extraction Arena.
3
+
4
+ This is a self-contained demo bundle intended for Hugging Face Spaces.
5
+ It uses the repo's adversary edit executor and schema-driven extraction UI.
6
+ If `data/corpus.json` is not present (common on Spaces), it falls back to
7
+ two built-in sample documents.
8
+ """
9
+
10
+ import json
11
+ import gradio as gr
12
+
13
+ from env.adversary import AdversaryEditExecutor
14
+ from env.models import AdversaryEdit
15
+
16
+
17
+ def _fallback_docs():
18
+ docs = [
19
+ {
20
+ "text": (
21
+ "TAX INVOICE\n"
22
+ "Supplier: ABC Traders\n"
23
+ "GSTIN: 29ABCDE1234F1Z5\n"
24
+ "Invoice No: INV-1029\n"
25
+ "Invoice Date: 12/03/2025\n"
26
+ "Bill To: Rahul Sharma\n"
27
+ "Phone: 9876543210\n"
28
+ "Total Amount: ₹ 12,450.00\n"
29
+ ),
30
+ "schema": {
31
+ "type": "object",
32
+ "properties": {
33
+ "supplier_name": {"type": "string"},
34
+ "gstin": {"type": "string"},
35
+ "invoice_number": {"type": "string"},
36
+ "invoice_date": {"type": "string"},
37
+ "customer_name": {"type": "string"},
38
+ "phone": {"type": "string"},
39
+ "total_amount": {"type": "number"},
40
+ },
41
+ "required": ["gstin", "invoice_number", "invoice_date", "total_amount"],
42
+ "additionalProperties": False,
43
+ },
44
+ },
45
+ {
46
+ "text": (
47
+ "BANK STATEMENT\n"
48
+ "Account Holder: Priya Verma\n"
49
+ "Account No: 001234567890\n"
50
+ "IFSC: HDFC0001234\n"
51
+ "Period: 01/01/2025 - 31/01/2025\n"
52
+ "Closing Balance: INR 54,210.75\n"
53
+ ),
54
+ "schema": {
55
+ "type": "object",
56
+ "properties": {
57
+ "account_holder": {"type": "string"},
58
+ "account_number": {"type": "string"},
59
+ "ifsc": {"type": "string"},
60
+ "period_start": {"type": "string"},
61
+ "period_end": {"type": "string"},
62
+ "closing_balance": {"type": "number"},
63
+ },
64
+ "required": ["account_number", "ifsc", "closing_balance"],
65
+ "additionalProperties": False,
66
+ },
67
+ },
68
+ ]
69
+ return docs
70
+
71
+
72
+ def _load_corpus_if_available():
73
+ try:
74
+ from data.corpus import DocumentCorpus
75
+
76
+ corpus = DocumentCorpus(split="holdout")
77
+ # Smoke check (can still be empty)
78
+ _ = corpus.sample()
79
+ return corpus
80
+ except Exception:
81
+ return None
82
+
83
+
84
# Module-level state shared by the UI callbacks defined below.
executor = AdversaryEditExecutor()
# None when the corpus could not be loaded; callbacks then use fallback_docs.
corpus = _load_corpus_if_available()
fallback_docs = _fallback_docs()
# Running counter that load_random_doc advances to cycle through fallback_docs.
fallback_idx = 0
88
+
89
+
90
def load_random_doc():
    """Pick a document and return ``(text, pretty-printed schema JSON)``.

    When a corpus is loaded, a document is sampled from it. Otherwise the
    built-in fallback documents are served in order, wrapping around, and
    the module-level ``fallback_idx`` counter is advanced by one.
    """
    global fallback_idx
    if corpus is None:
        chosen = fallback_docs[fallback_idx % len(fallback_docs)]
        fallback_idx += 1
    else:
        chosen = corpus.sample()
    return chosen["text"], json.dumps(chosen["schema"], indent=2)
99
+
100
+
101
def apply_perturbation(doc_text, schema_text, edit_intensity):
    """Apply a single ``ocr_noise`` edit to the document.

    Args:
        doc_text: Document text to perturb.
        schema_text: JSON text of the target schema. Text that is not valid
            JSON, or that parses to anything other than a JSON object, is
            treated as an empty schema.
        edit_intensity: Noise intensity; passed through ``float()``.

    Returns:
        A ``(perturbed_text, schema_json)`` tuple, where ``schema_json`` is
        the schema returned by the executor, pretty-printed with indent 2.
    """
    try:
        schema = json.loads(schema_text)
    except (TypeError, ValueError, RecursionError):
        schema = {}
    if not isinstance(schema, dict):
        # Valid JSON such as `null`, `3` or `"x"` is not a schema object and
        # would crash the executor, which copies and inspects it as a dict.
        schema = {}

    edit = AdversaryEdit(
        edit_type="ocr_noise",
        params={"intensity": float(edit_intensity)},
        token_cost=10,  # corrected by model validator
    )
    mod_doc, mod_schema = executor.apply_edits(doc_text, schema, [edit])

    return mod_doc, json.dumps(mod_schema, indent=2)
115
+
116
+
117
def extract_data(doc_text, schema_text):
    """Placeholder extractor: emit a dummy value for every schema property.

    The document text is not inspected. For a real demo, replace this with
    a call to a hosted extractor model or a model running in the Space.

    Args:
        doc_text: Document text (currently unused).
        schema_text: JSON text of the target schema.

    Returns:
        Pretty-printed JSON mapping each key under the schema's
        ``"properties"`` object to ``"[Extracted Value]"``. Returns ``"{}"``
        when there are no usable properties or anything goes wrong.
    """
    try:
        schema = json.loads(schema_text)
        properties = schema.get("properties") if isinstance(schema, dict) else None
        if not isinstance(properties, dict):
            properties = {}
        placeholder = dict.fromkeys(properties, "[Extracted Value]")
        return json.dumps(placeholder, indent=2)
    except Exception:
        return "{}"
134
+
135
+
136
# Three-column demo layout: source document, adversary controls, extractor output.
with gr.Blocks(title="Adversarial Extraction Arena") as demo:
    gr.Markdown("# Adversarial Structured-Extraction Arena")
    gr.Markdown("Agent A perturbs documents. Agent E extracts structured data despite the noise.")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Original Document")
            doc_input = gr.TextArea(label="Document Text", lines=10)
            schema_input = gr.TextArea(label="Target Schema", lines=10)
            load_btn = gr.Button("Load Random Document")

        with gr.Column():
            gr.Markdown("### Adversary (Agent A)")
            intensity_slider = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.2, step=0.1, label="Noise Intensity"
            )
            perturb_btn = gr.Button("Apply Perturbation")
            mod_doc_output = gr.TextArea(label="Perturbed Document", lines=10)

        with gr.Column():
            gr.Markdown("### Extractor (Agent E)")
            extract_btn = gr.Button("Run Extractor")
            extracted_output = gr.TextArea(label="Extracted JSON", lines=10)

    # Loading fills both the document box and the schema box.
    load_btn.click(fn=load_random_doc, inputs=[], outputs=[doc_input, schema_input])
    # Perturbing writes the (possibly modified) schema back into the schema box.
    perturb_btn.click(
        fn=apply_perturbation,
        inputs=[doc_input, schema_input, intensity_slider],
        outputs=[mod_doc_output, schema_input],
    )
    # The extractor reads the perturbed document, not the original one.
    extract_btn.click(fn=extract_data, inputs=[mod_doc_output, schema_input], outputs=[extracted_output])


if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
171
+
data/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __all__ = []
data/corpus.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ from typing import Any, Dict, List
4
+
5
+
6
class DocumentCorpus:
    """Document corpus read from a JSON file and restricted to one split."""

    def __init__(self, data_file: str = "data/corpus.json", split: str = "train"):
        """Remember the file and split, then load the matching documents."""
        self.data_file = data_file
        self.split = split
        self.documents = self._load_data()

    def _load_data(self) -> List[Dict[str, Any]]:
        """Read the corpus file and keep the documents of ``self.split``.

        Documents without a ``"split"`` key count as ``"train"``. A warning
        is printed when nothing matches.

        Raises:
            FileNotFoundError: If ``self.data_file`` does not exist.
        """
        try:
            with open(self.data_file, "r", encoding="utf-8") as handle:
                records = json.load(handle)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Corpus file {self.data_file} not found. Please run 'python data/generator.py' first."
            )

        selected = []
        for record in records:
            if record.get("split", "train") == self.split:
                selected.append(record)
        if not selected:
            print(f"Warning: No documents found for split '{self.split}'.")
        return selected

    def sample(self) -> Dict[str, Any]:
        """Return one randomly chosen document.

        Raises:
            ValueError: If this split holds no documents.
        """
        if self.documents:
            return random.choice(self.documents)
        raise ValueError(f"Corpus is empty for split {self.split}.")

    def get_all(self) -> List[Dict[str, Any]]:
        """Return the list of every document in this split (useful for eval)."""
        return self.documents
37
+
env/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __all__ = []
env/adversary.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import random
3
+ from typing import Tuple
4
+
5
# OCR confusion map — realistic character substitutions.
# Keys are the text to replace, values the look-alike replacement.
# NOTE(review): the multi-character keys ("rn", "vv", "cl") cannot be hit by a
# per-character lookup such as the one in AdversaryEditExecutor.ocr_noise.
OCR_MAP = {
    "0": "O",
    "O": "0",
    "1": "l",
    "l": "1",
    "I": "1",
    "5": "S",
    "S": "5",
    "8": "B",
    "B": "8",
    "6": "G",
    "rn": "m",
    "vv": "w",
    "cl": "d",
}
21
+
22
+
23
class AdversaryEditExecutor:
    """Applies structured edit programs to document text and schema.

    ``apply_edits`` is the entry point and works on a private deep copy of
    the schema. The individual edit methods update the schema object they
    are handed in place and return it.
    """

    # Loose "key: value" matcher used by is_document_parseable.
    _KEY_VALUE_RE = re.compile(r"[\w\s]+:\s*[\w\s₹/\-\.,]+")
    # dd/mm/yyyy style dates.
    _SLASH_DATE_RE = re.compile(r"(\d{2})/(\d{2})/(\d{4})")
    # A standalone run of exactly ten digits, captured as 3 + 3 + 4.
    _PHONE_RE = re.compile(r"(?<!\d)(\d{3})(\d{3})(\d{4})(?!\d)")

    def apply_edits(self, document: str, schema: dict, edits: list) -> Tuple[str, dict]:
        """Apply all edits in sequence. Returns (modified_doc, modified_schema).

        The caller's schema is never modified: edits such as ``swap_type``
        write into nested property dicts, so a deep copy is required.
        """
        import copy  # local import keeps this block self-contained

        doc, sch = document, copy.deepcopy(schema)
        for edit in edits:
            doc, sch = self.apply_single_edit(doc, sch, edit)
        return doc, sch

    def apply_single_edit(self, doc: str, schema: dict, edit) -> Tuple[str, dict]:
        """Dispatch one edit by its type; unknown or failing edits are no-ops.

        ``edit`` must expose ``edit_type`` (an enum member or a plain string)
        and ``params`` (a mapping of edit-specific arguments).
        """
        # Accept both enum members and bare strings for edit_type.
        t = getattr(edit.edit_type, "value", edit.edit_type)
        p = edit.params
        try:
            if t == "rename_field":
                return self.rename_field(doc, schema, p.get("old_name", ""), p.get("new_name", ""))
            elif t == "swap_type":
                return self.swap_type(doc, schema, p.get("field", ""), p.get("new_type", "string"))
            elif t == "inject_distractor":
                return self.inject_distractor(doc, schema, p.get("content", ""))
            elif t == "mutate_format":
                return self.mutate_format(doc, schema, p.get("field", ""), p.get("pattern", ""))
            elif t == "add_required_field":
                return self.add_required_field(doc, schema, p.get("name", ""), p.get("value", ""))
            elif t == "ocr_noise":
                return self.ocr_noise(doc, schema, float(p.get("intensity", 0.3)))
            elif t == "swap_columns":
                return self.swap_columns(doc, schema, p.get("col_a", 0), p.get("col_b", 1))
        except Exception:
            # Deliberate best-effort: a malformed adversary action must not
            # abort the whole edit program, so it is skipped.
            pass
        return doc, schema

    def rename_field(self, doc: str, schema: dict, old_name: str, new_name: str) -> Tuple[str, dict]:
        """Rename a field in the document text and in the schema.

        Whole-word, case-insensitive occurrences of ``old_name`` in the text
        are replaced. In the schema, the property key and the ``required``
        entry are renamed (both move to the end of their container).
        """
        if not old_name or not new_name:
            return doc, schema
        pattern = re.compile(r"\b" + re.escape(old_name) + r"\b", re.IGNORECASE)
        # A callable replacement inserts new_name literally; passing the
        # string itself would treat backslashes as regex template escapes.
        new_doc = pattern.sub(lambda _match: new_name, doc)

        new_schema = schema
        if "properties" in new_schema and old_name in new_schema["properties"]:
            prop = new_schema["properties"].pop(old_name)
            new_schema["properties"][new_name] = prop

        if "required" in new_schema and old_name in new_schema["required"]:
            new_schema["required"].remove(old_name)
            new_schema["required"].append(new_name)

        return new_doc, new_schema

    def swap_type(self, doc: str, schema: dict, field: str, new_type: str) -> Tuple[str, dict]:
        """Set the declared type of ``field`` to ``new_type`` if the field exists."""
        if "properties" in schema and field in schema["properties"]:
            schema["properties"][field]["type"] = new_type
        return doc, schema

    def ocr_noise(self, doc: str, schema: dict, intensity: float) -> Tuple[str, dict]:
        """Randomly corrupt characters to imitate OCR errors.

        ``intensity`` is clamped to [0, 1] and used as the per-character
        probability of attempting a corruption. A selected character found in
        ``OCR_MAP`` is substituted; a selected digit not in the map is, with
        probability 0.1, either followed by a space or dropped.

        NOTE(review): the lookup is per character, so ``OCR_MAP`` keys longer
        than one character never match here.
        """
        intensity = max(0.0, min(1.0, intensity))
        chars = list(doc)
        for i, char in enumerate(chars):
            if random.random() < intensity:
                if char in OCR_MAP:
                    chars[i] = OCR_MAP[char]
                elif char.isdigit() and random.random() < 0.1:
                    chars[i] = (char + " ") if random.random() > 0.5 else ""
        return "".join(chars), schema

    def mutate_format(self, doc: str, schema: dict, field: str, pattern: str) -> Tuple[str, dict]:
        """Rewrite value formats in the document text according to ``pattern``.

        Supported patterns: ``date_dmy_to_mdy``, ``date_dmy_to_iso``,
        ``currency_symbol_to_text`` and ``phone_compact_to_dashed``. Any other
        pattern leaves the text unchanged. ``field`` is currently unused.
        """
        if pattern == "date_dmy_to_mdy":
            doc = self._SLASH_DATE_RE.sub(r"\2-\1-\3", doc)
        elif pattern == "date_dmy_to_iso":
            doc = self._SLASH_DATE_RE.sub(r"\3-\2-\1", doc)
        elif pattern == "currency_symbol_to_text":
            doc = doc.replace("₹", "INR ")
        elif pattern == "phone_compact_to_dashed":
            # Split via capture groups; slicing the template string itself
            # produces an invalid replacement rather than slicing the digits.
            doc = self._PHONE_RE.sub(r"\1-\2-\3", doc)
        return doc, schema

    def inject_distractor(self, doc: str, schema: dict, content: str) -> Tuple[str, dict]:
        """Insert a distractor line near (but not at) the end of the document."""
        if not content:
            content = "Random distractor line item that means nothing."
        lines = doc.split("\n")
        # Insert near the end but not at the very end
        idx = max(0, len(lines) - 2 - random.randint(0, 3))
        lines.insert(idx, content)
        return "\n".join(lines), schema

    def add_required_field(self, doc: str, schema: dict, name: str, value: str) -> Tuple[str, dict]:
        """Append ``name: value`` to the text and require ``name`` in the schema."""
        if not name:
            return doc, schema
        doc += f"\n{name}: {value}"
        if "properties" in schema:
            schema["properties"][name] = {"type": "string"}
        # Guard against listing the same field twice in "required".
        if "required" in schema and name not in schema["required"]:
            schema["required"].append(name)
        return doc, schema

    def swap_columns(self, doc: str, schema: dict, col_a: int, col_b: int) -> Tuple[str, dict]:
        """Placeholder: table column swapping is not implemented; returns inputs unchanged."""
        return doc, schema

    def is_document_parseable(self, original_doc: str, modified_doc: str) -> bool:
        """Check that the modified text kept at least 60% of key:value matches.

        Returns True when the original has no key:value matches at all.
        """
        orig_matches = len(self._KEY_VALUE_RE.findall(original_doc))
        mod_matches = len(self._KEY_VALUE_RE.findall(modified_doc))

        if orig_matches == 0:
            return True  # Not applicable

        return (mod_matches / orig_matches) >= 0.6

    def validate_budget(self, edits: list, budget: int) -> bool:
        """Return True when the summed ``token_cost`` of ``edits`` fits ``budget``."""
        return sum(e.token_cost for e in edits) <= budget
151
+
env/models.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from pydantic import BaseModel, Field, model_validator
5
+
6
+
7
class EditType(str, Enum):
    """Names of the edit operations an adversary edit can request.

    Members are also ``str`` instances, and each value matches a key of
    ``EDIT_TOKEN_COSTS``.
    """

    rename_field = "rename_field"
    swap_type = "swap_type"
    inject_distractor = "inject_distractor"
    mutate_format = "mutate_format"
    add_required_field = "add_required_field"
    ocr_noise = "ocr_noise"
    swap_columns = "swap_columns"
15
+
16
+
17
# Token cost charged for each edit type, keyed by the EditType value.
# AdversaryEdit.validate_cost overwrites any other token_cost with this value.
EDIT_TOKEN_COSTS = {
    "rename_field": 10,
    "swap_type": 15,
    "inject_distractor": 25,
    "mutate_format": 10,
    "add_required_field": 20,
    "ocr_noise": 5,
    "swap_columns": 15,
}
26
+
27
+
28
class AdversaryEdit(BaseModel):
    """One adversary edit: its type, its arguments and its token cost."""

    # Which edit operation to perform.
    edit_type: EditType
    # Edit-specific arguments; the expected keys depend on edit_type.
    params: Dict[str, Any]
    # Cost of the edit; normalised by validate_cost after construction.
    token_cost: int

    @model_validator(mode="after")
    def validate_cost(self):
        """Force ``token_cost`` to the value listed in ``EDIT_TOKEN_COSTS``.

        A mismatching cost is corrected silently rather than rejected.
        """
        expected = EDIT_TOKEN_COSTS[self.edit_type.value]
        if self.token_cost != expected:
            self.token_cost = expected
        return self
39
+
40
+
41
class ExtractorAction(BaseModel):
    """Output of the extractor: extracted fields plus optional drift notes."""

    # The extracted field values.
    extracted_json: Dict[str, Any]
    # Optional drift reports, each shaped like {"field": str, "reason": str}.
    drift_detected: Optional[List[Dict[str, str]]] = None  # [{"field": str, "reason": str}]
    # Confidence, constrained to [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(ge=0.0, le=1.0, default=0.5)
45
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=4.20.0
2
+ pydantic>=2.0.0