kenmandal committed on
Commit
082d661
·
verified ·
1 Parent(s): ae053b7

Deploy latest: ERP DocIQ NLQ chatbot + reasoning models (MiniCPM3-4B/Command R7B) + ERP fine-tuning + extreme OCR docs

Browse files
Files changed (46) hide show
  1. .gitattributes +3 -0
  2. README.md +30 -16
  3. backend/app/auth.py +4 -5
  4. backend/app/config.py +16 -0
  5. backend/app/erp/__init__.py +11 -0
  6. backend/app/erp/chat.py +347 -0
  7. backend/app/erp/data.py +275 -0
  8. backend/app/erp/finetune.py +259 -0
  9. backend/app/extraction_heuristics.py +6 -1
  10. backend/app/main.py +92 -0
  11. backend/app/models_registry.py +102 -0
  12. backend/app/ocr/quality.py +207 -0
  13. backend/app/pipeline/nodes.py +27 -3
  14. backend/app/prompts/__init__.py +10 -2
  15. backend/app/providers/blackforest.py +70 -0
  16. backend/evals/datasets/extreme_contract_fax.gt.json +20 -0
  17. backend/evals/datasets/extreme_contract_fax.png +3 -0
  18. backend/evals/datasets/extreme_contract_fax.txt +27 -0
  19. backend/evals/datasets/extreme_po_collage.gt.json +40 -0
  20. backend/evals/datasets/extreme_po_collage.png +3 -0
  21. backend/evals/datasets/extreme_po_collage.txt +20 -0
  22. backend/evals/datasets/extreme_receipt_photo.gt.json +36 -0
  23. backend/evals/datasets/extreme_receipt_photo.png +3 -0
  24. backend/evals/datasets/extreme_receipt_photo.txt +17 -0
  25. backend/evals/ocr_backend_report.json +135 -0
  26. backend/evals/ocr_quality_report.json +313 -0
  27. backend/evals/report.json +980 -0
  28. backend/evals/run.py +147 -0
  29. backend/evals/scorers.py +166 -0
  30. backend/finetune/erp_finetune_report.json +106 -0
  31. backend/finetune/erp_sft.jsonl +120 -0
  32. backend/finetune/runs/hf_20260612T212346.json +120 -0
  33. backend/finetune/runs/local_20260612T212257.json +108 -0
  34. backend/finetune/runs/local_20260612T212332.json +106 -0
  35. backend/finetune/runs/local_20260612T212357.json +108 -0
  36. backend/finetune/runs/local_20260612T212413.json +106 -0
  37. gradio_app.py +44 -0
  38. results/erp_finetune_report.json +106 -0
  39. results/erp_sft.jsonl +120 -0
  40. results/ocr_quality_report.json +313 -0
  41. scripts/finetune_erp.py +153 -0
  42. scripts/generate_extreme_docs.py +421 -0
  43. scripts/ocr_quality.py +67 -0
  44. scripts/ocr_smoke.py +54 -0
  45. scripts/run_dev.sh +35 -0
  46. scripts/test_ocr.py +57 -0
.gitattributes CHANGED
@@ -34,3 +34,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  backend/evals/datasets/complex_invoice_messy.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  backend/evals/datasets/complex_invoice_messy.png filter=lfs diff=lfs merge=lfs -text
37
+ backend/evals/datasets/extreme_contract_fax.png filter=lfs diff=lfs merge=lfs -text
38
+ backend/evals/datasets/extreme_po_collage.png filter=lfs diff=lfs merge=lfs -text
39
+ backend/evals/datasets/extreme_receipt_photo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -8,28 +8,42 @@ sdk_version: 6.9.0
8
  app_file: gradio_app.py
9
  pinned: false
10
  license: mit
11
- short_description: Agentic OCR + IDP for retail docs (MiniCPM-V 8B, Tesseract)
12
  ---
13
 
14
- # ERP-DocIQ — Agentic OCR + Document Intelligence for Retail Back-Office
15
 
16
- Open-source **Intelligent Document Processing** for orders, receipts, invoices, contracts and
17
- subscription memos — a UiPath-style IDP rebuilt on **small models** and pluggable OCR.
18
 
19
- - **Pluggable OCR backends:** **MiniCPM-V-4.6** (≤32B small VLM, via the OpenBMB/ModelBest API),
20
- **Tesseract** (real OCR, via `packages.txt`), and an offline sidecar fallback auto-fallback chain.
21
- - **Hybrid pipeline:** OCR → classify → extract → normalize → enrich (RAG) → validate → post/HITL.
22
- - **Real vector RAG** (persistent), business KPIs, and a built-in **OCR self-test** that runs each
23
- backend on real scanned images.
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  ## Use it
26
- 1. Open the **Process a document** tab.
27
- 2. Pick an **OCR backend** (`auto`, `minicpm`, `tesseract`, …) and a sample, or upload your own PDF/PNG.
28
- 3. See the extracted multi-layer fields + live KPIs. The **Search (RAG)** tab does semantic search.
 
 
 
 
29
 
30
  ## Configure (Space → Settings → Variables and secrets)
31
  - `MINICPM_BASE_URL=https://api.modelbest.cn/v1`, `MINICPM_API_KEY=…`, `MINICPM_MODEL=MiniCPM-V-4.6-Instruct`
32
- - Tesseract works out of the box (installed via `packages.txt`).
33
-
34
- Built for the **Build Small Hackathon** (small models, on Gradio). Uses MiniCPM-V (~8B) as the
35
- load-bearing OCR model.
 
8
  app_file: gradio_app.py
9
  pinned: false
10
  license: mit
11
+ short_description: OCR/IDP + ERP NLQ chatbot on small models (MiniCPM)
12
  ---
13
 
14
+ # ERP-DocIQ — Agentic Document Intelligence + ERP NLQ, on small models
15
 
16
+ An open-source, UiPath-style back-office automation stack built entirely on **small models
17
+ (≤32B)** for the **Build Small Hackathon**. Three things, one app:
18
 
19
+ 1. **Read any document (OCR + IDP).** Hybrid pipeline (OCR → classify → extract → normalize →
20
+ enrich/RAG → validate → post/HITL) reads orders, receipts, invoices, contracts and complex
21
+ forms — even messy scans — with **OpenBMB MiniCPM-V-4.6** (≤32B VLM) and **Tesseract**.
22
+ 2. **Ask your ERP reports (ERP DocIQ).** A chatbot over a simulated retail ERP knowledgebase
23
+ (vendors · POs · invoices · GL · inventory · returns). Natural-language **NLQ → SQL**,
24
+ analytics, summaries and **"why"** reasoning — every figure comes from **real SQL over the
25
+ data**; **OpenBMB MiniCPM3-4B** only phrases the answer, it never invents numbers.
26
+ 3. **Adapt to your domain (fine-tuning).** A LoRA recipe fine-tunes **MiniCPM3-4B** on an ERP
27
+ instruction dataset; an offline CPU demo trains the NLQ-routing head on the same data with a
28
+ real before→after gain (**8.3% → 91.7%**). See `results/erp_finetune_report.json`.
29
+
30
+ ## Small models (≤32B) — by job
31
+ | Lab | Models | Role |
32
+ |---|---|---|
33
+ | **OpenBMB** | MiniCPM-V-4.6 · MiniCPM-o-4.5 · **MiniCPM3-4B** | OCR/VLM · **reasoning · NLQ→SQL · fine-tune target** |
34
+ | **Cohere** | Aya-Vision-8B/32B · **Command R7B** | OCR/VQA · **RAG · NLQ · reasoning** |
35
+ | **Black Forest Labs** | FLUX.1 [dev]/[schnell] | image generation → synthetic test docs (not OCR) |
36
 
37
  ## Use it
38
+ - **Process a document** — pick an OCR backend (`auto`, `minicpm`, `tesseract`) + a sample (or upload), see multi-layer extracted fields + KPIs.
39
+ - **ERP DocIQ (chat)** — ask "Why did spend rise in Q2 2026?", "Top vendors by spend", "late-payment rate"; see the grounded answer, SQL, and the fine-tuning panel.
40
+ - **Search (RAG)** — semantic vendor-master retrieval. **Web Automation** — multi-step browser flow.
41
+
42
+ ## Published results (`results/`)
43
+ - `ocr_quality_report.json` — OCR CER/WER + field accuracy (MiniCPM-V **CER 2.6%** vs Tesseract 14.7%).
44
+ - `erp_finetune_report.json` + `erp_sft.jsonl` — fine-tune metrics + the instruction dataset.
45
 
46
  ## Configure (Space → Settings → Variables and secrets)
47
  - `MINICPM_BASE_URL=https://api.modelbest.cn/v1`, `MINICPM_API_KEY=…`, `MINICPM_MODEL=MiniCPM-V-4.6-Instruct`
48
+ - Tesseract is installed via `packages.txt`. **Without a key the app still runs** — ERP DocIQ uses
49
+ its deterministic SQL engine and OCR falls back to the sidecar, so every tab works offline.
 
 
backend/app/auth.py CHANGED
@@ -36,10 +36,9 @@ def make_auth_middleware(user: str, pwd: str):
36
  return await call_next(request)
37
  if _check(request.headers.get("authorization"), user, pwd):
38
  return await call_next(request)
39
- return JSONResponse(
40
- {"detail": "Authentication required"},
41
- status_code=401,
42
- headers={"WWW-Authenticate": 'Basic realm="Aperture"'},
43
- )
44
 
45
  return auth_middleware
 
36
  return await call_next(request)
37
  if _check(request.headers.get("authorization"), user, pwd):
38
  return await call_next(request)
39
+ # NOTE: deliberately NO `WWW-Authenticate: Basic` header — that triggers the
40
+ # browser's native credential popup. The React SPA handles 401 itself and
41
+ # shows its own login screen, so we return a plain JSON 401.
42
+ return JSONResponse({"detail": "Authentication required"}, status_code=401)
 
43
 
44
  return auth_middleware
backend/app/config.py CHANGED
@@ -131,6 +131,17 @@ class Settings:
131
  self.llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY") or None
132
  self.llamaparse_result_type = os.getenv("LLAMAPARSE_RESULT_TYPE", "markdown")
133
 
 
 
 
 
 
 
 
 
 
 
 
134
  # --- databases --------------------------------------------------------
135
  appdb = os.getenv("APP_DB_PATH")
136
  self.app_db_path = (
@@ -142,6 +153,11 @@ class Settings:
142
  (Path(ragdb) if Path(ragdb).is_absolute() else BACKEND_DIR / ragdb)
143
  if ragdb else self.writable_dir / "rag.db"
144
  )
 
 
 
 
 
145
 
146
  # --- browser ---
147
  self.playwright_headless = _bool("PLAYWRIGHT_HEADLESS", True)
 
131
  self.llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY") or None
132
  self.llamaparse_result_type = os.getenv("LLAMAPARSE_RESULT_TYPE", "markdown")
133
 
134
+ # --- model labs (all ≤32B params — "small models") --------------------
135
+ # OpenBMB (MiniCPM family) — text/vision reasoning + OCR (via MINICPM_* above)
136
+ self.openbmb_model = os.getenv("OPENBMB_MODEL", self.minicpm_model)
137
+ # OpenBMB MiniCPM3-4B — text reasoning / NLQ→SQL / summarization (ERP DocIQ + fine-tune target)
138
+ self.openbmb_reasoner_model = os.getenv("OPENBMB_REASONER_MODEL", "MiniCPM3-4B")
139
+ # Black Forest Labs (FLUX) — image GENERATION for synthetic test documents
140
+ self.bfl_api_key = os.getenv("BFL_API_KEY") or None
141
+ self.bfl_model = os.getenv("BFL_MODEL", "flux-dev") # api: flux-dev | flux-pro-1.1 | flux-schnell
142
+ # Cohere hosted API (in addition to the local HF Aya-Vision backend above)
143
+ self.cohere_api_key = os.getenv("COHERE_API_KEY") or None
144
+
145
  # --- databases --------------------------------------------------------
146
  appdb = os.getenv("APP_DB_PATH")
147
  self.app_db_path = (
 
153
  (Path(ragdb) if Path(ragdb).is_absolute() else BACKEND_DIR / ragdb)
154
  if ragdb else self.writable_dir / "rag.db"
155
  )
156
+ erpdb = os.getenv("ERP_DB_PATH")
157
+ self.erp_db_path = (
158
+ (Path(erpdb) if Path(erpdb).is_absolute() else BACKEND_DIR / erpdb)
159
+ if erpdb else self.writable_dir / "erp.db"
160
+ )
161
 
162
  # --- browser ---
163
  self.playwright_headless = _bool("PLAYWRIGHT_HEADLESS", True)
backend/app/erp/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simulated ERP knowledgebase + the ERP DocIQ chatbot (NLQ, summary, reasons, analytics).
2
+
3
+ A deterministic, offline-first stand-in for a real retail ERP (SAP/Oracle/NetSuite).
4
+ `data.py` seeds a realistic SQLite warehouse; `chat.py` answers natural-language
5
+ questions over it (text-to-SQL NLQ + analytics + summarization), routed to a small
6
+ reasoning model (OpenBMB MiniCPM3-4B) with a deterministic offline fallback.
7
+ """
8
+ from .data import ErpWarehouse, ERP_SCHEMA_DOC, get_warehouse
9
+ from .chat import ErpChat, answer_question
10
+
11
+ __all__ = ["ErpWarehouse", "ERP_SCHEMA_DOC", "get_warehouse", "ErpChat", "answer_question"]
backend/app/erp/chat.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ERP DocIQ chatbot — ask questions over the simulated ERP knowledgebase.
2
+
3
+ Four capabilities, all grounded in real data from the warehouse (no hallucinated
4
+ figures):
5
+ • NLQ — natural-language → SQL → rows (text-to-SQL)
6
+ • analytics — aggregations / rankings / rates
7
+ • summary — narrative roll-up of a report
8
+ • reasons — "why" questions, explained from the underlying data
9
+
10
+ Design for offline-first integrity: a deterministic intent+SQL library always
11
+ produces the correct numbers (real SQL over the warehouse). When a small reasoning
12
+ model (OpenBMB MiniCPM3-4B, routed via ModelRouter) is available, it is used to
13
+ (a) generate SQL for questions outside the deterministic library, and (b) phrase
14
+ the final answer / explanation — always over the real computed rows, so the model
15
+ narrates facts rather than inventing them.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ import time
21
+ from typing import Callable, Optional
22
+
23
+ from ..observability import log_event
24
+ from .data import ERP_SCHEMA_DOC, EXAMPLE_QUESTIONS, get_warehouse
25
+
26
+ # ── deterministic intent → (sql, intent, narrator) library ────────────────────
27
+ # Each entry: keywords (any-match scoring), a SQL builder, an intent label, and a
28
+ # narrator that turns the result rows into a baseline natural-language answer.
29
+
30
+
31
+ def _fmt_usd(x) -> str:
32
+ try:
33
+ return f"${float(x):,.0f}"
34
+ except (TypeError, ValueError):
35
+ return str(x)
36
+
37
+
38
+ def _q_spend_by_month(wh):
39
+ sql = ("SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries "
40
+ "GROUP BY period ORDER BY period")
41
+ cols, rows = wh.query(sql)
42
+ total = sum(r[1] for r in rows)
43
+ peak = max(rows, key=lambda r: r[1]) if rows else None
44
+ ans = (f"Total invoiced spend across {len(rows)} months is {_fmt_usd(total)}. "
45
+ f"The peak month was {peak[0]} at {_fmt_usd(peak[1])}." if peak else "No spend recorded.")
46
+ return sql, cols, rows, ans
47
+
48
+
49
+ def _q_top_vendors(wh):
50
+ sql = ("SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices "
51
+ "FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id "
52
+ "GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5")
53
+ cols, rows = wh.query(sql)
54
+ lead = rows[0] if rows else None
55
+ ans = (f"Top vendor by spend is {lead[0]} at {_fmt_usd(lead[1])} across {lead[2]} invoices. "
56
+ f"The top 5 account for {_fmt_usd(sum(r[1] for r in rows))}." if lead else "No vendors.")
57
+ return sql, cols, rows, ans
58
+
59
+
60
+ def _q_late_vendors(wh):
61
+ sql = ("SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) "
62
+ "THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices "
63
+ "FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id "
64
+ "WHERE i.status='paid' GROUP BY v.vendor_id "
65
+ "HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5")
66
+ cols, rows = wh.query(sql)
67
+ lead = rows[0] if rows else None
68
+ ans = (f"{lead[0]} had the most late payments ({lead[1]} of {lead[2]} paid invoices). "
69
+ "Late = paid after the vendor's net terms." if lead else "No late payments found.")
70
+ return sql, cols, rows, ans
71
+
72
+
73
+ def _q_late_rate(wh):
74
+ sql = ("SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) "
75
+ "THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices "
76
+ "FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'")
77
+ cols, rows = wh.query(sql)
78
+ r = rows[0] if rows else None
79
+ ans = (f"The overall late-payment rate is {r[0]}% across {r[1]} paid invoices."
80
+ if r else "No paid invoices.")
81
+ return sql, cols, rows, ans
82
+
83
+
84
+ def _q_spend_by_category(wh):
85
+ sql = ("SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l "
86
+ "JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id "
87
+ "WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC")
88
+ cols, rows = wh.query(sql)
89
+ lead = rows[0] if rows else None
90
+ ans = (f"{lead[0]} is the largest category at {_fmt_usd(lead[1])}, "
91
+ f"out of {_fmt_usd(sum(r[1] for r in rows))} total." if lead else "No spend.")
92
+ return sql, cols, rows, ans
93
+
94
+
95
+ def _q_why_q2(wh):
96
+ sql = ("SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries "
97
+ "WHERE period >= '2026-04' AND period <= '2026-06' "
98
+ "GROUP BY period, account ORDER BY period, spend DESC")
99
+ cols, rows = wh.query(sql)
100
+ # compare fixtures share vs rest
101
+ fx = sum(r[2] for r in rows if "Fit-Out" in (r[1] or ""))
102
+ tot = sum(r[2] for r in rows)
103
+ share = round(100 * fx / tot, 1) if tot else 0
104
+ ans = (f"Q2 2026 spend was {_fmt_usd(tot)}, of which the Store-Fit-Out (Fixtures) "
105
+ f"account was {_fmt_usd(fx)} — {share}% of the quarter. The rise is driven by a "
106
+ "store-remodel program: more Fixtures POs at higher quantities.")
107
+ return sql, cols, rows, ans
108
+
109
+
110
+ def _q_below_reorder(wh):
111
+ sql = ("SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i "
112
+ "JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point "
113
+ "ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15")
114
+ cols, rows = wh.query(sql)
115
+ ans = (f"{len(rows)} SKU/region positions are below reorder point and need replenishment."
116
+ if rows else "All inventory is above reorder point.")
117
+ return sql, cols, rows, ans
118
+
119
+
120
+ def _q_open_invoices(wh):
121
+ sql = ("SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'")
122
+ cols, rows = wh.query(sql)
123
+ r = rows[0] if rows else None
124
+ ans = (f"There is {_fmt_usd(r[0])} in open (unpaid) invoices across {r[1]} invoices."
125
+ if r and r[0] else "No open invoices.")
126
+ return sql, cols, rows, ans
127
+
128
+
129
+ def _q_return_reasons(wh):
130
+ sql = ("SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds "
131
+ "FROM returns GROUP BY reason ORDER BY refunds DESC")
132
+ cols, rows = wh.query(sql)
133
+ lead = rows[0] if rows else None
134
+ ans = (f"'{lead[0]}' drives the most refunds at {_fmt_usd(lead[2])} ({lead[1]} returns)."
135
+ if lead else "No returns.")
136
+ return sql, cols, rows, ans
137
+
138
+
139
+ def _q_ap_health(wh):
140
+ # composite — used by the "summarize AP health" ask
141
+ late = _q_late_rate(wh)[3]
142
+ openv = _q_open_invoices(wh)[3]
143
+ topv = _q_top_vendors(wh)[3]
144
+ sql = ("SELECT (SELECT COUNT(*) FROM invoices) AS invoices, "
145
+ "(SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, "
146
+ "(SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay")
147
+ cols, rows = wh.query(sql)
148
+ ans = f"AP health: {late} {openv} Avg days-to-pay is {rows[0][2]}. {topv}"
149
+ return sql, cols, rows, ans
150
+
151
+
152
+ # (keywords, builder, intent)
153
+ _LIBRARY: list[tuple[list[str], Callable, str]] = [
154
+ (["ap health", "accounts payable health", "summarize ap", "payables health", "ap summary"], _q_ap_health, "summary"),
155
+ (["why", "spike", "rise", "increase", "q2", "remodel", "driver"], _q_why_q2, "reasons"),
156
+ (["spend by month", "monthly spend", "spend per month", "spend by period", "total spend"], _q_spend_by_month, "analytics"),
157
+ (["top vendor", "biggest vendor", "vendor by spend", "largest vendor", "top 5 vendor"], _q_top_vendors, "analytics"),
158
+ (["late", "overdue", "paid late", "slow pay"], _q_late_vendors, "analytics"),
159
+ (["late rate", "late-payment rate", "on-time", "on time rate"], _q_late_rate, "analytics"),
160
+ (["spend by category", "category spend", "by category"], _q_spend_by_category, "analytics"),
161
+ (["reorder", "below reorder", "replenish", "stockout", "low stock"], _q_below_reorder, "analytics"),
162
+ (["open invoice", "unpaid", "outstanding", "open ap"], _q_open_invoices, "analytics"),
163
+ (["return", "refund", "rma"], _q_return_reasons, "analytics"),
164
+ ]
165
+
166
+
167
+ def _match(question: str) -> Optional[tuple[Callable, str, int]]:
168
+ """Pick a deterministic template only when a real keyphrase is present.
169
+
170
+ Score = 3 × (words in matched keyphrases), so a longer, more specific phrase
171
+ ("late-payment rate") beats a bare token ("late"), and a question with *no*
172
+ library keyphrase scores 0 → it falls through to LLM text-to-SQL instead of
173
+ being force-fit to the nearest template.
174
+ """
175
+ q = question.lower()
176
+ best, best_score, best_intent = None, 0, ""
177
+ for keys, fn, intent in _LIBRARY:
178
+ phrase_words = sum(len(k.split()) for k in keys if k in q)
179
+ distinct = {w for k in keys for w in k.split() if len(w) > 3}
180
+ overlap = sum(1 for w in distinct if w in q)
181
+ score = phrase_words * 3 + (1 if overlap >= 2 else 0)
182
+ if score > best_score:
183
+ best, best_score, best_intent = fn, score, intent
184
+ if best and best_score >= 3: # >=3 ⇒ at least one full keyphrase matched
185
+ return best, best_intent, best_score
186
+ return None
187
+
188
+
189
+ def _intent_of(question: str) -> str:
190
+ q = question.lower()
191
+ if any(w in q for w in ("why", "reason", "explain", "driver", "cause")):
192
+ return "reasons"
193
+ if any(w in q for w in ("summar", "overview", "health", "how are")):
194
+ return "summary"
195
+ if any(w in q for w in ("how many", "total", "average", "rate", "top", "rank", "by month",
196
+ "by category", "count", "sum")):
197
+ return "analytics"
198
+ return "nlq"
199
+
200
+
201
+ class ErpChat:
202
+ def __init__(self, settings, router=None, warehouse=None, metrics=None, db=None) -> None:
203
+ self.settings = settings
204
+ self.router = router
205
+ self.wh = warehouse or get_warehouse(settings)
206
+ self.metrics = metrics
207
+ self.db = db
208
+
209
+ # --- public ---------------------------------------------------------------
210
+ def answer(self, question: str, use_llm: bool = True, run_id: str = "erp-chat") -> dict:
211
+ t0 = time.perf_counter()
212
+ question = (question or "").strip()
213
+ if not question:
214
+ return {"answer": "Ask me about ERP spend, vendors, payments, inventory or returns.",
215
+ "intent": "help", "examples": EXAMPLE_QUESTIONS}
216
+
217
+ engine = "deterministic"
218
+ model = None
219
+ cost = 0.0
220
+ sql = cols = rows = None
221
+ baseline = ""
222
+ intent = _intent_of(question)
223
+
224
+ m = _match(question)
225
+ if m:
226
+ fn, intent, _score = m
227
+ try:
228
+ sql, cols, rows, baseline = fn(self.wh)
229
+ except Exception as e:
230
+ log_event("error", "ERP deterministic query failed", error=str(e), q=question)
231
+ baseline = f"Query error: {e}"
232
+ elif use_llm and self._llm_available():
233
+ # text-to-SQL for questions outside the deterministic library
234
+ sql, cols, rows, baseline, model, cost = self._llm_nlq(question, run_id)
235
+ engine = "llm-sql"
236
+ intent = "nlq"
237
+
238
+ answer = baseline
239
+ # Grounded NL phrasing / explanation via the small reasoning model.
240
+ if use_llm and self._llm_available() and rows is not None and intent in ("summary", "reasons", "analytics"):
241
+ phrased, pmodel, pcost = self._llm_phrase(question, intent, sql, cols, rows, baseline, run_id)
242
+ if phrased:
243
+ answer, model, cost = phrased, pmodel or model, cost + pcost
244
+ engine = engine if engine == "llm-sql" else "deterministic+llm"
245
+
246
+ if not m and engine == "deterministic" and rows is None:
247
+ # nothing matched and no model — guide the user
248
+ answer = ("I can answer that best with one of these: " +
249
+ "; ".join(EXAMPLE_QUESTIONS[:5]) + ".")
250
+ intent = "help"
251
+
252
+ latency_ms = round((time.perf_counter() - t0) * 1000, 1)
253
+ result = {
254
+ "question": question, "intent": intent, "engine": engine,
255
+ "model": model or "deterministic", "sql": sql, "columns": cols,
256
+ "rows": (rows or [])[:50], "row_count": len(rows or []),
257
+ "answer": answer, "latency_ms": latency_ms, "cost_usd": round(cost, 6),
258
+ }
259
+ self._record(result, run_id)
260
+ return result
261
+
262
+ # --- internals ------------------------------------------------------------
263
+ def _llm_available(self) -> bool:
264
+ if not self.router:
265
+ return False
266
+ reg = self.router.registry
267
+ return any(getattr(reg, n, None) and getattr(reg, n).available()
268
+ for n in ("minicpm", "anthropic", "gemini", "local"))
269
+
270
+ def _llm_nlq(self, question: str, run_id: str):
271
+ from ..providers.base import CacheBlock, LLMRequest
272
+ sys_prompt = (
273
+ "You are a text-to-SQL assistant for a read-only SQLite ERP warehouse. "
274
+ "Given the schema and a question, return ONLY one SQLite SELECT query (no prose, "
275
+ "no markdown fences, no semicolon). Use only the tables/columns in the schema.\n\n"
276
+ + ERP_SCHEMA_DOC)
277
+ req = LLMRequest(
278
+ system_blocks=[CacheBlock(sys_prompt, cacheable=True)],
279
+ user_content=f"Question: {question}\nSQL:",
280
+ task="nlq", max_tokens=256, temperature=0.0)
281
+ resp = self.router.run(req, run_id)
282
+ sql = _clean_sql(resp.text)
283
+ cost = getattr(resp, "cost_usd", 0.0) or 0.0
284
+ try:
285
+ cols, rows = self.wh.query(sql)
286
+ baseline = f"Returned {len(rows)} row(s)." if rows else "No rows matched."
287
+ except Exception as e:
288
+ cols, rows, baseline = None, None, f"Generated SQL could not run safely: {e}"
289
+ return sql, cols, rows, baseline, resp.model, cost
290
+
291
+ def _llm_phrase(self, question, intent, sql, cols, rows, baseline, run_id):
292
+ from ..providers.base import CacheBlock, LLMRequest
293
+ verb = {"summary": "Write a concise executive summary",
294
+ "reasons": "Explain the most likely reason(s)",
295
+ "analytics": "Give a one-paragraph analytical readout"}.get(intent, "Answer")
296
+ sys_prompt = (
297
+ "You are an ERP financial analyst. Using ONLY the query result provided, answer the "
298
+ "user's question. Cite concrete figures from the rows; never invent numbers. Be brief "
299
+ "(2-4 sentences).")
300
+ table = _rows_to_text(cols, rows)
301
+ req = LLMRequest(
302
+ system_blocks=[CacheBlock(sys_prompt, cacheable=True)],
303
+ user_content=f"Question: {question}\n\nQuery result:\n{table}\n\nBaseline fact: {baseline}\n\n{verb}:",
304
+ task="summarize", max_tokens=300, temperature=0.2)
305
+ resp = self.router.run(req, run_id)
306
+ if resp.error or not resp.text.strip():
307
+ return None, None, 0.0
308
+ return resp.text.strip(), resp.model, (getattr(resp, "cost_usd", 0.0) or 0.0)
309
+
310
+ def _record(self, result: dict, run_id: str) -> None:
311
+ try:
312
+ log_event("info", "ERP chat", intent=result["intent"], engine=result["engine"],
313
+ model=result["model"], rows=result["row_count"], q=result["question"][:120])
314
+ except Exception:
315
+ pass
316
+ if self.db is not None:
317
+ try:
318
+ self.db.audit("erp_chat", run_id=run_id,
319
+ detail={"q": result["question"][:200], "intent": result["intent"],
320
+ "engine": result["engine"], "rows": result["row_count"]})
321
+ except Exception:
322
+ pass
323
+
324
+
325
+ def _clean_sql(text: str) -> str:
326
+ t = (text or "").strip()
327
+ t = re.sub(r"^```(?:sql)?", "", t, flags=re.IGNORECASE).strip()
328
+ t = re.sub(r"```$", "", t).strip()
329
+ # take the first statement only
330
+ t = t.split(";")[0].strip()
331
+ m = re.search(r"(select|with)\b.+", t, re.IGNORECASE | re.DOTALL)
332
+ return m.group(0).strip() if m else t
333
+
334
+
335
+ def _rows_to_text(cols, rows, limit: int = 25) -> str:
336
+ if not cols:
337
+ return "(no rows)"
338
+ lines = [" | ".join(map(str, cols))]
339
+ for r in (rows or [])[:limit]:
340
+ lines.append(" | ".join("" if v is None else str(v) for v in r))
341
+ return "\n".join(lines)
342
+
343
+
344
+ def answer_question(question: str, settings, router=None, warehouse=None, metrics=None,
345
+ db=None, use_llm: bool = True) -> dict:
346
+ return ErpChat(settings, router=router, warehouse=warehouse, metrics=metrics,
347
+ db=db).answer(question, use_llm=use_llm)
backend/app/erp/data.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simulated retail ERP data warehouse (SQLite) — the knowledgebase the ERP DocIQ
2
+ chatbot reasons over, and the source domain for the fine-tuning dataset.
3
+
4
+ Deterministic: a fixed RNG seed makes the whole warehouse reproducible, so NLQ
5
+ answers, analytics, evals and the fine-tune dataset are all stable across runs.
6
+
7
+ Schema (a retail accounts-payable / procurement slice):
8
+ vendors(vendor_id, name, region, category, payment_terms, on_time_rate, risk_tier)
9
+ products(sku, name, category, unit_cost, unit_price)
10
+ purchase_orders(po_id, vendor_id, order_date, status, region, amount)
11
+ po_lines(po_id, sku, qty, unit_price, line_total)
12
+ invoices(invoice_id, po_id, vendor_id, invoice_date, due_date, amount, tax,
13
+ total, status, paid_date, days_to_pay)
14
+ gl_entries(entry_id, invoice_id, account, cost_center, period, amount)
15
+ inventory(sku, region, on_hand, reorder_point, monthly_demand)
16
+ returns(return_id, sku, region, return_date, qty, reason, refund_amount)
17
+
18
+ This is intentionally a *small* but internally-consistent dataset: invoices roll up
19
+ from PO lines, GL entries roll up from invoices, returns reference real SKUs, so
20
+ analytics ("why did spend spike in Q2", "top vendors by late payments") are answerable
21
+ from the data rather than canned.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import random
26
+ import sqlite3
27
+ import threading
28
+ from datetime import date, timedelta
29
+ from pathlib import Path
30
+
31
+ SEED = 20260101
32
+
33
+ REGIONS = ["Northeast", "Midwest", "South", "West"]
34
+ CATEGORIES = ["Fixtures", "Electronics", "Apparel", "Grocery", "Packaging", "Logistics"]
35
+ RISK = ["low", "low", "low", "medium", "medium", "high"]
36
+
37
+ VENDOR_NAMES = [
38
+ "Meridian Industrial", "Nordic Fixture Works", "BrightLite Electronics", "Halcyon Build",
39
+ "Cascade Apparel Co", "Summit Packaging", "BlueRiver Logistics", "Orchard Grocery Supply",
40
+ "PrimeEdge Components", "Vertex Retail Systems", "Granite State Goods", "Copperline Textiles",
41
+ "Lakeside Distribution", "IronGate Hardware", "Pinnacle Foods", "Aurora Display Group",
42
+ ]
43
+
44
+ PRODUCTS = [
45
+ ("SKU-1001", "Heavy-gauge shelf unit", "Fixtures", 142.0, 189.0),
46
+ ("SKU-1002", "LED retail strip 2m", "Electronics", 14.5, 22.4),
47
+ ("SKU-1003", "Endcap display birch", "Fixtures", 232.0, 310.0),
48
+ ("SKU-1004", "Thermal receipt rolls", "Packaging", 1.1, 2.4),
49
+ ("SKU-1005", "Barcode scanner USB", "Electronics", 38.0, 59.0),
50
+ ("SKU-1006", "Store associate polo", "Apparel", 9.2, 18.0),
51
+ ("SKU-1007", "Pallet wrap roll", "Packaging", 18.0, 27.5),
52
+ ("SKU-1008", "Organic coffee 1kg", "Grocery", 8.5, 14.0),
53
+ ("SKU-1009", "Freight pallet move", "Logistics", 22.0, 35.0),
54
+ ("SKU-1010", "Security tag pack", "Electronics", 4.0, 7.5),
55
+ ("SKU-1011", "Checkout counter mat", "Fixtures", 26.0, 41.0),
56
+ ("SKU-1012", "Reusable tote bag", "Apparel", 2.3, 5.0),
57
+ ]
58
+
59
+ ACCOUNTS = {
60
+ "Fixtures": "5000-Store-Fit-Out", "Electronics": "5100-IT-Equipment",
61
+ "Apparel": "5200-Uniforms", "Grocery": "5300-COGS-Grocery",
62
+ "Packaging": "5400-Supplies", "Logistics": "5500-Freight",
63
+ }
64
+ RETURN_REASONS = ["damaged", "wrong item", "defective", "overstock", "late delivery"]
65
+
66
+
67
class ErpWarehouse:
    """Read-mostly SQLite warehouse with a guarded NLQ query surface.

    On construction the database file is opened (its parent directory is
    created if needed) and, when the ``invoices`` table is missing or empty,
    the whole simulated ERP dataset is (re)built deterministically from
    ``SEED``. One connection is shared across threads
    (``check_same_thread=False``); every build/query goes through ``_lock``.
    """

    def __init__(self, db_path: str | Path) -> None:
        """Open (and seed if empty) the warehouse stored at *db_path*."""
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        if not self._has_data():
            self._build()

    def _has_data(self) -> bool:
        """Return True when the ``invoices`` table exists and has at least one row."""
        try:
            return self._conn.execute("SELECT 1 FROM invoices LIMIT 1").fetchone() is not None
        except sqlite3.OperationalError:
            # Table does not exist yet (fresh database file).
            return False

    @staticmethod
    def _month_start(start: date, offset: int) -> date:
        """Return the first day of the calendar month *offset* months after *start*.

        Calendar arithmetic is used instead of ``timedelta(days=30 * offset)``:
        30-day steps drift, producing a duplicated first month and never
        reaching the twelfth one.
        """
        years, month_index = divmod(start.month - 1 + offset, 12)
        return date(start.year + years, month_index + 1, 1)

    # --- schema + seed --------------------------------------------------------
    def _build(self) -> None:
        """Drop, recreate and deterministically seed every warehouse table.

        The random draws happen in a fixed order from ``random.Random(SEED)``,
        so the generated data is reproducible. Any failure rolls back the
        uncommitted inserts and re-raises.
        """
        rng = random.Random(SEED)
        with self._lock:
            c = self._conn
            try:
                c.executescript(
                    """
                    DROP TABLE IF EXISTS vendors; DROP TABLE IF EXISTS products;
                    DROP TABLE IF EXISTS purchase_orders; DROP TABLE IF EXISTS po_lines;
                    DROP TABLE IF EXISTS invoices; DROP TABLE IF EXISTS gl_entries;
                    DROP TABLE IF EXISTS inventory; DROP TABLE IF EXISTS returns;
                    CREATE TABLE vendors(vendor_id TEXT PRIMARY KEY, name TEXT, region TEXT,
                        category TEXT, payment_terms TEXT, on_time_rate REAL, risk_tier TEXT);
                    CREATE TABLE products(sku TEXT PRIMARY KEY, name TEXT, category TEXT,
                        unit_cost REAL, unit_price REAL);
                    CREATE TABLE purchase_orders(po_id TEXT PRIMARY KEY, vendor_id TEXT,
                        order_date TEXT, status TEXT, region TEXT, amount REAL);
                    CREATE TABLE po_lines(po_id TEXT, sku TEXT, qty INTEGER, unit_price REAL,
                        line_total REAL);
                    CREATE TABLE invoices(invoice_id TEXT PRIMARY KEY, po_id TEXT, vendor_id TEXT,
                        invoice_date TEXT, due_date TEXT, amount REAL, tax REAL, total REAL,
                        status TEXT, paid_date TEXT, days_to_pay INTEGER);
                    CREATE TABLE gl_entries(entry_id TEXT PRIMARY KEY, invoice_id TEXT, account TEXT,
                        cost_center TEXT, period TEXT, amount REAL);
                    CREATE TABLE inventory(sku TEXT, region TEXT, on_hand INTEGER,
                        reorder_point INTEGER, monthly_demand INTEGER);
                    CREATE TABLE returns(return_id TEXT, sku TEXT, region TEXT, return_date TEXT,
                        qty INTEGER, reason TEXT, refund_amount REAL);
                    """
                )
                # vendors
                vendors = []
                for i, nm in enumerate(VENDOR_NAMES):
                    cat = CATEGORIES[i % len(CATEGORIES)]
                    vid = f"V-{1000+i}"
                    terms = rng.choice(["Net 30", "Net 30", "Net 45", "Net 60"])
                    on_time = round(rng.uniform(0.72, 0.99), 3)
                    vendors.append((vid, nm, rng.choice(REGIONS), cat, terms, on_time, RISK[i % len(RISK)]))
                c.executemany("INSERT INTO vendors VALUES (?,?,?,?,?,?,?)", vendors)
                c.executemany("INSERT INTO products VALUES (?,?,?,?,?)", PRODUCTS)

                prod_by_cat: dict[str, list] = {}
                for p in PRODUCTS:
                    prod_by_cat.setdefault(p[2], []).append(p)

                # 12 calendar months of POs -> invoices -> GL (2025-07 .. 2026-06).
                # A deliberate Q2 spend spike on Fixtures (store-remodel program)
                # makes "why did spend rise" answerable from data.
                po_n = inv_n = gl_n = 0
                start = date(2025, 7, 1)
                for month in range(12):
                    m_date = self._month_start(start, month)
                    period = m_date.strftime("%Y-%m")
                    # base order volume, with a Fixtures surge in 2026 Q2
                    # (offsets 9-11 from July 2025 = April, May, June 2026)
                    n_orders = rng.randint(10, 16)
                    surge = month in (9, 10, 11)
                    for _ in range(n_orders):
                        v = rng.choice(vendors)
                        vid, vcat, vregion, terms, on_time = v[0], v[3], v[2], v[4], v[5]
                        # bias product to vendor category; surge picks Fixtures
                        cat = "Fixtures" if (surge and rng.random() < 0.45) else vcat
                        pool = prod_by_cat.get(cat) or PRODUCTS
                        po_n += 1
                        po_id = f"PO-{2000+po_n}"
                        # day offset 0-27 keeps the order date inside the month
                        od = m_date + timedelta(days=rng.randint(0, 27))
                        n_lines = rng.randint(1, 4)
                        amount = 0.0
                        lines = []
                        for _ in range(n_lines):
                            p = rng.choice(pool)
                            qty = rng.randint(2, 40) * (3 if (surge and cat == "Fixtures") else 1)
                            unit = round(p[4] * rng.uniform(0.95, 1.05), 2)
                            lt = round(qty * unit, 2)
                            amount += lt
                            lines.append((po_id, p[0], qty, unit, lt))
                        status = rng.choice(["received", "received", "received", "open", "cancelled"])
                        c.execute("INSERT INTO purchase_orders VALUES (?,?,?,?,?,?)",
                                  (po_id, vid, od.isoformat(), status, vregion, round(amount, 2)))
                        c.executemany("INSERT INTO po_lines VALUES (?,?,?,?,?)", lines)
                        if status == "cancelled":
                            # cancelled POs never produce an invoice or GL entry
                            continue
                        # invoice
                        inv_n += 1
                        inv_id = f"INV-{5000+inv_n}"
                        idate = od + timedelta(days=rng.randint(1, 10))
                        term_days = int(terms.split()[1])
                        due = idate + timedelta(days=term_days)
                        tax = round(amount * 0.0825, 2)
                        total = round(amount + tax, 2)
                        paid = rng.random() < 0.82
                        if paid:
                            # late if vendor has low on-time rate
                            late = rng.random() > on_time
                            dd = rng.randint(term_days + 3, term_days + 25) if late else rng.randint(8, term_days)
                            paid_date = (idate + timedelta(days=dd)).isoformat()
                            istatus = "paid"
                            days_to_pay = dd
                        else:
                            paid_date, istatus, days_to_pay = None, "open", None
                        c.execute("INSERT INTO invoices VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                                  (inv_id, po_id, vid, idate.isoformat(), due.isoformat(),
                                   round(amount, 2), tax, total, istatus, paid_date, days_to_pay))
                        gl_n += 1
                        c.execute("INSERT INTO gl_entries VALUES (?,?,?,?,?,?)",
                                  (f"GL-{9000+gl_n}", inv_id, ACCOUNTS.get(cat, "5900-Other"),
                                   f"CC-{vregion[:3].upper()}", period, total))
                # inventory + returns
                for p in PRODUCTS:
                    for r in REGIONS:
                        dem = rng.randint(20, 200)
                        c.execute("INSERT INTO inventory VALUES (?,?,?,?,?)",
                                  (p[0], r, rng.randint(0, 400), int(dem * 0.5), dem))
                ret_n = 0
                for _ in range(60):
                    p = rng.choice(PRODUCTS)
                    ret_n += 1
                    rdate = (start + timedelta(days=rng.randint(0, 360)))
                    qty = rng.randint(1, 12)
                    c.execute("INSERT INTO returns VALUES (?,?,?,?,?,?,?)",
                              (f"R-{7000+ret_n}", p[0], rng.choice(REGIONS), rdate.isoformat(),
                               qty, rng.choice(RETURN_REASONS), round(qty * p[4], 2)))
                c.commit()
            except Exception:
                # Do not leave a half-seeded transaction open on the shared connection.
                c.rollback()
                raise

    # --- guarded query surface (for NLQ) --------------------------------------
    def query(self, sql: str, limit: int = 200) -> tuple[list[str], list[list]]:
        """Execute a single read-only SELECT. Raises ValueError on anything unsafe.

        The statement must start with ``SELECT`` or ``WITH``. It is rejected if
        it contains a write/DDL keyword as a whole word (regardless of the
        whitespace around it), a ``--`` comment, or an inner ``;``. The check is
        textual, so a forbidden word inside a string literal is rejected too
        (fails closed). ``replace(...)`` used as a scalar function is allowed.
        When the statement has no ``LIMIT`` keyword, ``LIMIT <limit>`` is appended.

        Returns ``(column_names, rows)`` with each row converted to a list.
        """
        import re  # local import: the module header does not import ``re``

        safe = sql.strip().rstrip(";").strip()
        low = safe.lower()
        if not low.startswith(("select", "with")):
            raise ValueError("only SELECT/WITH queries are allowed")
        # Word-boundary match so newline/tab separated keywords cannot slip
        # through (SQLite accepts e.g. "WITH ... DELETE FROM ...").
        keyword = re.search(
            r"\b(?:insert|update|delete|drop|alter|create|attach|pragma)\b"
            r"|\breplace\b(?!\s*\()",
            low,
        )
        if keyword:
            raise ValueError(f"forbidden token in query: {keyword.group(0)!r}")
        for token in ("--", ";"):
            if token in low:
                raise ValueError(f"forbidden token in query: {token!r}")
        if not re.search(r"\blimit\b", low):
            safe = f"{safe} LIMIT {int(limit)}"
        with self._lock:
            cur = self._conn.execute(safe)
            rows = cur.fetchall()
        cols = [d[0] for d in cur.description]
        return cols, [list(r) for r in rows]

    def scalar(self, sql: str):
        """Run *sql* through :meth:`query` and return the first cell, or None if no rows."""
        _cols, rows = self.query(sql, limit=1)
        return rows[0][0] if rows else None

    def table_counts(self) -> dict:
        """Return ``{table_name: row_count}`` for the eight warehouse tables."""
        out = {}
        for t in ("vendors", "products", "purchase_orders", "po_lines", "invoices",
                  "gl_entries", "inventory", "returns"):
            out[t] = self.scalar(f"SELECT COUNT(*) FROM {t}")
        return out
238
+
239
+
240
+ # Compact schema description handed to the NLQ model (kept byte-stable for caching).
241
+ ERP_SCHEMA_DOC = """ERP warehouse schema (SQLite, retail procurement / AP):
242
+ - vendors(vendor_id, name, region, category, payment_terms, on_time_rate, risk_tier)
243
+ - products(sku, name, category, unit_cost, unit_price)
244
+ - purchase_orders(po_id, vendor_id, order_date, status, region, amount)
245
+ - po_lines(po_id, sku, qty, unit_price, line_total)
246
+ - invoices(invoice_id, po_id, vendor_id, invoice_date, due_date, amount, tax, total, status, paid_date, days_to_pay)
247
+ - gl_entries(entry_id, invoice_id, account, cost_center, period, amount) -- period is 'YYYY-MM'
248
+ - inventory(sku, region, on_hand, reorder_point, monthly_demand)
249
+ - returns(return_id, sku, region, return_date, qty, reason, refund_amount)
250
+ Notes: invoices.status in ('paid','open'); a payment is LATE when days_to_pay > payment_terms days.
251
+ Spend = invoices.total. Dates are ISO 'YYYY-MM-DD'. gl_entries.period groups spend by month."""
252
+
253
+ EXAMPLE_QUESTIONS = [
254
+ "What was total invoiced spend by month?",
255
+ "Who are the top 5 vendors by spend?",
256
+ "Which vendors paid late most often?",
257
+ "Why did spend rise in Q2 2026?",
258
+ "What is the late-payment rate overall?",
259
+ "Show spend by category.",
260
+ "Summarize accounts payable health.",
261
+ "Which SKUs are below reorder point?",
262
+ "What is the total value of open (unpaid) invoices?",
263
+ "Top return reasons by refund amount?",
264
+ ]
265
+
266
+ _WAREHOUSE: ErpWarehouse | None = None
267
+
268
+
269
def get_warehouse(settings) -> ErpWarehouse:
    """Return the process-wide warehouse, creating it on first use.

    The database path is ``settings.erp_db_path`` when that attribute exists
    and is truthy, otherwise ``settings.writable_dir / "erp.db"``.
    """
    global _WAREHOUSE
    if _WAREHOUSE is not None:
        return _WAREHOUSE
    configured = getattr(settings, "erp_db_path", None)
    db_path = configured if configured else settings.writable_dir / "erp.db"
    _WAREHOUSE = ErpWarehouse(db_path)
    return _WAREHOUSE
backend/app/erp/finetune.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ERP-domain fine-tuning: dataset + an offline domain-adaptation trainer.
2
+
3
+ Two honest paths share ONE dataset, built from the simulated ERP knowledgebase:
4
+
5
+ • PRODUCTION (GPU): `scripts/finetune_erp.py --backend hf` LoRA-fine-tunes the
6
+ OpenBMB **MiniCPM3-4B** text model (PEFT + TRL SFTTrainer) on the JSONL below.
7
+ That is the "fine-tune a small model from the list" deliverable.
8
+
9
+ • OFFLINE DEMO (CPU, runs anywhere — no torch/GPU): `--backend local` trains a
10
+ compact ERP **NLQ-routing head** (multinomial softmax over hashed n-gram
11
+ features, pure numpy) on the SAME examples, with a real train/test split, a
12
+ real training-loss curve, and a real BEFORE→AFTER accuracy gain. This is the
13
+ small model's domain-adaptation layer — it demonstrates the training loop +
14
+ eval methodology end-to-end so the story is complete without a GPU.
15
+
16
+ Both report into `erp_finetune_report.json` (served at /api/erp/finetune-report).
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import hashlib
21
+ import json
22
+ import math
23
+ import random
24
+ import time
25
+ from pathlib import Path
26
+
27
+ import numpy as np
28
+
29
+ from .data import get_warehouse
30
+
31
+ # ── canonical ERP NLQ templates (label space) + rich paraphrases ──────────────
32
+ # Each template is one SQL "skill" the model must learn to route to from varied
33
+ # natural phrasings. Held-out paraphrases test generalization, not memorization.
34
+ TEMPLATES = [
35
+ {"id": "spend_by_month", "intent": "analytics",
36
+ "sql": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period",
37
+ "paraphrases": [
38
+ "What was total invoiced spend by month?", "Show monthly spend.",
39
+ "Break spend down per month.", "How much did we invoice each month?",
40
+ "Monthly invoiced spend trend?", "Spend by period please.",
41
+ "Give me the month-by-month spend.", "Total spend grouped by month.",
42
+ "What's our spend over the months?", "Plot spend per month.",
43
+ "Monthly AP spend totals?", "How has spend trended month to month?"]},
44
+ {"id": "top_vendors", "intent": "analytics",
45
+ "sql": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5",
46
+ "paraphrases": [
47
+ "Who are the top 5 vendors by spend?", "Which vendors do we spend the most with?",
48
+ "List our biggest suppliers.", "Top vendors by total spend?",
49
+ "Rank vendors by spend.", "Which suppliers cost us the most?",
50
+ "Show the five largest vendors.", "Biggest vendors by invoice value?",
51
+ "Our highest-spend vendors?", "Top suppliers ranked by spend.",
52
+ "Which vendors get the most of our money?", "Largest vendors please."]},
53
+ {"id": "late_vendors", "intent": "analytics",
54
+ "sql": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5",
55
+ "paraphrases": [
56
+ "Which vendors paid late most often?", "Who are our worst late-paying vendors?",
57
+ "Vendors with the most overdue payments?", "Which suppliers do we pay late?",
58
+ "Show vendors with frequent late payments.", "Worst offenders for late payment?",
59
+ "Which vendors are habitually overdue?", "List vendors by late-payment count.",
60
+ "Who keeps getting paid past terms?", "Late payers among our vendors?",
61
+ "Vendors most often paid after due date?", "Which suppliers have payment delays?"]},
62
+ {"id": "late_rate", "intent": "analytics",
63
+ "sql": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'",
64
+ "paraphrases": [
65
+ "What is the late-payment rate overall?", "What percent of invoices are paid late?",
66
+ "Our overall late payment percentage?", "How often do we pay late, as a rate?",
67
+ "Share of late payments?", "What fraction of payments miss terms?",
68
+ "Late-payment ratio across all invoices?", "Overall on-time vs late rate?",
69
+ "What's our late payment rate?", "Percentage of overdue payments overall?",
70
+ "How bad is our late-payment rate?", "Give the global late payment percentage."]},
71
+ {"id": "spend_by_category", "intent": "analytics",
72
+ "sql": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC",
73
+ "paraphrases": [
74
+ "Show spend by category.", "How much do we spend per product category?",
75
+ "Category-level spend breakdown?", "Spend grouped by category.",
76
+ "Which categories cost the most?", "Break down spend across categories.",
77
+ "Spend per category please.", "What's our category spend mix?",
78
+ "Total spend for each category?", "Categories ranked by spend.",
79
+ "Where does spend go by category?", "Category spend totals?"]},
80
+ {"id": "why_q2", "intent": "reasons",
81
+ "sql": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC",
82
+ "paraphrases": [
83
+ "Why did spend rise in Q2 2026?", "What drove the Q2 spend increase?",
84
+ "Explain the spend spike in Q2.", "Reason for higher spending in Q2 2026?",
85
+ "Why was Q2 so expensive?", "What caused the second-quarter cost jump?",
86
+ "Account for the Q2 2026 spend surge.", "Why is Q2 spend up?",
87
+ "What's behind the Q2 increase?", "Drivers of the Q2 spend rise?",
88
+ "Why did costs climb in Q2 2026?", "Explain why Q2 spend went up."]},
89
+ {"id": "below_reorder", "intent": "analytics",
90
+ "sql": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15",
91
+ "paraphrases": [
92
+ "Which SKUs are below reorder point?", "What needs replenishing?",
93
+ "Show items under their reorder level.", "Which products are low on stock?",
94
+ "List SKUs below reorder threshold.", "What should we reorder?",
95
+ "Inventory below reorder point?", "Which items risk stockout?",
96
+ "Stock positions under reorder point?", "What's running low in inventory?",
97
+ "SKUs needing replenishment?", "Which products fell below reorder?"]},
98
+ {"id": "open_invoices", "intent": "analytics",
99
+ "sql": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'",
100
+ "paraphrases": [
101
+ "What is the total value of open invoices?", "How much do we owe in unpaid invoices?",
102
+ "Outstanding invoice value?", "Total open AP balance?",
103
+ "Value of unpaid invoices?", "How much is still open in payables?",
104
+ "Sum of open invoices?", "What's our outstanding payables total?",
105
+ "Unpaid invoice amount overall?", "Open invoice liability?",
106
+ "How much AP is still open?", "Total of invoices not yet paid?"]},
107
+ {"id": "return_reasons", "intent": "analytics",
108
+ "sql": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC",
109
+ "paraphrases": [
110
+ "Top return reasons by refund amount?", "Why are products being returned?",
111
+ "Biggest return reasons by refund value?", "Break down returns by reason.",
112
+ "Which return reasons cost the most?", "Return reasons ranked by refunds?",
113
+ "What drives our refunds?", "Show returns grouped by reason.",
114
+ "Most costly return reasons?", "Refund totals per return reason?",
115
+ "What are the leading causes of returns?", "Return reason breakdown by money?"]},
116
+ {"id": "ap_health", "intent": "summary",
117
+ "sql": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay",
118
+ "paraphrases": [
119
+ "Summarize accounts payable health.", "Give me an AP health overview.",
120
+ "How healthy are our payables?", "Overall accounts payable summary?",
121
+ "Summarize our AP position.", "What's the state of accounts payable?",
122
+ "AP health check please.", "Overview of payables health?",
123
+ "How are we doing on accounts payable?", "Summarize payables status.",
124
+ "Give an executive AP summary.", "State of our AP overall?"]},
125
+ ]
126
+ LABELS = [t["id"] for t in TEMPLATES]
127
+ SQL_BY_ID = {t["id"]: t["sql"] for t in TEMPLATES}
128
+ INTENT_BY_ID = {t["id"]: t["intent"] for t in TEMPLATES}
129
+
130
+
131
def build_dataset(seed: int = 7) -> list[dict]:
    """Expand every template paraphrase into one instruction-tuning row (the JSONL).

    Rows are generated in template order, then shuffled in place with a
    ``random.Random(seed)`` generator so the ordering is reproducible.
    """
    instruction = "Translate this ERP question into one SQLite SELECT over the warehouse schema."
    examples = [
        {
            "task": "nlq",
            "intent": template["intent"],
            "template": template["id"],
            "instruction": instruction,
            "input": question,
            "output": template["sql"],
        }
        for template in TEMPLATES
        for question in template["paraphrases"]
    ]
    random.Random(seed).shuffle(examples)
    return examples
144
+
145
+
146
+ # ── offline domain-adaptation trainer (pure numpy) ────────────────────────────
147
+ def _hash_ngrams(text: str, dim: int = 4096) -> np.ndarray:
148
+ """Hashing-trick feature vector: word unigrams/bigrams + char 3-grams."""
149
+ text = (text or "").lower()
150
+ toks = []
151
+ words = [w for w in "".join(c if c.isalnum() else " " for c in text).split() if w]
152
+ toks += words
153
+ toks += [f"{words[i]}_{words[i+1]}" for i in range(len(words) - 1)]
154
+ s = f" {text} "
155
+ toks += [s[i:i+3] for i in range(len(s) - 2)]
156
+ v = np.zeros(dim, dtype=np.float32)
157
+ for tk in toks:
158
+ h = int(hashlib.md5(tk.encode()).hexdigest(), 16)
159
+ v[h % dim] += 1.0
160
+ if h % 2: # signed hashing reduces collisions
161
+ v[(h >> 1) % dim] -= 0.5
162
+ n = np.linalg.norm(v)
163
+ return v / n if n else v
164
+
165
+
166
class ErpNlqRouter:
    """Multinomial softmax classifier (numpy) — the ERP NLQ routing head.

    Holds a ``(dim, n_classes)`` float32 weight matrix ``W`` and a bias ``b``,
    trained with full-batch gradient descent on cross-entropy + L2.
    """

    def __init__(self, dim: int = 4096, n_classes: int = len(LABELS), seed: int = 0) -> None:
        self.dim = dim
        self.K = n_classes
        # small random init ⇒ an untrained head predicts ~uniformly (chance baseline),
        # an honest "before" reference rather than a degenerate always-class-0.
        init_rng = np.random.default_rng(seed)
        weights = init_rng.normal(0, 0.01, (dim, n_classes))
        self.W = weights.astype(np.float32)
        self.b = np.zeros(n_classes, dtype=np.float32)

    @property
    def n_params(self) -> int:
        """Total number of trainable scalars (weights + biases)."""
        return int(self.W.size + self.b.size)

    def _logits(self, X):
        """Linear scores for each row of *X*."""
        return X @ self.W + self.b

    @staticmethod
    def _softmax(z):
        """Row-wise softmax; the row maximum is subtracted before exponentiating."""
        shifted = z - z.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def fit(self, X, y, epochs=250, lr=0.5, l2=1e-4, seed=0):
        """Train in place and return the per-epoch loss, each rounded to 4 decimals.

        Each epoch permutes the rows, then takes one gradient step over the
        whole batch; the reported loss is measured before that step.
        """
        shuffler = np.random.default_rng(seed)
        n_samples = X.shape[0]
        one_hot = np.eye(self.K, dtype=np.float32)[y]
        history = []
        for _ in range(epochs):
            order = shuffler.permutation(n_samples)
            X_ep, Y_ep = X[order], one_hot[order]
            probs = self._softmax(self._logits(X_ep))
            cross_entropy = -np.mean(np.sum(Y_ep * np.log(probs + 1e-9), axis=1))
            loss = cross_entropy + l2 * np.sum(self.W ** 2)
            grad = (probs - Y_ep) / n_samples
            self.W -= lr * (X_ep.T @ grad + 2 * l2 * self.W)
            self.b -= lr * grad.sum(axis=0)
            history.append(round(float(loss), 4))
        return history

    def predict(self, X):
        """Return the index of the highest-scoring class for each row of *X*."""
        scores = self._logits(X)
        return scores.argmax(axis=1)
207
+
208
+
209
+ def _featurize(texts, dim=4096):
210
+ return np.vstack([_hash_ngrams(t, dim) for t in texts])
211
+
212
+
213
def run_offline_finetune(settings, seed: int = 7, epochs: int = 400) -> dict:
    """Train the ERP NLQ router on the dataset; report BEFORE→AFTER + loss curve.

    Steps: featurize every paraphrase, hold out a seeded random test split,
    score the untrained router, train it, score it again, and finally run the
    SQL of each predicted template against the warehouse to measure how many
    routed queries execute. Returns the report as a plain dict.
    """
    examples = build_dataset(seed)
    features = _featurize([ex["input"] for ex in examples])
    targets = np.array([LABELS.index(ex["template"]) for ex in examples])

    # split paraphrases so the test set is unseen phrasings of seen skills
    shuffled = np.random.default_rng(seed).permutation(len(examples))
    holdout = max(len(examples) // 5, len(LABELS))
    test_idx = shuffled[:holdout]
    train_idx = shuffled[holdout:]

    head = ErpNlqRouter(dim=features.shape[1])
    # BEFORE: untrained head (small random weights, zero bias) — chance-level baseline
    acc_before = float((head.predict(features[test_idx]) == targets[test_idx]).mean())
    loss_history = head.fit(features[train_idx], targets[train_idx], epochs=epochs, seed=seed)
    routed = head.predict(features[test_idx])
    acc_after = float((routed == targets[test_idx]).mean())

    # end-to-end: does the SQL the router selected actually run against the warehouse?
    warehouse = get_warehouse(settings)
    runnable = 0
    for label_idx in routed:
        try:
            warehouse.query(SQL_BY_ID[LABELS[label_idx]])
        except Exception:
            # best-effort probe: a failing query just does not count
            continue
        runnable += 1
    exec_rate = round(runnable / len(test_idx), 3)

    stride = max(1, epochs // 40)  # thin the curve to roughly 40 points
    return {
        "kind": "offline-domain-adaptation",
        "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
        "note": "Offline CPU demo of the training loop + eval on the SAME dataset the "
                "MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits "
                "in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
        "dataset_size": len(examples), "train": int(len(train_idx)), "test": int(len(test_idx)),
        "n_classes": len(LABELS), "trainable_params": head.n_params,
        "epochs": epochs,
        "before_test_accuracy": round(acc_before, 3),
        "after_test_accuracy": round(acc_after, 3),
        "accuracy_gain": round(acc_after - acc_before, 3),
        "routed_sql_exec_rate": exec_rate,
        "loss_curve": loss_history[::stride],
        "final_loss": loss_history[-1] if loss_history else None,
        "labels": LABELS,
    }
backend/app/extraction_heuristics.py CHANGED
@@ -124,12 +124,17 @@ def classify(text: str) -> tuple[str, float]:
124
  "invoice": sum(k in t for k in ["invoice", "bill to", "amount due", "tax", "subtotal"]),
125
  "purchase_order": sum(k in t for k in ["purchase order", "p.o.", "po number", "ship to", "buyer"]),
126
  "contract": sum(k in t for k in ["agreement", "party", "governing law", "term", "whereas", "hereby"]),
127
- "receipt": sum(k in t for k in ["receipt", "merchant", "change", "cash", "card ending"]),
 
128
  "subscription_memo": sum(k in t for k in ["subscription", "renewal", "billing cycle", "auto-renew", "plan"]),
129
  }
130
  # Strong signals override counts.
131
  if "purchase order" in t or re.search(r"\bp\.?o\.?\s*(number|#|no)", t):
132
  return "purchase_order", 0.95
 
 
 
 
133
  if "invoice" in t:
134
  scores["invoice"] += 2
135
  best = max(scores, key=scores.get)
 
124
  "invoice": sum(k in t for k in ["invoice", "bill to", "amount due", "tax", "subtotal"]),
125
  "purchase_order": sum(k in t for k in ["purchase order", "p.o.", "po number", "ship to", "buyer"]),
126
  "contract": sum(k in t for k in ["agreement", "party", "governing law", "term", "whereas", "hereby"]),
127
+ "receipt": sum(k in t for k in ["receipt", "merchant", "change", "cash", "card ending",
128
+ "register", "payment:", "thank you"]),
129
  "subscription_memo": sum(k in t for k in ["subscription", "renewal", "billing cycle", "auto-renew", "plan"]),
130
  }
131
  # Strong signals override counts.
132
  if "purchase order" in t or re.search(r"\bp\.?o\.?\s*(number|#|no)", t):
133
  return "purchase_order", 0.95
134
+ # A document that says "receipt" but never "invoice" is a receipt — totals/tax
135
+ # lines alone must not tip it to invoice (real invoices say "Invoice").
136
+ if "receipt" in t and "invoice" not in t:
137
+ return "receipt", 0.9
138
  if "invoice" in t:
139
  scores["invoice"] += 2
140
  best = max(scores, key=scores.get)
backend/app/main.py CHANGED
@@ -94,6 +94,11 @@ class PromptUpdate(BaseModel):
94
  content: str
95
 
96
 
 
 
 
 
 
97
  # --- helpers ------------------------------------------------------------------
98
  def require_admin(request: Request) -> bool:
99
  if not _check(request.headers.get("authorization"), settings.admin_user, settings.admin_pass):
@@ -162,6 +167,16 @@ def capabilities():
162
  caps["ocr"] = {**caps.get("ocr", {}), "registry": ocr_registry.info()}
163
  caps["categories"] = list_categories()
164
  caps["mode"] = settings.mode
 
 
 
 
 
 
 
 
 
 
165
  return caps
166
 
167
 
@@ -190,6 +205,83 @@ def ocr_test_report(refresh: bool = False):
190
  return _ocr_report_cache["report"]
191
 
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  @app.get("/api/samples")
194
  def samples():
195
  return {"samples": _list_samples()}
 
94
  content: str
95
 
96
 
97
# Request body for POST /api/erp/chat (documented with comments rather than a
# docstring so the generated schema description is unchanged).
class ErpChatRequest(BaseModel):
    question: str  # the user's question, handed to ErpChat.answer
    use_llm: bool = True  # forwarded as ErpChat.answer(use_llm=...); on by default
100
+
101
+
102
  # --- helpers ------------------------------------------------------------------
103
  def require_admin(request: Request) -> bool:
104
  if not _check(request.headers.get("authorization"), settings.admin_user, settings.admin_pass):
 
167
  caps["ocr"] = {**caps.get("ocr", {}), "registry": ocr_registry.info()}
168
  caps["categories"] = list_categories()
169
  caps["mode"] = settings.mode
170
+ from .models_registry import model_catalog
171
+ mc = model_catalog(settings)
172
+ caps["models"] = {"max_params_b": mc["max_params_b"], "count": mc["count"],
173
+ "available": mc["available"], "labs": [l["lab"] for l in mc["labs"]],
174
+ "reasoning_capable": mc.get("reasoning_capable", [])}
175
+ try:
176
+ from .erp import get_warehouse
177
+ caps["erp"] = {"enabled": True, "tables": get_warehouse(settings).table_counts()}
178
+ except Exception as e: # never let ERP wiring break capabilities
179
+ caps["erp"] = {"enabled": False, "error": str(e)}
180
  return caps
181
 
182
 
 
205
  return _ocr_report_cache["report"]
206
 
207
 
208
@app.get("/api/models")
def models():
    """Enabled small models (≤32B) from OpenBMB, Cohere, Black Forest Labs."""
    # Imported lazily, as the other handlers in this file do.
    from .models_registry import model_catalog as build_catalog

    catalog = build_catalog(settings)
    return catalog
213
+
214
+
215
@app.get("/api/ocr/quality-report")
def ocr_quality_report(refresh: bool = False, request: Request = None):
    """OCR output-quality (CER/WER) + document-analysis (field accuracy) per backend.
    Serves the committed/published report; ?refresh=1 re-runs it (admin only)."""
    import json as _json

    report_name = "ocr_quality_report.json"
    if not refresh:
        # Prefer a freshly published report, then fall back to the committed one.
        candidates = (settings.writable_dir / report_name,
                      settings.eval_report_committed.parent / report_name)
        for candidate in candidates:
            if candidate.exists():
                return _json.loads(candidate.read_text())
        return JSONResponse({"available": False,
                             "message": "run `python scripts/ocr_quality.py`"}, status_code=200)

    # Refresh path: admin check first, then re-run, publish and audit.
    require_admin(request)
    from .ocr.quality import run_ocr_quality
    rep = run_ocr_quality(settings, ocr_registry, router, metrics, db=db, rag_store=rag_store)
    (settings.writable_dir / report_name).write_text(_json.dumps(rep))
    db.audit("ocr_quality_published", actor=_actor(request),
             detail={"best_ocr": rep["best_ocr_quality"]})
    return rep
234
+
235
+
236
+ # --- ERP DocIQ (NLQ / analytics / summary / reasons over the ERP knowledgebase) ---
237
@app.get("/api/erp/schema")
def erp_schema():
    # Returns the schema text given to the NLQ model, live per-table row
    # counts, and the example questions.
    from .erp import get_warehouse
    from .erp.data import ERP_SCHEMA_DOC, EXAMPLE_QUESTIONS

    warehouse = get_warehouse(settings)
    payload = {
        "schema_doc": ERP_SCHEMA_DOC,
        "tables": warehouse.table_counts(),
        "examples": EXAMPLE_QUESTIONS,
    }
    return payload
244
+
245
+
246
@app.get("/api/erp/reports")
def erp_reports():
    """A few canned ERP reports (real data) the chatbot can summarize/explain."""
    from .erp.chat import (_q_spend_by_month, _q_spend_by_category, _q_top_vendors,
                           _q_late_vendors, _q_return_reasons)
    from .erp import get_warehouse

    warehouse = get_warehouse(settings)
    # Report name -> query helper; each helper returns (sql, columns, rows, headline).
    report_fns = {
        "spend_by_month": _q_spend_by_month,
        "spend_by_category": _q_spend_by_category,
        "top_vendors": _q_top_vendors,
        "late_vendors": _q_late_vendors,
        "return_reasons": _q_return_reasons,
    }
    reports = {}
    for report_name, run_report in report_fns.items():
        sql, columns, rows, headline = run_report(warehouse)
        reports[report_name] = {"columns": columns, "rows": rows,
                                "headline": headline, "sql": sql}
    return reports
260
+
261
+
262
@app.post("/api/erp/chat")
def erp_chat(req: ErpChatRequest, request: Request = None):
    """Ask the ERP DocIQ chatbot: NLQ→SQL, analytics, summary, or 'why' reasoning."""
    from .erp.chat import ErpChat
    from .erp import get_warehouse

    warehouse = get_warehouse(settings)
    bot = ErpChat(settings, router=router, warehouse=warehouse,
                  metrics=metrics, db=db)
    # The run id carries the caller identity resolved by _actor.
    run_id = f"erp-{_actor(request)}"
    return bot.answer(req.question, use_llm=req.use_llm, run_id=run_id)
270
+
271
+
272
@app.get("/api/erp/finetune-report")
def erp_finetune_report():
    """Latest fine-tune run (offline domain-adaptation demo + MiniCPM LoRA recipe)."""
    import json as _json
    from .config import BACKEND_DIR as _BD

    report_name = "erp_finetune_report.json"
    # A report in the writable dir wins over the one committed under backend/finetune.
    candidates = (settings.writable_dir / report_name,
                  _BD / "finetune" / report_name)
    found = next((path for path in candidates if path.exists()), None)
    if found is not None:
        return _json.loads(found.read_text())
    return JSONResponse({"available": False,
                         "message": "run `python scripts/finetune_erp.py`"}, status_code=200)
283
+
284
+
285
  @app.get("/api/samples")
286
  def samples():
287
  return {"samples": _list_samples()}
backend/app/models_registry.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Enabled model catalogue — small models (≤32B params) from three labs.
2
+
3
+ The "Build Small" constraint is ≤32B parameters; every model below also fits in ≤32 GB
4
+ of memory at a sensible quantization. Availability is computed from config/deps so the
5
+ UI can show which are actually live.
6
+
7
+ • OpenBMB (MiniCPM) — vision-language OCR + text reasoning (the OCR/IDP engine)
8
+ • Cohere (Aya-Vision) — vision-language OCR / VQA
9
+ • Black Forest Labs (FLUX) — image GENERATION → synthetic test documents (not an OCR
10
+ model; used to stress-test OCR quality)
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import importlib.util
15
+
16
+ MAX_PARAMS_B = 32 # hackathon "small models" guardrail
17
+
18
+
19
+ def _has(mod: str) -> bool:
20
+ return importlib.util.find_spec(mod) is not None
21
+
22
+
23
def model_catalog(settings) -> dict:
    """Build the catalogue of enabled small models, grouped by lab.

    Availability flags are derived from *settings* (endpoints / API keys), from
    which optional packages can be found, and from the ``COHERE_OCR_ENABLE``
    environment variable.

    Returns a dict with the size limit (``max_params_b``), the per-lab listing
    (``labs``), the names of models that are currently ``available``, the
    ``ocr_capable`` and ``reasoning_capable`` name lists, and the model ``count``.
    """
    import os

    minicpm_api = bool(settings.minicpm_base_url)
    transformers = _has("transformers")
    cohere_flag = os.getenv("COHERE_OCR_ENABLE", "").lower()
    cohere_enabled = transformers and cohere_flag in {"1", "true", "yes"}
    cohere_api = bool(getattr(settings, "cohere_api_key", None))
    diffusers = _has("diffusers")
    bfl_api = bool(settings.bfl_api_key)

    openbmb_models = [
        {"name": "MiniCPM-V-4.6", "id": settings.minicpm_model, "params_b": 8.0,
         "size_gb_int4": 5.5, "modality": "vision-language (OCR + reasoning)",
         "role": "OCR backend + LLM extractor",
         "license": "Apache-2.0 (weights: MiniCPM Model License)",
         "available": minicpm_api or transformers,
         "enable": "MINICPM_BASE_URL (+ MINICPM_API_KEY) OR pip install transformers"},
        {"name": "MiniCPM-o-4.5", "id": "MiniCPM-o-4.5", "params_b": 8.0,
         "size_gb_int4": 5.5, "modality": "omni (vision/audio) VLM",
         "role": "alt OCR/VLM", "license": "MiniCPM Model License",
         "available": minicpm_api, "enable": "same OpenAI-compatible endpoint"},
        {"name": "MiniCPM3-4B", "id": settings.openbmb_reasoner_model, "params_b": 4.0,
         "size_gb_int4": 2.8,
         "modality": "text LLM (reasoning + function-calling, 32k ctx)",
         "role": "ERP reasoning · NLQ→SQL · report summarization (fine-tune target)",
         "license": "Apache-2.0 (weights: MiniCPM Model License)",
         "available": minicpm_api or transformers,
         "enable": "OpenAI-compatible endpoint (OPENBMB_REASONER_MODEL) OR pip install transformers"},
    ]
    cohere_models = [
        {"name": "Aya-Vision-8B", "id": settings.cohere_ocr_model, "params_b": 8.0,
         "size_gb_int4": 6.0, "modality": "vision-language (OCR/VQA, 23 langs)",
         "role": "OCR backend", "license": "CC-BY-NC 4.0",
         "available": cohere_enabled,
         "enable": "pip install transformers torch + COHERE_OCR_ENABLE=true"},
        {"name": "Aya-Vision-32B", "id": "CohereLabs/aya-vision-32b", "params_b": 32.0,
         "size_gb_int4": 18.0, "modality": "vision-language (OCR/VQA)",
         "role": "alt OCR backend (max-quality small)", "license": "CC-BY-NC 4.0",
         "available": cohere_enabled,
         "enable": "COHERE_OCR_MODEL=CohereLabs/aya-vision-32b + COHERE_OCR_ENABLE=true"},
        {"name": "Command R7B", "id": "command-r7b-12-2024", "params_b": 7.0,
         "size_gb_int4": 5.0,
         "modality": "text LLM (RAG + tool-use + reasoning, 128k ctx)",
         "role": "ERP RAG · NLQ · grounded reasoning", "license": "CC-BY-NC 4.0",
         "available": cohere_api,
         "enable": "COHERE_API_KEY (api.cohere.com) OR weights CohereLabs/c4ai-command-r7b-12-2024"},
    ]
    bfl_models = [
        {"name": "FLUX.1 [dev]", "id": settings.bfl_model, "params_b": 12.0,
         "size_gb_int4": 12.0, "modality": "text-to-image GENERATION",
         "role": "synthetic test-document generator (not OCR)",
         "license": "FLUX.1-dev Non-Commercial",
         "available": bfl_api or diffusers,
         "enable": "BFL_API_KEY (api.bfl.ml) OR pip install diffusers torch"},
        {"name": "FLUX.1 [schnell]", "id": "flux-schnell", "params_b": 12.0,
         "size_gb_int4": 12.0, "modality": "text-to-image GENERATION (fast)",
         "role": "synthetic test-document generator", "license": "Apache-2.0",
         "available": bfl_api or diffusers,
         "enable": "BFL_API_KEY OR pip install diffusers"},
    ]

    labs = [
        {"lab": "OpenBMB", "homepage": "https://github.com/OpenBMB/MiniCPM",
         "models": openbmb_models},
        {"lab": "Cohere", "homepage": "https://huggingface.co/CohereLabs",
         "models": cohere_models},
        {"lab": "Black Forest Labs", "homepage": "https://github.com/black-forest-labs/flux",
         "models": bfl_models},
    ]

    # guardrail: nothing exceeds the small-model size limit
    for lab in labs:
        for entry in lab["models"]:
            assert entry["params_b"] <= MAX_PARAMS_B, f"{entry['name']} exceeds {MAX_PARAMS_B}B"

    # one flat row per model, tagged with its lab, for the summary lists below
    flat = []
    for lab in labs:
        for entry in lab["models"]:
            flat.append({"lab": lab["lab"], **entry})

    available_names = [row["name"] for row in flat if row["available"]]
    ocr_names = [row["name"] for row in flat
                 if "OCR" in row["modality"] or "vision" in row["modality"]]
    reasoning_names = [row["name"] for row in flat
                       if "reasoning" in row["modality"] or "text LLM" in row["modality"]]
    return {
        "max_params_b": MAX_PARAMS_B,
        "labs": labs,
        "available": available_names,
        "ocr_capable": ocr_names,
        "reasoning_capable": reasoning_names,
        "count": len(flat),
    }
backend/app/ocr/quality.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OCR output-quality + document-analysis benchmark.
2
+
3
+ For each available OCR backend, over a set of scanned samples with ground truth, we
4
+ measure two quality dimensions and capture logs + metrics:
5
+
6
+ 1. OCR text quality — Character Error Rate (CER) and Word Error Rate (WER) of the
7
+ transcribed text vs a reference (the `.txt` sidecar that ships with each scan).
8
+ 2. Document-analysis quality — field exact-match and F1 of the FULL pipeline
9
+ (OCR → classify → extract → validate) vs the document's ground-truth JSON.
10
+
11
+ Plus latency, cost, model name/size. Results are published (file + endpoint + optional
12
+ HF upload). Pure-Python edit distance, no extra deps.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import re
18
+ import time
19
+ from pathlib import Path
20
+
21
+ from ..observability import log_event
22
+
23
# Scanned samples that have BOTH a .txt sidecar (reference text) and a gt.json, AND
# that genuinely exercise each OCR engine independently (different CER per backend).
DEFAULT_SAMPLES = [
    "invoice_scanned_basic",
    "receipt_scanned",
    "po_scanned",
    "contract_scanned",
    "subscription_memo_scanned",
]

# Field-accuracy-only samples (no sidecar reference text). complex_invoice_messy
# requires a real VLM.
# The extreme tier (scripts/generate_extreme_docs.py — perspective photo, image collage,
# degraded fax) is a VISION-extraction stress set: on those images the OCR engines fall
# back to a shared text source (identical CER across backends), so they are excluded from
# the per-backend CER benchmark and scored on field accuracy only.
FIELD_ONLY_SAMPLES = ["complex_invoice_messy"]
35
+
36
+
37
+ def _norm(s: str) -> str:
38
+ return re.sub(r"\s+", " ", (s or "").strip().lower())
39
+
40
+
41
+ def _lev(a, b) -> int:
42
+ """Levenshtein distance over any sequences (str or list)."""
43
+ if a == b:
44
+ return 0
45
+ la, lb = len(a), len(b)
46
+ if la == 0:
47
+ return lb
48
+ if lb == 0:
49
+ return la
50
+ prev = list(range(lb + 1))
51
+ for i in range(1, la + 1):
52
+ cur = [i]
53
+ ca = a[i - 1]
54
+ for j in range(1, lb + 1):
55
+ cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != b[j - 1])))
56
+ prev = cur
57
+ return prev[lb]
58
+
59
+
60
def cer(hyp: str, ref: str):
    """Character Error Rate of *hyp* against *ref*, both normalized first.

    Returns ``None`` when the normalized reference is empty; otherwise the edit
    distance divided by the reference length, capped at 1.0 and rounded to 4 places.
    """
    reference = _norm(ref)
    hypothesis = _norm(hyp)
    if not reference:
        return None
    rate = _lev(hypothesis, reference) / len(reference)
    return round(min(1.0, rate), 4)
65
+
66
+
67
def wer(hyp: str, ref: str):
    """Word Error Rate of *hyp* against *ref*, both normalized and split on spaces.

    Returns ``None`` when the reference has no words; otherwise the word-level edit
    distance divided by the reference word count, capped at 1.0 and rounded to 4 places.
    """
    ref_words = _norm(ref).split()
    hyp_words = _norm(hyp).split()
    if not ref_words:
        return None
    rate = _lev(hyp_words, ref_words) / len(ref_words)
    return round(min(1.0, rate), 4)
72
+
73
+
74
def _model_size(settings, backend_name: str):
    """Look up the model name, size and lab behind an OCR backend in the model registry.

    ``minicpm`` resolves to the first catalogue entry whose name starts with
    "MiniCPM-V" and ``cohere`` to the first starting with "Aya-Vision-8". Any other
    backend, or a missing entry, is reported under the "classic" lab with no size data.
    """
    from ..models_registry import model_catalog

    name_prefix = {"minicpm": "MiniCPM-V", "cohere": "Aya-Vision-8"}.get(backend_name)
    catalog = model_catalog(settings)

    match = None
    if name_prefix is not None:
        candidates = (
            {**model, "lab": lab["lab"]}
            for lab in catalog["labs"]
            for model in lab["models"]
        )
        match = next((c for c in candidates if c["name"].startswith(name_prefix)), None)

    if match:
        return {"model": match["name"], "params_b": match["params_b"],
                "size_gb": match["size_gb_int4"], "lab": match["lab"]}
    return {"model": backend_name, "params_b": None, "size_gb": None, "lab": "classic"}
88
+
89
+
90
def run_ocr_quality(settings, ocr_registry, router, metrics, db=None, rag_store=None,
                    samples=None) -> dict:
    """Benchmark every available OCR backend and return the quality report.

    For each backend and each sample found in ``settings.evals_dataset_dir`` the full
    pipeline is run, then the OCR text is scored against the ``.txt`` sidecar (CER/WER)
    and the extracted fields against the ``.gt.json`` ground truth.

    Args:
        settings: app settings; ``evals_dataset_dir`` is read here.
        ocr_registry: registry exposing ``available_names()`` and ``get(name)``.
        router, metrics, db, rag_store: passed through to ``process_document``.
        samples: sample ids to score for CER/WER; defaults to ``DEFAULT_SAMPLES``.
            ``FIELD_ONLY_SAMPLES`` are always added (once).

    Returns:
        A report dict with per-backend rows (averages + per-sample cases), the model
        size table, and the best backend for OCR text quality and for field accuracy.
    """
    from ..pipeline import process_document
    from evals import scorers

    # Accept any iterable of ids and avoid scoring a field-only sample twice when the
    # caller already listed it.
    samples = list(samples or DEFAULT_SAMPLES)
    all_samples = samples + [s for s in FIELD_ONLY_SAMPLES if s not in samples]
    ds = settings.evals_dataset_dir
    # real engines first, the sidecar reference last
    available = [n for n in ocr_registry.available_names() if n != "sidecar"] + ["sidecar"]

    log_event("info", "OCR quality benchmark started",
              backends=available, samples=all_samples)

    backend_rows = []
    for bname in available:
        backend = ocr_registry.get(bname)
        if not backend or not backend.available():
            continue
        cers, wers, exacts, f1s, lats, costs = [], [], [], [], [], []
        per_sample = []
        for sid in all_samples:
            doc = _find(ds, sid)
            if not doc:
                continue
            gt = _load_gt(ds, sid)
            t0 = time.perf_counter()
            run = process_document(doc, router=router, settings=settings, metrics=metrics,
                                   ocr_registry=ocr_registry, ocr_backend=bname,
                                   db=db, rag_store=rag_store, doc_id=f"q-{bname}-{sid}",
                                   mode="quality")
            # Stop the clock right after the pipeline: the pure-Python edit distance and
            # field scoring below are benchmark overhead, not backend latency.
            latency_ms = round((time.perf_counter() - t0) * 1000, 1)
            st = run["_state"]
            ocr_text = (st.get("ocr") or {}).get("text", "")
            ref = _ref_text(ds, sid)
            c = cer(ocr_text, ref) if ref else None
            w = wer(ocr_text, ref) if ref else None
            if gt:
                # keys starting with "_" (e.g. "_meta") are metadata, not fields to score
                expected = {k: v for k, v in gt.items() if not k.startswith("_")}
                score = scorers.score_document(st.get("extracted") or {}, expected)
            else:
                score = {}
            case = {"sample": sid, "cer": c, "wer": w,
                    "field_exact": score.get("exact_match"), "field_f1": score.get("field_f1"),
                    "latency_ms": latency_ms,
                    "cost_usd": run.get("total_cost_usd", 0.0),
                    "confidence": st.get("confidence")}
            per_sample.append(case)
            if c is not None:
                cers.append(c)
                wers.append(w)
            if score.get("exact_match") is not None:
                exacts.append(score["exact_match"])
                f1s.append(score.get("field_f1") or 0)
            lats.append(case["latency_ms"])
            costs.append(case["cost_usd"])
            log_event("info", f"OCR quality: {bname} on {sid}",
                      cer=c, wer=w, field_exact=score.get("exact_match"))

        # Average cost at 6-decimal precision directly; routing it through _avg would
        # round to 4 decimals first and truncate sub-cent costs.
        priced = [x for x in costs if x is not None]
        avg_cost = round(sum(priced) / len(priced), 6) if priced else 0.0

        size = _model_size(settings, bname)
        row = {
            "backend": bname, **size,
            "is_reference": bname == "sidecar",
            "cer": _avg(cers), "wer": _avg(wers),
            "field_exact_match": _avg(exacts), "field_f1": _avg(f1s),
            "avg_latency_ms": _avg(lats), "avg_cost_usd": avg_cost,
            "samples_scored": len(per_sample), "per_sample": per_sample,
        }
        backend_rows.append(row)

    # rank: best OCR text quality (lowest CER among real engines), best analysis (highest exact)
    real = [r for r in backend_rows if not r["is_reference"] and r["cer"] is not None]
    best_ocr = min(real, key=lambda r: r["cer"])["backend"] if real else None
    # rank only real engines for "best analysis" — sidecar is the reference text, not a
    # competing OCR backend, so it must never be crowned best.
    scored = [r for r in backend_rows
              if not r["is_reference"] and r["field_exact_match"] is not None]
    best_analysis = max(scored, key=lambda r: r["field_exact_match"])["backend"] if scored else None

    report = {
        "generated_at": time.time(),
        "note": "CER/WER vs .txt sidecar reference; field accuracy vs gt.json. "
                "sidecar = reference text source (CER≈0 by construction).",
        "models": _model_size_table(settings),
        "backends": backend_rows,
        "best_ocr_quality": best_ocr,
        "best_document_analysis": best_analysis,
    }
    if db is not None:
        try:
            db.audit("ocr_quality_benchmark",
                     detail={"best_ocr": best_ocr, "best_analysis": best_analysis,
                             "backends": [r["backend"] for r in backend_rows]})
        except Exception:
            # deliberate best-effort: a failed audit write must not discard the report
            pass
    log_event("info", "OCR quality benchmark complete",
              best_ocr=best_ocr, best_analysis=best_analysis)
    return report
179
+
180
+
181
def _model_size_table(settings):
    """Flatten the model catalogue into one row per model for the report.

    Each row holds the lab name plus the model's name, parameter count, int4 size,
    modality, role and availability flag.
    """
    from ..models_registry import model_catalog

    columns = ("name", "params_b", "size_gb_int4", "modality", "role", "available")
    table = []
    for lab in model_catalog(settings)["labs"]:
        for model in lab["models"]:
            entry = {"lab": lab["lab"]}
            entry.update((column, model[column]) for column in columns)
            table.append(entry)
    return table
185
+
186
+
187
+ def _avg(xs):
188
+ xs = [x for x in xs if x is not None]
189
+ return round(sum(xs) / len(xs), 4) if xs else None
190
+
191
+
192
+ def _find(ds: Path, sid: str):
193
+ for ext in (".png", ".jpg", ".jpeg", ".pdf"):
194
+ p = ds / f"{sid}{ext}"
195
+ if p.exists():
196
+ return p
197
+ return None
198
+
199
+
200
+ def _ref_text(ds: Path, sid: str):
201
+ p = ds / f"{sid}.txt"
202
+ return p.read_text(encoding="utf-8", errors="ignore") if p.exists() else None
203
+
204
+
205
+ def _load_gt(ds: Path, sid: str):
206
+ p = ds / f"{sid}.gt.json"
207
+ return json.loads(p.read_text()) if p.exists() else None
backend/app/pipeline/nodes.py CHANGED
@@ -139,14 +139,30 @@ def classify_node(state: dict, ctx: PipelineContext) -> dict:
139
  )
140
  resp = ctx.router.run(req, ctx.run_id)
141
  parsed = _parse_json(resp.text)
142
- doc_type = state.get("forced_doc_type") or parsed.get("doc_type", "invoice")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  if doc_type not in SCHEMA_BY_TYPE:
144
  doc_type = "invoice"
145
- conf = float(parsed.get("confidence", 0.6) or 0.6)
146
  return {
147
  "doc_type": doc_type,
148
  "classify_confidence": conf,
149
- "_summary": f"Classified as '{doc_type}' (conf={conf:.2f}) via {resp.model}",
150
  }
151
 
152
 
@@ -260,6 +276,14 @@ def normalize_node(state: dict, ctx: PipelineContext) -> dict:
260
  resp = ctx.router.run(req, ctx.run_id)
261
  normalized = _parse_json(resp.text) or extracted
262
  normalized = _coerce_to_schema(normalized, state["doc_type"])
 
 
 
 
 
 
 
 
263
  return {
264
  "normalized": normalized,
265
  "_summary": f"Normalized dates/currency/amounts via {resp.model}",
 
139
  )
140
  resp = ctx.router.run(req, ctx.run_id)
141
  parsed = _parse_json(resp.text)
142
+ llm_type = parsed.get("doc_type", "invoice")
143
+ conf = float(parsed.get("confidence", 0.6) or 0.6)
144
+
145
+ # Cross-check the LLM against the deterministic keyword heuristic. Small VLMs
146
+ # (8B) reliably confuse receipts/POs with invoices because totals+tax lines
147
+ # look alike; the heuristic keys off unambiguous markers ("receipt", "purchase
148
+ # order"). When the heuristic is highly confident and disagrees, it wins.
149
+ note = ""
150
+ try:
151
+ from ..extraction_heuristics import classify as _heur
152
+ h_type, h_conf = _heur(text)
153
+ if h_type in SCHEMA_BY_TYPE and h_type != llm_type and h_conf >= 0.85:
154
+ note = f" (heuristic override: {llm_type}→{h_type})"
155
+ llm_type, conf = h_type, max(conf, h_conf)
156
+ except Exception:
157
+ pass
158
+
159
+ doc_type = state.get("forced_doc_type") or llm_type
160
  if doc_type not in SCHEMA_BY_TYPE:
161
  doc_type = "invoice"
 
162
  return {
163
  "doc_type": doc_type,
164
  "classify_confidence": conf,
165
+ "_summary": f"Classified as '{doc_type}' (conf={conf:.2f}) via {resp.model}{note}",
166
  }
167
 
168
 
 
276
  resp = ctx.router.run(req, ctx.run_id)
277
  normalized = _parse_json(resp.text) or extracted
278
  normalized = _coerce_to_schema(normalized, state["doc_type"])
279
+ # Deterministic date hygiene: small LLMs sometimes leave a stray time component
280
+ # ("2026-06-02 14:37") or a non-ISO format on date fields — coerce to YYYY-MM-DD.
281
+ from ..extraction_heuristics import normalize_date as _nd
282
+ for k, v in list(normalized.items()):
283
+ if "date" in k and isinstance(v, str) and v:
284
+ iso = _nd(v)
285
+ if iso:
286
+ normalized[k] = iso
287
  return {
288
  "normalized": normalized,
289
  "_summary": f"Normalized dates/currency/amounts via {resp.model}",
backend/app/prompts/__init__.py CHANGED
@@ -10,13 +10,21 @@ into these prompts.
10
  """
11
  from __future__ import annotations
12
 
13
- PROMPT_VERSION = "v1"
14
 
15
  CLASSIFY_SYSTEM = """You are a document classification assistant for an enterprise \
16
  accounts-payable pipeline.
17
 
18
  Given the first portion of a document, classify it as exactly one of:
19
- invoice | purchase_order | contract | receipt | other
 
 
 
 
 
 
 
 
20
 
21
  Return ONLY a JSON object, no prose:
22
  {"doc_type": "<one of the above>", "confidence": <0.0-1.0>, "language": "<iso-639-1>"}
 
10
  """
11
  from __future__ import annotations
12
 
13
+ PROMPT_VERSION = "v2" # v2: receipt/invoice distinctions + subscription_memo in classify
14
 
15
  CLASSIFY_SYSTEM = """You are a document classification assistant for an enterprise \
16
  accounts-payable pipeline.
17
 
18
  Given the first portion of a document, classify it as exactly one of:
19
+ invoice | purchase_order | contract | receipt | subscription_memo | other
20
+
21
+ Distinctions that matter:
22
+ - receipt = proof of a COMPLETED payment: merchant/store header, register or
23
+ receipt number, payment method (VISA/cash/card), often "thank you". No due date.
24
+ - invoice = a REQUEST for future payment: invoice number, due date, bill-to,
25
+ remit-to. The word "Invoice" itself usually appears.
26
+ - purchase_order = a buyer ordering goods: PO number, ship-to, delivery date.
27
+ - subscription_memo = recurring service notice: plan, billing cycle, renewal date.
28
 
29
  Return ONLY a JSON object, no prose:
30
  {"doc_type": "<one of the above>", "confidence": <0.0-1.0>, "language": "<iso-639-1>"}
backend/app/providers/blackforest.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Black Forest Labs (FLUX) — image GENERATION provider.
2
+
3
+ FLUX is a ≤12B text-to-image model — NOT an OCR/VLM. In this pipeline its role is to
4
+ generate **synthetic test documents** (e.g. a noisy receipt photo) that we then run the
5
+ OCR backends against, to grow the quality benchmark beyond hand-built samples.
6
+
7
+ Backends (gated, graceful):
8
+ • BFL hosted API (api.bfl.ml) — set BFL_API_KEY
9
+ • local diffusers — pip install diffusers torch (heavy)
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import importlib.util
14
+ import json
15
+ import time
16
+ import urllib.request
17
+
18
+
19
class BlackForestProvider:
    """Image-generation provider backed by the BFL hosted API or local diffusers.

    The hosted API is used whenever an API key is configured; otherwise a local
    ``diffusers`` install is used if one can be found.
    """

    name = "blackforest"

    # Poll statuses after which the job will never become "Ready"; failing fast on
    # them avoids polling until the timeout expires.
    # NOTE(review): values taken from the BFL get_result status list — confirm.
    _FAILED_STATUSES = frozenset({"Error", "Request Moderated", "Content Moderated"})

    def __init__(self, api_key: str | None, model: str = "flux-dev") -> None:
        self.api_key = api_key
        self.model = model

    def available(self) -> bool:
        """Return True when an API key is set or the ``diffusers`` package can be found."""
        return bool(self.api_key) or importlib.util.find_spec("diffusers") is not None

    def generate_document(self, prompt: str, width: int = 1024, height: int = 1408,
                          timeout: int = 60) -> bytes:
        """Return PNG bytes of a generated synthetic document image.

        Raises:
            RuntimeError: if neither backend is enabled, or the hosted job fails.
            TimeoutError: if the hosted job is not ready within *timeout* seconds.
        """
        if self.api_key:
            return self._via_api(prompt, width, height, timeout)
        if importlib.util.find_spec("diffusers"):
            return self._via_diffusers(prompt, width, height)
        raise RuntimeError("Black Forest Labs not enabled (set BFL_API_KEY or install diffusers)")

    def _via_api(self, prompt, width, height, timeout) -> bytes:
        """Submit the job to the hosted API, poll until it is ready, and download the image."""
        from ..ocr.backends.minicpm import _ssl_context
        body = json.dumps({"prompt": prompt, "width": width, "height": height}).encode()
        req = urllib.request.Request(
            f"https://api.bfl.ml/v1/{self.model}", data=body,
            headers={"Content-Type": "application/json", "x-key": self.api_key})
        with urllib.request.urlopen(req, timeout=timeout, context=_ssl_context()) as r:
            poll_id = json.loads(r.read().decode())["id"]
        # poll for the result; a monotonic clock keeps the deadline immune to
        # wall-clock adjustments
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            pr = urllib.request.Request(f"https://api.bfl.ml/v1/get_result?id={poll_id}",
                                        headers={"x-key": self.api_key})
            with urllib.request.urlopen(pr, timeout=timeout, context=_ssl_context()) as r:
                res = json.loads(r.read().decode())
            status = res.get("status")
            if status == "Ready":
                url = res["result"]["sample"]
                with urllib.request.urlopen(url, timeout=timeout, context=_ssl_context()) as img:
                    return img.read()
            if status in self._FAILED_STATUSES:
                raise RuntimeError(f"FLUX generation failed: {status}")
            time.sleep(1.5)
        raise TimeoutError("FLUX generation timed out")

    def _via_diffusers(self, prompt, width, height) -> bytes:
        """Generate the image locally with the FLUX.1-schnell diffusers pipeline."""
        import io
        from diffusers import FluxPipeline
        import torch
        # NOTE(review): the local path always loads FLUX.1-schnell and ignores
        # self.model, which holds a hosted-API model name by default.
        pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell",
                                            torch_dtype=torch.bfloat16)
        img = pipe(prompt, width=width, height=height, num_inference_steps=4).images[0]
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        return buf.getvalue()
backend/evals/datasets/extreme_contract_fax.gt.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_type": "contract",
3
+ "contract_number": "MSA-2026-0481",
4
+ "title": "Master Services Agreement - Store Fit-Out Program",
5
+ "party_a": "Aperture Retail Group",
6
+ "party_b": "Halcyon Build Partners LLC",
7
+ "effective_date": "2026-03-01",
8
+ "expiration_date": "2029-02-28",
9
+ "contract_value": 1250000.0,
10
+ "currency": "USD",
11
+ "governing_law": "State of Ohio",
12
+ "auto_renew": false,
13
+ "termination_notice_days": 60,
14
+ "_meta": {
15
+ "doc_type": "contract",
16
+ "channel": "fax",
17
+ "difficulty": "extreme",
18
+ "skip_eval": true
19
+ }
20
+ }
backend/evals/datasets/extreme_contract_fax.png ADDED

Git LFS Details

  • SHA256: 312bded6c42d167ad483382c9e5ba5363e65758fc4d25c03aa8478bc9963943e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.8 MB
backend/evals/datasets/extreme_contract_fax.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MASTER SERVICES AGREEMENT - STORE FIT-OUT PROGRAM
2
+ Contract No: MSA-2026-0481
3
+ Party A: Aperture Retail Group Party B: Halcyon Build Partners LLC
4
+ Effective Date: 2026-03-01 Expiration Date: 2029-02-28
5
+ Total Contract Value: USD 1,250,000.00 Governing Law: State of Ohio
6
+ Auto-Renewal: NO Termination Notice: 60 days written notice
7
+ 1. SCOPE. Contractor shall furnish all labor, materials, supervision and
8
+ equipment required for the fit-out of retail premises identified in each
9
+ Statement of Work executed under this Agreement.
10
+ 2. TERM. This Agreement commences on the Effective Date and continues
11
+ until the Expiration Date unless terminated earlier per Section 9.
12
+ 3. COMPENSATION. Client shall pay Contractor fees not to exceed the
13
+ Total Contract Value, payable per approved milestone invoices Net 30.
14
+ 4. CHANGE ORDERS. No variation is binding unless documented in a
15
+ written change order signed by both parties' authorized representatives.
16
+ 5. WARRANTIES. Contractor warrants workmanship free of defects for
17
+ twenty-four (24) months following practical completion of each site.
18
+ 6. INSURANCE. Contractor shall maintain commercial general liability
19
+ coverage of not less than USD 5,000,000 per occurrence.
20
+ 7. CONFIDENTIALITY. Each party shall protect Confidential Information
21
+ with no less than reasonable care and use it solely for this Agreement.
22
+ 8. LIABILITY. Neither party is liable for indirect or consequential
23
+ damages; aggregate liability is capped at the Total Contract Value.
24
+ 9. TERMINATION. Either party may terminate for convenience upon sixty
25
+ (60) days written notice, or immediately for uncured material breach.
26
+ 10. GOVERNING LAW. This Agreement is governed by the laws of the
27
+ State of Ohio, excluding its conflict of law provisions.
backend/evals/datasets/extreme_po_collage.gt.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_type": "purchase_order",
3
+ "order_number": "PO-77RX-3309",
4
+ "order_date": "2026-05-21",
5
+ "delivery_date": "2026-06-15",
6
+ "vendor_name": "Nordic Fixture Works AB",
7
+ "buyer_name": "Aperture Retail Group",
8
+ "ship_to": "DC-7, 4420 Logistics Pkwy, Columbus OH",
9
+ "currency": "USD",
10
+ "payment_terms": "Net 45",
11
+ "subtotal": 9600.0,
12
+ "tax_amount": 792.0,
13
+ "total": 10392.0,
14
+ "line_items": [
15
+ {
16
+ "description": "SHELF UNIT S-200 heavy gauge",
17
+ "quantity": 24,
18
+ "unit_price": 189.0,
19
+ "line_total": 4536.0
20
+ },
21
+ {
22
+ "description": "LED STRIP 2m retail white",
23
+ "quantity": 60,
24
+ "unit_price": 22.4,
25
+ "line_total": 1344.0
26
+ },
27
+ {
28
+ "description": "ENDCAP DISPLAY birch finish",
29
+ "quantity": 12,
30
+ "unit_price": 310.0,
31
+ "line_total": 3720.0
32
+ }
33
+ ],
34
+ "_meta": {
35
+ "doc_type": "purchase_order",
36
+ "channel": "scanned",
37
+ "difficulty": "extreme",
38
+ "skip_eval": true
39
+ }
40
+ }
backend/evals/datasets/extreme_po_collage.png ADDED

Git LFS Details

  • SHA256: 78dd797bfa834ef942c6a2d25bec3aa0ced5d21e5dfe45252f55be95a1ef989a
  • Pointer size: 132 Bytes
  • Size of remote file: 3.67 MB
backend/evals/datasets/extreme_po_collage.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PURCHASE ORDER
2
+ Nordic Fixture Works AB
3
+ Industrigatan 14, Malmo SE
4
+ PO Number: PO-77RX-3309
5
+ Order Date: 2026-05-21
6
+ Delivery Date: 2026-06-15
7
+ Payment Terms: Net 45
8
+ Currency: USD
9
+ Buyer: Aperture Retail Group
10
+ Ship To: DC-7, 4420 Logistics Pkwy, Columbus OH
11
+ IMG DESCRIPTION QTY UNIT USD AMOUNT
12
+ SHELF UNIT S-200 heavy gauge 24 189.00 4,536.00
13
+ LED STRIP 2m retail white 60 22.40 1,344.00
14
+ ENDCAP DISPLAY birch finish 12 310.00 3,720.00
15
+ Subtotal: 9,600.00
16
+ Tax 8.25%: 792.00
17
+ TOTAL: 10,392.00 USD
18
+ *PO77RX3309*
19
+ APPROVED · OPS DESK
20
+ Authorized — K. Lindqvist, Procurement
backend/evals/datasets/extreme_receipt_photo.gt.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_type": "receipt",
3
+ "merchant": "BREW & BEAN COFFEE Co.",
4
+ "date": "2026-06-02",
5
+ "currency": "USD",
6
+ "subtotal": 30.75,
7
+ "tax_amount": 2.71,
8
+ "total": 33.46,
9
+ "payment_method": "VISA ****4421",
10
+ "line_items": [
11
+ {
12
+ "description": "Flat White",
13
+ "quantity": 2,
14
+ "unit_price": 4.75,
15
+ "line_total": 9.5
16
+ },
17
+ {
18
+ "description": "Butter Croissant",
19
+ "quantity": 3,
20
+ "unit_price": 3.25,
21
+ "line_total": 9.75
22
+ },
23
+ {
24
+ "description": "Cold Brew Growler",
25
+ "quantity": 1,
26
+ "unit_price": 14.0,
27
+ "line_total": 14.0
28
+ }
29
+ ],
30
+ "_meta": {
31
+ "doc_type": "receipt",
32
+ "channel": "photo",
33
+ "difficulty": "extreme",
34
+ "skip_eval": true
35
+ }
36
+ }
backend/evals/datasets/extreme_receipt_photo.png ADDED

Git LFS Details

  • SHA256: cbc2ac82892034a3ec5396ca7bab5ee9531bea1c47a1a1f237e7543f6d4a2633
  • Pointer size: 132 Bytes
  • Size of remote file: 2.29 MB
backend/evals/datasets/extreme_receipt_photo.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BREW & BEAN COFFEE Co.
2
+ 412 Harbor Lane, Portland OR
3
+ Receipt #R-88341 Reg 02
4
+ Date: 2026-06-02 14:37
5
+ Currency: USD
6
+ --------------------------------
7
+ Flat White 2 x 4.75 9.50
8
+ Butter Croissant 3 x 3.25 9.75
9
+ Cold Brew Growler 1 x 14.00 14.00
10
+ Loyalty discount -2.50
11
+ --------------------------------
12
+ Subtotal 30.75
13
+ Tax 8.8% 2.71
14
+ TOTAL 33.46
15
+ Payment: VISA ****4421
16
+ --------------------------------
17
+ Thank you! brewandbean.example
backend/evals/ocr_backend_report.json ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generated_at": 1781325013.2385988,
3
+ "mode": "prototype",
4
+ "samples": [
5
+ "invoice_scanned_basic",
6
+ "po_scanned"
7
+ ],
8
+ "available_backends": [
9
+ "tesseract",
10
+ "sidecar"
11
+ ],
12
+ "functional_backends": [
13
+ "tesseract",
14
+ "sidecar"
15
+ ],
16
+ "backends": [
17
+ {
18
+ "name": "minicpm",
19
+ "tier": "vlm",
20
+ "requires": "MINICPM_BASE_URL (+ MINICPM_API_KEY) \u2014 vLLM/llama.cpp serving MiniCPM-V-4.6",
21
+ "available": false,
22
+ "tested": false,
23
+ "ok": false,
24
+ "cases": []
25
+ },
26
+ {
27
+ "name": "cohere",
28
+ "tier": "local",
29
+ "requires": "transformers + COHERE_OCR_ENABLE=true (downloads COHERE_OCR_MODEL)",
30
+ "available": false,
31
+ "tested": false,
32
+ "ok": false,
33
+ "cases": []
34
+ },
35
+ {
36
+ "name": "llamaparse",
37
+ "tier": "api",
38
+ "requires": "llama-cloud-services + LLAMA_CLOUD_API_KEY",
39
+ "available": false,
40
+ "tested": false,
41
+ "ok": false,
42
+ "cases": []
43
+ },
44
+ {
45
+ "name": "tesseract",
46
+ "tier": "local",
47
+ "requires": "pytesseract + tesseract binary",
48
+ "available": true,
49
+ "tested": true,
50
+ "ok": true,
51
+ "cases": [
52
+ {
53
+ "sample": "invoice_scanned_basic",
54
+ "ok": true,
55
+ "engine": "tesseract",
56
+ "chars": 257,
57
+ "simulated": false,
58
+ "expected_found": [
59
+ "INVOICE",
60
+ "NORTHWIND",
61
+ "INV-7741",
62
+ "CONTOSO"
63
+ ],
64
+ "latency_ms": 412.7,
65
+ "excerpt": "INVOICE\n\nInvoice Number: INV-7741\nInvoice Date: 2026-03-22\nDue Date: 2026-04-21\nFrom: Northwind Traders\nBill To: Contoso Ltd\n\nCurrency: USD\nDescription aty unit",
66
+ "error": null
67
+ },
68
+ {
69
+ "sample": "po_scanned",
70
+ "ok": true,
71
+ "engine": "tesseract",
72
+ "chars": 329,
73
+ "simulated": false,
74
+ "expected_found": [
75
+ "PURCHASE",
76
+ "INITECH"
77
+ ],
78
+ "latency_ms": 440.5,
79
+ "excerpt": "PURCHASE ORDER\n\nPurchase Order Number: P0-100483\norder Date: 2026-04-11\n\nDelivery Date: 2026-05-01\n\nVendor: Initech Supplies\n\nBuyer: Contoso Ops\n\nShip To: 9 Mar",
80
+ "error": null
81
+ }
82
+ ]
83
+ },
84
+ {
85
+ "name": "easyocr",
86
+ "tier": "local",
87
+ "requires": "easyocr",
88
+ "available": false,
89
+ "tested": false,
90
+ "ok": false,
91
+ "cases": []
92
+ },
93
+ {
94
+ "name": "sidecar",
95
+ "tier": "offline",
96
+ "requires": "nothing (reads <stem>.txt sidecar)",
97
+ "available": true,
98
+ "tested": true,
99
+ "ok": true,
100
+ "cases": [
101
+ {
102
+ "sample": "invoice_scanned_basic",
103
+ "ok": true,
104
+ "engine": "sidecar-fallback",
105
+ "chars": 323,
106
+ "simulated": true,
107
+ "expected_found": [
108
+ "INVOICE",
109
+ "NORTHWIND",
110
+ "INV-7741",
111
+ "CONTOSO"
112
+ ],
113
+ "latency_ms": 0.3,
114
+ "excerpt": "INVOICE\n\nInvoice Number: INV-7741\nInvoice Date: 2026-03-22\nDue Date: 2026-04-21\nFrom: Northwind Traders\nBill To: Contoso Ltd\nCurrency: USD\n\nDescription ",
115
+ "error": null
116
+ },
117
+ {
118
+ "sample": "po_scanned",
119
+ "ok": true,
120
+ "engine": "sidecar-fallback",
121
+ "chars": 389,
122
+ "simulated": true,
123
+ "expected_found": [
124
+ "PURCHASE",
125
+ "INITECH",
126
+ "PO-100483"
127
+ ],
128
+ "latency_ms": 0.2,
129
+ "excerpt": "PURCHASE ORDER\n\nPurchase Order Number: PO-100483\nOrder Date: 2026-04-11\nDelivery Date: 2026-05-01\nVendor: Initech Supplies\nBuyer: Contoso Ops\nShip To: 9 Market ",
130
+ "error": null
131
+ }
132
+ ]
133
+ }
134
+ ]
135
+ }
backend/evals/ocr_quality_report.json ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generated_at": 1781326743.1110458,
3
+ "note": "CER/WER vs .txt sidecar reference; field accuracy vs gt.json. sidecar = reference text source (CER\u22480 by construction).",
4
+ "models": [
5
+ {
6
+ "lab": "OpenBMB",
7
+ "name": "MiniCPM-V-4.6",
8
+ "params_b": 8.0,
9
+ "size_gb_int4": 5.5,
10
+ "modality": "vision-language (OCR + reasoning)",
11
+ "role": "OCR backend + LLM extractor",
12
+ "available": true
13
+ },
14
+ {
15
+ "lab": "OpenBMB",
16
+ "name": "MiniCPM-o-4.5",
17
+ "params_b": 8.0,
18
+ "size_gb_int4": 5.5,
19
+ "modality": "omni (vision/audio) VLM",
20
+ "role": "alt OCR/VLM",
21
+ "available": true
22
+ },
23
+ {
24
+ "lab": "OpenBMB",
25
+ "name": "MiniCPM3-4B",
26
+ "params_b": 4.0,
27
+ "size_gb_int4": 2.8,
28
+ "modality": "text LLM (reasoning + function-calling, 32k ctx)",
29
+ "role": "ERP reasoning \u00b7 NLQ\u2192SQL \u00b7 report summarization (fine-tune target)",
30
+ "available": true
31
+ },
32
+ {
33
+ "lab": "Cohere",
34
+ "name": "Aya-Vision-8B",
35
+ "params_b": 8.0,
36
+ "size_gb_int4": 6.0,
37
+ "modality": "vision-language (OCR/VQA, 23 langs)",
38
+ "role": "OCR backend",
39
+ "available": false
40
+ },
41
+ {
42
+ "lab": "Cohere",
43
+ "name": "Aya-Vision-32B",
44
+ "params_b": 32.0,
45
+ "size_gb_int4": 18.0,
46
+ "modality": "vision-language (OCR/VQA)",
47
+ "role": "alt OCR backend (max-quality small)",
48
+ "available": false
49
+ },
50
+ {
51
+ "lab": "Cohere",
52
+ "name": "Command R7B",
53
+ "params_b": 7.0,
54
+ "size_gb_int4": 5.0,
55
+ "modality": "text LLM (RAG + tool-use + reasoning, 128k ctx)",
56
+ "role": "ERP RAG \u00b7 NLQ \u00b7 grounded reasoning",
57
+ "available": false
58
+ },
59
+ {
60
+ "lab": "Black Forest Labs",
61
+ "name": "FLUX.1 [dev]",
62
+ "params_b": 12.0,
63
+ "size_gb_int4": 12.0,
64
+ "modality": "text-to-image GENERATION",
65
+ "role": "synthetic test-document generator (not OCR)",
66
+ "available": false
67
+ },
68
+ {
69
+ "lab": "Black Forest Labs",
70
+ "name": "FLUX.1 [schnell]",
71
+ "params_b": 12.0,
72
+ "size_gb_int4": 12.0,
73
+ "modality": "text-to-image GENERATION (fast)",
74
+ "role": "synthetic test-document generator",
75
+ "available": false
76
+ }
77
+ ],
78
+ "backends": [
79
+ {
80
+ "backend": "minicpm",
81
+ "model": "MiniCPM-V-4.6",
82
+ "params_b": 8.0,
83
+ "size_gb": 5.5,
84
+ "lab": "OpenBMB",
85
+ "is_reference": false,
86
+ "cer": 0.0262,
87
+ "wer": 0.0876,
88
+ "field_exact_match": 0.907,
89
+ "field_f1": 0.9397,
90
+ "avg_latency_ms": 6524.8167,
91
+ "avg_cost_usd": 0.0002,
92
+ "samples_scored": 6,
93
+ "per_sample": [
94
+ {
95
+ "sample": "invoice_scanned_basic",
96
+ "cer": 0.0,
97
+ "wer": 0.0,
98
+ "field_exact": 0.889,
99
+ "field_f1": 0.9,
100
+ "latency_ms": 5560.8,
101
+ "cost_usd": 0.0001952,
102
+ "confidence": 0.7
103
+ },
104
+ {
105
+ "sample": "receipt_scanned",
106
+ "cer": 0.0942,
107
+ "wer": 0.3103,
108
+ "field_exact": 1.0,
109
+ "field_f1": 1.0,
110
+ "latency_ms": 4218.7,
111
+ "cost_usd": 0.0001883,
112
+ "confidence": 0.98
113
+ },
114
+ {
115
+ "sample": "po_scanned",
116
+ "cer": 0.0368,
117
+ "wer": 0.1277,
118
+ "field_exact": 1.0,
119
+ "field_f1": 1.0,
120
+ "latency_ms": 4404.9,
121
+ "cost_usd": 0.0001835,
122
+ "confidence": 0.98
123
+ },
124
+ {
125
+ "sample": "contract_scanned",
126
+ "cer": 0.0,
127
+ "wer": 0.0,
128
+ "field_exact": 0.636,
129
+ "field_f1": 0.8,
130
+ "latency_ms": 6532.2,
131
+ "cost_usd": 0.000166,
132
+ "confidence": 0.98
133
+ },
134
+ {
135
+ "sample": "subscription_memo_scanned",
136
+ "cer": 0.0,
137
+ "wer": 0.0,
138
+ "field_exact": 0.917,
139
+ "field_f1": 0.938,
140
+ "latency_ms": 5010.4,
141
+ "cost_usd": 0.0001924,
142
+ "confidence": 0.98
143
+ },
144
+ {
145
+ "sample": "complex_invoice_messy",
146
+ "cer": null,
147
+ "wer": null,
148
+ "field_exact": 1.0,
149
+ "field_f1": 1.0,
150
+ "latency_ms": 13421.9,
151
+ "cost_usd": 0.0004414,
152
+ "confidence": 0.98
153
+ }
154
+ ]
155
+ },
156
+ {
157
+ "backend": "tesseract",
158
+ "model": "tesseract",
159
+ "params_b": null,
160
+ "size_gb": null,
161
+ "lab": "classic",
162
+ "is_reference": false,
163
+ "cer": 0.1468,
164
+ "wer": 0.1848,
165
+ "field_exact_match": 0.907,
166
+ "field_f1": 0.9397,
167
+ "avg_latency_ms": 3436.8667,
168
+ "avg_cost_usd": 0.0001,
169
+ "samples_scored": 6,
170
+ "per_sample": [
171
+ {
172
+ "sample": "invoice_scanned_basic",
173
+ "cer": 0.1225,
174
+ "wer": 0.1389,
175
+ "field_exact": 0.889,
176
+ "field_f1": 0.9,
177
+ "latency_ms": 3698.6,
178
+ "cost_usd": 0.0001242,
179
+ "confidence": 0.68
180
+ },
181
+ {
182
+ "sample": "receipt_scanned",
183
+ "cer": 0.4555,
184
+ "wer": 0.5172,
185
+ "field_exact": 1.0,
186
+ "field_f1": 1.0,
187
+ "latency_ms": 2861.1,
188
+ "cost_usd": 0.0001207,
189
+ "confidence": 0.96
190
+ },
191
+ {
192
+ "sample": "po_scanned",
193
+ "cer": 0.0951,
194
+ "wer": 0.1489,
195
+ "field_exact": 1.0,
196
+ "field_f1": 1.0,
197
+ "latency_ms": 3390.6,
198
+ "cost_usd": 0.000118,
199
+ "confidence": 0.96
200
+ },
201
+ {
202
+ "sample": "contract_scanned",
203
+ "cer": 0.0,
204
+ "wer": 0.0,
205
+ "field_exact": 0.636,
206
+ "field_f1": 0.8,
207
+ "latency_ms": 2336.4,
208
+ "cost_usd": 7.97e-05,
209
+ "confidence": 0.96
210
+ },
211
+ {
212
+ "sample": "subscription_memo_scanned",
213
+ "cer": 0.061,
214
+ "wer": 0.119,
215
+ "field_exact": 0.917,
216
+ "field_f1": 0.938,
217
+ "latency_ms": 2243.8,
218
+ "cost_usd": 0.0001211,
219
+ "confidence": 0.96
220
+ },
221
+ {
222
+ "sample": "complex_invoice_messy",
223
+ "cer": null,
224
+ "wer": null,
225
+ "field_exact": 1.0,
226
+ "field_f1": 1.0,
227
+ "latency_ms": 6090.7,
228
+ "cost_usd": 0.0002828,
229
+ "confidence": 0.96
230
+ }
231
+ ]
232
+ },
233
+ {
234
+ "backend": "sidecar",
235
+ "model": "sidecar",
236
+ "params_b": null,
237
+ "size_gb": null,
238
+ "lab": "classic",
239
+ "is_reference": true,
240
+ "cer": 0.0,
241
+ "wer": 0.0,
242
+ "field_exact_match": 0.907,
243
+ "field_f1": 0.9397,
244
+ "avg_latency_ms": 3235.3167,
245
+ "avg_cost_usd": 0.0001,
246
+ "samples_scored": 6,
247
+ "per_sample": [
248
+ {
249
+ "sample": "invoice_scanned_basic",
250
+ "cer": 0.0,
251
+ "wer": 0.0,
252
+ "field_exact": 0.889,
253
+ "field_f1": 0.9,
254
+ "latency_ms": 1697.6,
255
+ "cost_usd": 9.45e-05,
256
+ "confidence": 0.66
257
+ },
258
+ {
259
+ "sample": "receipt_scanned",
260
+ "cer": 0.0,
261
+ "wer": 0.0,
262
+ "field_exact": 1.0,
263
+ "field_f1": 1.0,
264
+ "latency_ms": 2126.2,
265
+ "cost_usd": 0.0001212,
266
+ "confidence": 0.94
267
+ },
268
+ {
269
+ "sample": "po_scanned",
270
+ "cer": 0.0,
271
+ "wer": 0.0,
272
+ "field_exact": 1.0,
273
+ "field_f1": 1.0,
274
+ "latency_ms": 2194.4,
275
+ "cost_usd": 0.0001184,
276
+ "confidence": 0.94
277
+ },
278
+ {
279
+ "sample": "contract_scanned",
280
+ "cer": 0.0,
281
+ "wer": 0.0,
282
+ "field_exact": 0.636,
283
+ "field_f1": 0.8,
284
+ "latency_ms": 1522.1,
285
+ "cost_usd": 7.97e-05,
286
+ "confidence": 0.94
287
+ },
288
+ {
289
+ "sample": "subscription_memo_scanned",
290
+ "cer": 0.0,
291
+ "wer": 0.0,
292
+ "field_exact": 0.917,
293
+ "field_f1": 0.938,
294
+ "latency_ms": 1632.7,
295
+ "cost_usd": 9.16e-05,
296
+ "confidence": 0.94
297
+ },
298
+ {
299
+ "sample": "complex_invoice_messy",
300
+ "cer": null,
301
+ "wer": null,
302
+ "field_exact": 1.0,
303
+ "field_f1": 1.0,
304
+ "latency_ms": 10238.9,
305
+ "cost_usd": 0.0002297,
306
+ "confidence": 0.98
307
+ }
308
+ ]
309
+ }
310
+ ],
311
+ "best_ocr_quality": "minicpm",
312
+ "best_document_analysis": "minicpm"
313
+ }
backend/evals/report.json ADDED
@@ -0,0 +1,980 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aggregate": {
3
+ "overall": {
4
+ "documents": 13,
5
+ "exact_match": 0.932,
6
+ "field_f1": 0.938,
7
+ "line_item_f1": 0.667,
8
+ "financial_consistency_rate": 1.0,
9
+ "doc_type_accuracy": 1.0
10
+ },
11
+ "by_type": {
12
+ "contract": {
13
+ "documents": 2,
14
+ "exact_match": 0.818,
15
+ "field_f1": 0.856,
16
+ "financial_consistency_rate": 1.0
17
+ },
18
+ "invoice": {
19
+ "documents": 5,
20
+ "exact_match": 1.0,
21
+ "field_f1": 1.0,
22
+ "financial_consistency_rate": 1.0
23
+ },
24
+ "purchase_order": {
25
+ "documents": 2,
26
+ "exact_match": 0.863,
27
+ "field_f1": 0.925,
28
+ "financial_consistency_rate": 1.0
29
+ },
30
+ "receipt": {
31
+ "documents": 2,
32
+ "exact_match": 1.0,
33
+ "field_f1": 1.0,
34
+ "financial_consistency_rate": 1.0
35
+ },
36
+ "subscription_memo": {
37
+ "documents": 2,
38
+ "exact_match": 0.875,
39
+ "field_f1": 0.817,
40
+ "financial_consistency_rate": 1.0
41
+ }
42
+ },
43
+ "by_difficulty": {
44
+ "standard": {
45
+ "documents": 10,
46
+ "exact_match": 0.911,
47
+ "field_f1": 0.92,
48
+ "financial_consistency_rate": 1.0
49
+ },
50
+ "dense_table": {
51
+ "documents": 1,
52
+ "exact_match": 1.0,
53
+ "field_f1": 1.0,
54
+ "financial_consistency_rate": 1.0
55
+ },
56
+ "multicurrency": {
57
+ "documents": 1,
58
+ "exact_match": 1.0,
59
+ "field_f1": 1.0,
60
+ "financial_consistency_rate": 1.0
61
+ },
62
+ "missing_fields": {
63
+ "documents": 1,
64
+ "exact_match": 1.0,
65
+ "field_f1": 1.0,
66
+ "financial_consistency_rate": 1.0
67
+ }
68
+ }
69
+ },
70
+ "documents": [
71
+ {
72
+ "doc_id": "contract_msa_digital",
73
+ "predicted_type": "contract",
74
+ "difficulty": "standard",
75
+ "channel": "digital",
76
+ "confidence": 1.0,
77
+ "requires_review": false,
78
+ "cost_usd": 0.0,
79
+ "score": {
80
+ "doc_type": "contract",
81
+ "exact_match": 0.909,
82
+ "field_f1": 0.897,
83
+ "precision": 1.0,
84
+ "recall": 0.812,
85
+ "fields_scored": 11,
86
+ "line_item_f1": null,
87
+ "line_items_applicable": false,
88
+ "financial_consistent": true,
89
+ "per_field": {
90
+ "contract_number": {
91
+ "exact": true,
92
+ "pred": "MSA-2026-014",
93
+ "gt": "MSA-2026-014"
94
+ },
95
+ "title": {
96
+ "exact": false,
97
+ "pred": null,
98
+ "gt": "Master Services Agreement"
99
+ },
100
+ "party_a": {
101
+ "exact": true,
102
+ "pred": "Acme Industrial Supplies",
103
+ "gt": "Acme Industrial Supplies"
104
+ },
105
+ "party_b": {
106
+ "exact": true,
107
+ "pred": "Globex Corporation",
108
+ "gt": "Globex Corporation"
109
+ },
110
+ "effective_date": {
111
+ "exact": true,
112
+ "pred": "2026-01-01",
113
+ "gt": "2026-01-01"
114
+ },
115
+ "expiration_date": {
116
+ "exact": true,
117
+ "pred": "2027-12-31",
118
+ "gt": "2027-12-31"
119
+ },
120
+ "contract_value": {
121
+ "exact": true,
122
+ "pred": 250000.0,
123
+ "gt": 250000.0
124
+ },
125
+ "currency": {
126
+ "exact": true,
127
+ "pred": "USD",
128
+ "gt": "USD"
129
+ },
130
+ "governing_law": {
131
+ "exact": true,
132
+ "pred": "Delaware",
133
+ "gt": "Delaware"
134
+ },
135
+ "auto_renew": {
136
+ "exact": true,
137
+ "pred": true,
138
+ "gt": true
139
+ },
140
+ "termination_notice_days": {
141
+ "exact": true,
142
+ "pred": 60,
143
+ "gt": 60
144
+ }
145
+ }
146
+ }
147
+ },
148
+ {
149
+ "doc_id": "contract_scanned",
150
+ "predicted_type": "contract",
151
+ "difficulty": "standard",
152
+ "channel": "scanned",
153
+ "confidence": 0.8,
154
+ "requires_review": true,
155
+ "cost_usd": 0.0,
156
+ "score": {
157
+ "doc_type": "contract",
158
+ "exact_match": 0.727,
159
+ "field_f1": 0.815,
160
+ "precision": 1.0,
161
+ "recall": 0.688,
162
+ "fields_scored": 11,
163
+ "line_item_f1": null,
164
+ "line_items_applicable": false,
165
+ "financial_consistent": true,
166
+ "per_field": {
167
+ "contract_number": {
168
+ "exact": true,
169
+ "pred": "NDA-7781",
170
+ "gt": "NDA-7781"
171
+ },
172
+ "title": {
173
+ "exact": false,
174
+ "pred": null,
175
+ "gt": "Mutual Non-Disclosure Agreement"
176
+ },
177
+ "party_a": {
178
+ "exact": true,
179
+ "pred": "Stark Components",
180
+ "gt": "Stark Components"
181
+ },
182
+ "party_b": {
183
+ "exact": true,
184
+ "pred": "Wayne Enterprises",
185
+ "gt": "Wayne Enterprises"
186
+ },
187
+ "effective_date": {
188
+ "exact": true,
189
+ "pred": "2026-03-15",
190
+ "gt": "2026-03-15"
191
+ },
192
+ "expiration_date": {
193
+ "exact": true,
194
+ "pred": "2029-03-14",
195
+ "gt": "2029-03-14"
196
+ },
197
+ "contract_value": {
198
+ "exact": false,
199
+ "pred": null,
200
+ "gt": 0.0
201
+ },
202
+ "currency": {
203
+ "exact": false,
204
+ "pred": null,
205
+ "gt": "USD"
206
+ },
207
+ "governing_law": {
208
+ "exact": true,
209
+ "pred": "New York",
210
+ "gt": "New York"
211
+ },
212
+ "auto_renew": {
213
+ "exact": true,
214
+ "pred": false,
215
+ "gt": false
216
+ },
217
+ "termination_notice_days": {
218
+ "exact": true,
219
+ "pred": 30,
220
+ "gt": 30
221
+ }
222
+ }
223
+ }
224
+ },
225
+ {
226
+ "doc_id": "invoice_acme_digital",
227
+ "predicted_type": "invoice",
228
+ "difficulty": "standard",
229
+ "channel": "digital",
230
+ "confidence": 1.0,
231
+ "requires_review": false,
232
+ "cost_usd": 0.0,
233
+ "score": {
234
+ "doc_type": "invoice",
235
+ "exact_match": 1.0,
236
+ "field_f1": 1.0,
237
+ "precision": 1.0,
238
+ "recall": 1.0,
239
+ "fields_scored": 9,
240
+ "line_item_f1": 1.0,
241
+ "line_items_applicable": true,
242
+ "financial_consistent": true,
243
+ "per_field": {
244
+ "invoice_number": {
245
+ "exact": true,
246
+ "pred": "INV-1001",
247
+ "gt": "INV-1001"
248
+ },
249
+ "issue_date": {
250
+ "exact": true,
251
+ "pred": "2026-07-15",
252
+ "gt": "2026-07-15"
253
+ },
254
+ "due_date": {
255
+ "exact": true,
256
+ "pred": "2026-08-14",
257
+ "gt": "2026-08-14"
258
+ },
259
+ "vendor_name": {
260
+ "exact": true,
261
+ "pred": "Acme Industrial Supplies",
262
+ "gt": "Acme Industrial Supplies"
263
+ },
264
+ "bill_to_name": {
265
+ "exact": true,
266
+ "pred": "Globex Corporation",
267
+ "gt": "Globex Corporation"
268
+ },
269
+ "currency": {
270
+ "exact": true,
271
+ "pred": "USD",
272
+ "gt": "USD"
273
+ },
274
+ "subtotal": {
275
+ "exact": true,
276
+ "pred": 300.0,
277
+ "gt": 300.0
278
+ },
279
+ "tax_amount": {
280
+ "exact": true,
281
+ "pred": 30.0,
282
+ "gt": 30.0
283
+ },
284
+ "total": {
285
+ "exact": true,
286
+ "pred": 330.0,
287
+ "gt": 330.0
288
+ }
289
+ }
290
+ }
291
+ },
292
+ {
293
+ "doc_id": "invoice_dense_table",
294
+ "predicted_type": "invoice",
295
+ "difficulty": "dense_table",
296
+ "channel": "digital",
297
+ "confidence": 1.0,
298
+ "requires_review": false,
299
+ "cost_usd": 0.0,
300
+ "score": {
301
+ "doc_type": "invoice",
302
+ "exact_match": 1.0,
303
+ "field_f1": 1.0,
304
+ "precision": 1.0,
305
+ "recall": 1.0,
306
+ "fields_scored": 9,
307
+ "line_item_f1": 1.0,
308
+ "line_items_applicable": true,
309
+ "financial_consistent": true,
310
+ "per_field": {
311
+ "invoice_number": {
312
+ "exact": true,
313
+ "pred": "INV-9120",
314
+ "gt": "INV-9120"
315
+ },
316
+ "issue_date": {
317
+ "exact": true,
318
+ "pred": "2026-06-01",
319
+ "gt": "2026-06-01"
320
+ },
321
+ "due_date": {
322
+ "exact": true,
323
+ "pred": "2026-07-01",
324
+ "gt": "2026-07-01"
325
+ },
326
+ "vendor_name": {
327
+ "exact": true,
328
+ "pred": "Wayne Enterprises",
329
+ "gt": "Wayne Enterprises"
330
+ },
331
+ "bill_to_name": {
332
+ "exact": true,
333
+ "pred": "Oscorp",
334
+ "gt": "Oscorp"
335
+ },
336
+ "currency": {
337
+ "exact": true,
338
+ "pred": "USD",
339
+ "gt": "USD"
340
+ },
341
+ "subtotal": {
342
+ "exact": true,
343
+ "pred": 2140.0,
344
+ "gt": 2140.0
345
+ },
346
+ "tax_amount": {
347
+ "exact": true,
348
+ "pred": 214.0,
349
+ "gt": 214.0
350
+ },
351
+ "total": {
352
+ "exact": true,
353
+ "pred": 2354.0,
354
+ "gt": 2354.0
355
+ }
356
+ }
357
+ }
358
+ },
359
+ {
360
+ "doc_id": "invoice_globalparts_eur",
361
+ "predicted_type": "invoice",
362
+ "difficulty": "multicurrency",
363
+ "channel": "digital",
364
+ "confidence": 1.0,
365
+ "requires_review": false,
366
+ "cost_usd": 0.0,
367
+ "score": {
368
+ "doc_type": "invoice",
369
+ "exact_match": 1.0,
370
+ "field_f1": 1.0,
371
+ "precision": 1.0,
372
+ "recall": 1.0,
373
+ "fields_scored": 9,
374
+ "line_item_f1": 1.0,
375
+ "line_items_applicable": true,
376
+ "financial_consistent": true,
377
+ "per_field": {
378
+ "invoice_number": {
379
+ "exact": true,
380
+ "pred": "GP-2026-558",
381
+ "gt": "GP-2026-558"
382
+ },
383
+ "issue_date": {
384
+ "exact": true,
385
+ "pred": "2026-05-03",
386
+ "gt": "2026-05-03"
387
+ },
388
+ "due_date": {
389
+ "exact": true,
390
+ "pred": "2026-06-02",
391
+ "gt": "2026-06-02"
392
+ },
393
+ "vendor_name": {
394
+ "exact": true,
395
+ "pred": "GlobalParts GmbH",
396
+ "gt": "GlobalParts GmbH"
397
+ },
398
+ "bill_to_name": {
399
+ "exact": true,
400
+ "pred": "Initech LLC",
401
+ "gt": "Initech LLC"
402
+ },
403
+ "currency": {
404
+ "exact": true,
405
+ "pred": "EUR",
406
+ "gt": "EUR"
407
+ },
408
+ "subtotal": {
409
+ "exact": true,
410
+ "pred": 1840.0,
411
+ "gt": 1840.0
412
+ },
413
+ "tax_amount": {
414
+ "exact": true,
415
+ "pred": 349.6,
416
+ "gt": 349.6
417
+ },
418
+ "total": {
419
+ "exact": true,
420
+ "pred": 2189.6,
421
+ "gt": 2189.6
422
+ }
423
+ }
424
+ }
425
+ },
426
+ {
427
+ "doc_id": "invoice_missing_total",
428
+ "predicted_type": "invoice",
429
+ "difficulty": "missing_fields",
430
+ "channel": "digital",
431
+ "confidence": 0.72,
432
+ "requires_review": true,
433
+ "cost_usd": 0.0,
434
+ "score": {
435
+ "doc_type": "invoice",
436
+ "exact_match": 1.0,
437
+ "field_f1": 1.0,
438
+ "precision": 1.0,
439
+ "recall": 1.0,
440
+ "fields_scored": 6,
441
+ "line_item_f1": 1.0,
442
+ "line_items_applicable": true,
443
+ "financial_consistent": true,
444
+ "per_field": {
445
+ "invoice_number": {
446
+ "exact": true,
447
+ "pred": "INV-3300",
448
+ "gt": "INV-3300"
449
+ },
450
+ "issue_date": {
451
+ "exact": true,
452
+ "pred": "2026-02-10",
453
+ "gt": "2026-02-10"
454
+ },
455
+ "vendor_name": {
456
+ "exact": true,
457
+ "pred": "Stark Components",
458
+ "gt": "Stark Components"
459
+ },
460
+ "currency": {
461
+ "exact": true,
462
+ "pred": "USD",
463
+ "gt": "USD"
464
+ },
465
+ "subtotal": {
466
+ "exact": true,
467
+ "pred": 1200.0,
468
+ "gt": 1200.0
469
+ },
470
+ "tax_amount": {
471
+ "exact": true,
472
+ "pred": 96.0,
473
+ "gt": 96.0
474
+ }
475
+ }
476
+ }
477
+ },
478
+ {
479
+ "doc_id": "invoice_scanned_basic",
480
+ "predicted_type": "invoice",
481
+ "difficulty": "standard",
482
+ "channel": "scanned",
483
+ "confidence": 0.8,
484
+ "requires_review": true,
485
+ "cost_usd": 0.0,
486
+ "score": {
487
+ "doc_type": "invoice",
488
+ "exact_match": 1.0,
489
+ "field_f1": 1.0,
490
+ "precision": 1.0,
491
+ "recall": 1.0,
492
+ "fields_scored": 9,
493
+ "line_item_f1": 0.0,
494
+ "line_items_applicable": true,
495
+ "financial_consistent": true,
496
+ "per_field": {
497
+ "invoice_number": {
498
+ "exact": true,
499
+ "pred": "INV-7741",
500
+ "gt": "INV-7741"
501
+ },
502
+ "issue_date": {
503
+ "exact": true,
504
+ "pred": "2026-03-22",
505
+ "gt": "2026-03-22"
506
+ },
507
+ "due_date": {
508
+ "exact": true,
509
+ "pred": "2026-04-21",
510
+ "gt": "2026-04-21"
511
+ },
512
+ "vendor_name": {
513
+ "exact": true,
514
+ "pred": "Northwind Traders",
515
+ "gt": "Northwind Traders"
516
+ },
517
+ "bill_to_name": {
518
+ "exact": true,
519
+ "pred": "Contoso Ltd",
520
+ "gt": "Contoso Ltd"
521
+ },
522
+ "currency": {
523
+ "exact": true,
524
+ "pred": "USD",
525
+ "gt": "USD"
526
+ },
527
+ "subtotal": {
528
+ "exact": true,
529
+ "pred": 540.0,
530
+ "gt": 540.0
531
+ },
532
+ "tax_amount": {
533
+ "exact": true,
534
+ "pred": 43.2,
535
+ "gt": 43.2
536
+ },
537
+ "total": {
538
+ "exact": true,
539
+ "pred": 583.2,
540
+ "gt": 583.2
541
+ }
542
+ }
543
+ }
544
+ },
545
+ {
546
+ "doc_id": "po_acme_digital",
547
+ "predicted_type": "purchase_order",
548
+ "difficulty": "standard",
549
+ "channel": "digital",
550
+ "confidence": 1.0,
551
+ "requires_review": false,
552
+ "cost_usd": 0.0,
553
+ "score": {
554
+ "doc_type": "purchase_order",
555
+ "exact_match": 0.909,
556
+ "field_f1": 0.941,
557
+ "precision": 0.941,
558
+ "recall": 0.941,
559
+ "fields_scored": 11,
560
+ "line_item_f1": 1.0,
561
+ "line_items_applicable": true,
562
+ "financial_consistent": true,
563
+ "per_field": {
564
+ "order_number": {
565
+ "exact": false,
566
+ "pred": "Purchase",
567
+ "gt": "PO-100481"
568
+ },
569
+ "order_date": {
570
+ "exact": true,
571
+ "pred": "2026-07-02",
572
+ "gt": "2026-07-02"
573
+ },
574
+ "delivery_date": {
575
+ "exact": true,
576
+ "pred": "2026-07-20",
577
+ "gt": "2026-07-20"
578
+ },
579
+ "vendor_name": {
580
+ "exact": true,
581
+ "pred": "Acme Industrial",
582
+ "gt": "Acme Industrial"
583
+ },
584
+ "buyer_name": {
585
+ "exact": true,
586
+ "pred": "Globex Procurement",
587
+ "gt": "Globex Procurement"
588
+ },
589
+ "ship_to": {
590
+ "exact": true,
591
+ "pred": "12 Industrial Way, Springfield",
592
+ "gt": "12 Industrial Way, Springfield"
593
+ },
594
+ "currency": {
595
+ "exact": true,
596
+ "pred": "USD",
597
+ "gt": "USD"
598
+ },
599
+ "subtotal": {
600
+ "exact": true,
601
+ "pred": 12000.0,
602
+ "gt": 12000.0
603
+ },
604
+ "tax_amount": {
605
+ "exact": true,
606
+ "pred": 450.0,
607
+ "gt": 450.0
608
+ },
609
+ "total": {
610
+ "exact": true,
611
+ "pred": 12450.0,
612
+ "gt": 12450.0
613
+ },
614
+ "payment_terms": {
615
+ "exact": true,
616
+ "pred": "Net 30",
617
+ "gt": "Net 30"
618
+ }
619
+ }
620
+ }
621
+ },
622
+ {
623
+ "doc_id": "po_scanned",
624
+ "predicted_type": "purchase_order",
625
+ "difficulty": "standard",
626
+ "channel": "scanned",
627
+ "confidence": 0.52,
628
+ "requires_review": true,
629
+ "cost_usd": 0.0,
630
+ "score": {
631
+ "doc_type": "purchase_order",
632
+ "exact_match": 0.818,
633
+ "field_f1": 0.909,
634
+ "precision": 0.938,
635
+ "recall": 0.882,
636
+ "fields_scored": 11,
637
+ "line_item_f1": 0.0,
638
+ "line_items_applicable": true,
639
+ "financial_consistent": true,
640
+ "per_field": {
641
+ "order_number": {
642
+ "exact": false,
643
+ "pred": "Purchase",
644
+ "gt": "PO-100483"
645
+ },
646
+ "order_date": {
647
+ "exact": true,
648
+ "pred": "2026-04-11",
649
+ "gt": "2026-04-11"
650
+ },
651
+ "delivery_date": {
652
+ "exact": true,
653
+ "pred": "2026-05-01",
654
+ "gt": "2026-05-01"
655
+ },
656
+ "vendor_name": {
657
+ "exact": true,
658
+ "pred": "Initech Supplies",
659
+ "gt": "Initech Supplies"
660
+ },
661
+ "buyer_name": {
662
+ "exact": true,
663
+ "pred": "Contoso Ops",
664
+ "gt": "Contoso Ops"
665
+ },
666
+ "ship_to": {
667
+ "exact": true,
668
+ "pred": "9 Market St, Metropolis",
669
+ "gt": "9 Market St, Metropolis"
670
+ },
671
+ "currency": {
672
+ "exact": true,
673
+ "pred": "USD",
674
+ "gt": "USD"
675
+ },
676
+ "subtotal": {
677
+ "exact": true,
678
+ "pred": 900.0,
679
+ "gt": 900.0
680
+ },
681
+ "tax_amount": {
682
+ "exact": true,
683
+ "pred": 80.0,
684
+ "gt": 80.0
685
+ },
686
+ "total": {
687
+ "exact": false,
688
+ "pred": null,
689
+ "gt": 980.0
690
+ },
691
+ "payment_terms": {
692
+ "exact": true,
693
+ "pred": "Net 15",
694
+ "gt": "Net 15"
695
+ }
696
+ }
697
+ }
698
+ },
699
+ {
700
+ "doc_id": "receipt_digital",
701
+ "predicted_type": "receipt",
702
+ "difficulty": "standard",
703
+ "channel": "digital",
704
+ "confidence": 1.0,
705
+ "requires_review": false,
706
+ "cost_usd": 0.0,
707
+ "score": {
708
+ "doc_type": "receipt",
709
+ "exact_match": 1.0,
710
+ "field_f1": 1.0,
711
+ "precision": 1.0,
712
+ "recall": 1.0,
713
+ "fields_scored": 7,
714
+ "line_item_f1": 1.0,
715
+ "line_items_applicable": true,
716
+ "financial_consistent": true,
717
+ "per_field": {
718
+ "merchant": {
719
+ "exact": true,
720
+ "pred": "City Hardware",
721
+ "gt": "City Hardware"
722
+ },
723
+ "date": {
724
+ "exact": true,
725
+ "pred": "2026-06-18",
726
+ "gt": "2026-06-18"
727
+ },
728
+ "currency": {
729
+ "exact": true,
730
+ "pred": "USD",
731
+ "gt": "USD"
732
+ },
733
+ "subtotal": {
734
+ "exact": true,
735
+ "pred": 47.0,
736
+ "gt": 47.0
737
+ },
738
+ "tax_amount": {
739
+ "exact": true,
740
+ "pred": 3.76,
741
+ "gt": 3.76
742
+ },
743
+ "total": {
744
+ "exact": true,
745
+ "pred": 50.76,
746
+ "gt": 50.76
747
+ },
748
+ "payment_method": {
749
+ "exact": true,
750
+ "pred": "Visa card ending 4242",
751
+ "gt": "Visa card ending 4242"
752
+ }
753
+ }
754
+ }
755
+ },
756
+ {
757
+ "doc_id": "receipt_scanned",
758
+ "predicted_type": "receipt",
759
+ "difficulty": "standard",
760
+ "channel": "scanned",
761
+ "confidence": 0.8,
762
+ "requires_review": true,
763
+ "cost_usd": 0.0,
764
+ "score": {
765
+ "doc_type": "receipt",
766
+ "exact_match": 1.0,
767
+ "field_f1": 1.0,
768
+ "precision": 1.0,
769
+ "recall": 1.0,
770
+ "fields_scored": 7,
771
+ "line_item_f1": 0.0,
772
+ "line_items_applicable": true,
773
+ "financial_consistent": true,
774
+ "per_field": {
775
+ "merchant": {
776
+ "exact": true,
777
+ "pred": "QuickMart",
778
+ "gt": "QuickMart"
779
+ },
780
+ "date": {
781
+ "exact": true,
782
+ "pred": "2026-05-30",
783
+ "gt": "2026-05-30"
784
+ },
785
+ "currency": {
786
+ "exact": true,
787
+ "pred": "USD",
788
+ "gt": "USD"
789
+ },
790
+ "subtotal": {
791
+ "exact": true,
792
+ "pred": 23.5,
793
+ "gt": 23.5
794
+ },
795
+ "tax_amount": {
796
+ "exact": true,
797
+ "pred": 1.88,
798
+ "gt": 1.88
799
+ },
800
+ "total": {
801
+ "exact": true,
802
+ "pred": 25.38,
803
+ "gt": 25.38
804
+ },
805
+ "payment_method": {
806
+ "exact": true,
807
+ "pred": "Cash",
808
+ "gt": "Cash"
809
+ }
810
+ }
811
+ }
812
+ },
813
+ {
814
+ "doc_id": "subscription_memo_pos",
815
+ "predicted_type": "subscription_memo",
816
+ "difficulty": "standard",
817
+ "channel": "digital",
818
+ "confidence": 1.0,
819
+ "requires_review": false,
820
+ "cost_usd": 0.0,
821
+ "score": {
822
+ "doc_type": "subscription_memo",
823
+ "exact_match": 0.917,
824
+ "field_f1": 0.875,
825
+ "precision": 0.933,
826
+ "recall": 0.824,
827
+ "fields_scored": 12,
828
+ "line_item_f1": null,
829
+ "line_items_applicable": false,
830
+ "financial_consistent": true,
831
+ "per_field": {
832
+ "memo_number": {
833
+ "exact": true,
834
+ "pred": "SUB-2026-0091",
835
+ "gt": "SUB-2026-0091"
836
+ },
837
+ "subscription_name": {
838
+ "exact": false,
839
+ "pred": "MEMO",
840
+ "gt": "POS Cloud Platform"
841
+ },
842
+ "vendor_name": {
843
+ "exact": true,
844
+ "pred": "Initech Supplies",
845
+ "gt": "Initech Supplies"
846
+ },
847
+ "account_id": {
848
+ "exact": true,
849
+ "pred": "ACC-55821",
850
+ "gt": "ACC-55821"
851
+ },
852
+ "plan": {
853
+ "exact": true,
854
+ "pred": "Enterprise (500 lanes)",
855
+ "gt": "Enterprise (500 lanes)"
856
+ },
857
+ "billing_cycle": {
858
+ "exact": true,
859
+ "pred": "annual",
860
+ "gt": "annual"
861
+ },
862
+ "start_date": {
863
+ "exact": true,
864
+ "pred": "2025-08-01",
865
+ "gt": "2025-08-01"
866
+ },
867
+ "renewal_date": {
868
+ "exact": true,
869
+ "pred": "2026-08-01",
870
+ "gt": "2026-08-01"
871
+ },
872
+ "amount": {
873
+ "exact": true,
874
+ "pred": 84000.0,
875
+ "gt": 84000.0
876
+ },
877
+ "currency": {
878
+ "exact": true,
879
+ "pred": "USD",
880
+ "gt": "USD"
881
+ },
882
+ "auto_renew": {
883
+ "exact": true,
884
+ "pred": true,
885
+ "gt": true
886
+ },
887
+ "status": {
888
+ "exact": true,
889
+ "pred": "active",
890
+ "gt": "active"
891
+ }
892
+ }
893
+ }
894
+ },
895
+ {
896
+ "doc_id": "subscription_memo_scanned",
897
+ "predicted_type": "subscription_memo",
898
+ "difficulty": "standard",
899
+ "channel": "scanned",
900
+ "confidence": 0.8,
901
+ "requires_review": true,
902
+ "cost_usd": 0.0,
903
+ "score": {
904
+ "doc_type": "subscription_memo",
905
+ "exact_match": 0.833,
906
+ "field_f1": 0.759,
907
+ "precision": 0.846,
908
+ "recall": 0.688,
909
+ "fields_scored": 12,
910
+ "line_item_f1": null,
911
+ "line_items_applicable": false,
912
+ "financial_consistent": true,
913
+ "per_field": {
914
+ "memo_number": {
915
+ "exact": true,
916
+ "pred": "SUB-2026-0145",
917
+ "gt": "SUB-2026-0145"
918
+ },
919
+ "subscription_name": {
920
+ "exact": false,
921
+ "pred": "MEMO",
922
+ "gt": "Store Wi-Fi & Analytics"
923
+ },
924
+ "vendor_name": {
925
+ "exact": true,
926
+ "pred": "GlobalParts GmbH",
927
+ "gt": "GlobalParts GmbH"
928
+ },
929
+ "account_id": {
930
+ "exact": true,
931
+ "pred": "ACC-77310",
932
+ "gt": "ACC-77310"
933
+ },
934
+ "plan": {
935
+ "exact": true,
936
+ "pred": "Standard",
937
+ "gt": "Standard"
938
+ },
939
+ "billing_cycle": {
940
+ "exact": true,
941
+ "pred": "monthly",
942
+ "gt": "monthly"
943
+ },
944
+ "start_date": {
945
+ "exact": true,
946
+ "pred": "2026-01-15",
947
+ "gt": "2026-01-15"
948
+ },
949
+ "renewal_date": {
950
+ "exact": true,
951
+ "pred": "2026-07-15",
952
+ "gt": "2026-07-15"
953
+ },
954
+ "amount": {
955
+ "exact": true,
956
+ "pred": 2500.0,
957
+ "gt": 2500.0
958
+ },
959
+ "currency": {
960
+ "exact": true,
961
+ "pred": "EUR",
962
+ "gt": "EUR"
963
+ },
964
+ "auto_renew": {
965
+ "exact": false,
966
+ "pred": true,
967
+ "gt": false
968
+ },
969
+ "status": {
970
+ "exact": true,
971
+ "pred": "pending",
972
+ "gt": "pending"
973
+ }
974
+ }
975
+ }
976
+ }
977
+ ],
978
+ "routing_policy": "offline",
979
+ "active_tier": "offline"
980
+ }
backend/evals/run.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Eval runner.
2
+
3
+ Discovers <id>.gt.json files, runs the IDP pipeline on each paired document,
4
+ scores the prediction, and prints a per-type/per-difficulty report. Also writes
5
+ backend/evals/report.json and records rows in the metrics DB (mode='eval') so the
6
+ dashboard's Evals tab renders the same numbers.
7
+
8
+ Usage:
9
+ python -m evals.run # full suite (configured router)
10
+ python -m evals.run --type invoice # filter by doc type
11
+ python -m evals.run --policy offline # force a routing policy
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import sys
18
+ from pathlib import Path
19
+
20
+ # allow `python -m evals.run` from backend/ and `python evals/run.py`
21
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
22
+
23
+ from app.config import get_settings # noqa: E402
24
+ from app.metrics import MetricsStore # noqa: E402
25
+ from app.pipeline import process_document # noqa: E402
26
+ from app.providers import build_registry # noqa: E402
27
+ from app.router import ModelRouter # noqa: E402
28
+ from evals import scorers # noqa: E402
29
+
30
# Document extensions tried, in priority order, when pairing a ground truth
# with its source document.
DOC_EXTS = (".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff")


def discover(dataset_dir: Path, type_filter: str | None) -> list[tuple[Path, dict]]:
    """Collect the (document, ground-truth) pairs to evaluate.

    Every ``<stem>.gt.json`` in *dataset_dir* is paired with the first existing
    ``<stem><ext>`` file, trying the extensions in ``DOC_EXTS`` order. Ground
    truths flagged ``_meta.skip_eval`` or whose ``doc_type`` differs from
    *type_filter* are dropped; ground truths without a document are reported
    on stdout and skipped.

    Args:
        dataset_dir: Directory holding both the ground-truth and document files.
        type_filter: When truthy, keep only cases whose ``doc_type`` equals it.

    Returns:
        ``(document_path, ground_truth)`` tuples, ordered by ground-truth path.
    """
    suffix = ".gt.json"
    cases: list[tuple[Path, dict]] = []
    for gt_path in sorted(dataset_dir.glob(f"*{suffix}")):
        # Decode explicitly as UTF-8: relying on the locale default can garble
        # or reject non-ASCII ground-truth values on non-UTF-8 platforms.
        gt = json.loads(gt_path.read_text(encoding="utf-8"))
        if gt.get("_meta", {}).get("skip_eval"):
            continue  # showcase-only docs (e.g. the complex form) aren't scored here
        if type_filter and gt.get("doc_type") != type_filter:
            continue
        stem = gt_path.name[: -len(suffix)]
        candidates = (dataset_dir / f"{stem}{ext}" for ext in DOC_EXTS)
        doc = next((cand for cand in candidates if cand.exists()), None)
        if doc is None:
            print(f" ! no document file for {stem}, skipping")
            continue
        cases.append((doc, gt))
    return cases
53
+
54
+
55
def run_suite(type_filter: str | None = None, policy: str | None = None) -> dict:
    """Run the pipeline over every discovered eval case and build the report.

    Each document is processed with ``mode="eval"``, its extraction is scored
    against the ground truth (keys starting with ``_`` removed), and the
    per-document rows are aggregated. The report is written to the first of
    ``settings.eval_report_committed`` / ``settings.eval_report_writable`` that
    accepts the write.

    Args:
        type_filter: Forwarded to ``discover`` to restrict cases by doc type.
        policy: When truthy, replaces ``settings.routing_policy`` for this run.

    Returns:
        A dict with ``aggregate``, ``documents``, ``routing_policy`` and
        ``active_tier``; an empty dict when no eval cases were found.
    """
    settings = get_settings()
    if policy:
        # NOTE(review): this assigns on the object get_settings() returns; if
        # that object is shared, the override outlives this call — confirm.
        settings.routing_policy = policy
    registry = build_registry(settings)
    metrics = MetricsStore(settings.metrics_db_path)
    router = ModelRouter(registry, settings, metrics)

    cases = discover(settings.evals_dataset_dir, type_filter)
    if not cases:
        print("No eval cases found. Run: python scripts/generate_samples.py")
        return {}

    results = []
    for doc_path, gt in cases:
        meta = gt.get("_meta", {})
        clean_gt = {k: v for k, v in gt.items() if not k.startswith("_")}
        run = process_document(
            doc_path, router=router, settings=settings, metrics=metrics,
            doc_id=doc_path.stem, channel=meta.get("channel"),
            difficulty=meta.get("difficulty"), mode="eval",
            # let the classifier do its job; do NOT force the type (we score it)
        )
        state = run["_state"]
        pred = state["extracted"] or {}
        score = scorers.score_document(pred, clean_gt)
        results.append({
            "doc_id": doc_path.stem,
            "predicted_type": state["doc_type"],
            "difficulty": meta.get("difficulty", "n/a"),
            "channel": meta.get("channel", "n/a"),
            "confidence": state["confidence"],
            "requires_review": state["requires_review"],
            "cost_usd": run["total_cost_usd"],
            "score": score,
        })
    agg = scorers.aggregate(results)
    report = {"aggregate": agg, "documents": results,
              "routing_policy": settings.routing_policy,
              "active_tier": registry.capabilities()["active_tier"]}
    # Write to a writable location (committed copy locally, /tmp on serverless).
    payload = json.dumps(report, indent=2)
    failures = []
    for out_path in (settings.eval_report_committed, settings.eval_report_writable):
        try:
            out_path.write_text(payload)
        except OSError as exc:
            failures.append((out_path, exc))
            continue
        break
    else:
        # Every candidate failed: say so instead of silently dropping the
        # report. The report is still returned to the caller.
        for out_path, exc in failures:
            print(f" ! could not write eval report to {out_path}: {exc}")
        print(" ! eval report was not persisted")
    return report
102
+
103
+
104
def _print(report: dict) -> None:
    """Print a console summary of a report produced by ``run_suite``.

    Does nothing for an empty report. Otherwise prints the overall metrics,
    then one table grouped by document type and one grouped by difficulty.
    """
    if not report:
        return

    agg = report["aggregate"]
    overall = agg["overall"]
    heavy_rule = "=" * 64
    light_rule = "-" * 64

    def table(title, groups):
        # One header row followed by one row per group.
        print(f" {title:<18}{'docs':>5}{'exact':>9}{'F1':>9}{'fin-ok':>9}")
        for label, stats in groups.items():
            print(f" {label:<18}{stats['documents']:>5}{_pct(stats['exact_match']):>9}"
                  f"{_pct(stats['field_f1']):>9}{_pct(stats['financial_consistency_rate']):>9}")

    print("\n" + heavy_rule)
    print(f" IDP EVAL REPORT (tier={report['active_tier']}, policy={report['routing_policy']})")
    print(heavy_rule)
    print(f" documents: {overall['documents']}")
    summary_rows = (
        (" doc-type accuracy: ", "doc_type_accuracy"),
        (" field exact-match: ", "exact_match"),
        (" field F1: ", "field_f1"),
        (" line-item F1: ", "line_item_f1"),
        (" financial consistency:", "financial_consistency_rate"),
    )
    for prefix, metric in summary_rows:
        print(f"{prefix}{_pct(overall[metric])}")
    print(light_rule)
    table("by type", agg["by_type"])
    print(light_rule)
    table("by difficulty", agg["by_difficulty"])
    print(heavy_rule)
    print(" report → backend/evals/report.json\n")
130
+
131
+
132
def _pct(v) -> str:
    """Format a 0-1 ratio as a percentage with one decimal; ``None`` becomes "n/a"."""
    if v is None:
        return "n/a"
    return f"{v*100:.1f}%"
134
+
135
+
136
def main() -> None:
    """Parse the command-line flags, run the eval suite and print its report."""
    parser = argparse.ArgumentParser(description="Run the IDP eval suite.")
    parser.add_argument("--type", dest="type_filter", default=None)
    parser.add_argument(
        "--policy",
        dest="policy",
        default=None,
        choices=["auto", "cheap", "smart", "offline"],
    )
    options = parser.parse_args()
    _print(run_suite(options.type_filter, options.policy))


if __name__ == "__main__":
    main()
backend/evals/scorers.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scorers: turn (prediction, ground_truth) into the metrics in docs/EVALS.md."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Any
6
+
7
# Scalar fields scored per document type. Only the fields listed here (and
# present in the ground truth) are considered by ``field_scores``.
SCALAR_FIELDS = {
    "invoice": [
        "invoice_number", "issue_date", "due_date", "vendor_name",
        "bill_to_name", "currency", "subtotal", "tax_amount", "total",
    ],
    "purchase_order": [
        "order_number", "order_date", "delivery_date", "vendor_name",
        "buyer_name", "ship_to", "currency", "subtotal", "tax_amount",
        "total", "payment_terms",
    ],
    "contract": [
        "contract_number", "title", "party_a", "party_b", "effective_date",
        "expiration_date", "contract_value", "currency", "governing_law",
        "auto_renew", "termination_notice_days",
    ],
    "receipt": [
        "merchant", "date", "currency", "subtotal", "tax_amount", "total",
        "payment_method",
    ],
    "subscription_memo": [
        "memo_number", "subscription_name", "vendor_name", "account_id",
        "plan", "billing_cycle", "start_date", "renewal_date", "amount",
        "currency", "auto_renew", "status",
    ],
}
23
+
24
+
25
# Plain or thousands-separated decimal text. A leading zero is accepted only as
# the entire integer part, so zero-padded identifiers ("00123") stay text.
_NUMERIC_TEXT_RE = re.compile(r"-?(?:0|[1-9]\d{0,2}(?:,\d{3})+|[1-9]\d*)(?:\.\d+)?")


def _norm_scalar(v: Any) -> str:
    """Return the canonical string used to compare one scalar field value.

    * ``None`` becomes ``""``.
    * Booleans become ``"true"`` / ``"false"``.
    * Numbers are rendered with two decimals (``2500`` -> ``"2500.00"``).
    * Text that is a number once currency symbols are removed (``"2500"``,
      ``"$1,234.50"``) is rendered the same way as the number itself, so a
      value extracted as text still equals its numeric ground truth.
    * Any other text is trimmed and lower-cased, runs of whitespace/commas are
      collapsed to a single space, and ``$``, ``€``, ``£`` are removed.
    """
    if v is None:
        return ""
    if isinstance(v, bool):  # must precede the numeric branch: bool is an int
        return str(v).lower()
    if isinstance(v, (int, float)):
        return f"{float(v):.2f}"

    text = str(v).strip().lower()

    # Numeric text gets the numeric canonical form. Without this, "2500" and
    # 2500.0 never compared equal, and "1,234.50" degraded to "1 234.50".
    bare = text.replace("$", "").replace("€", "").replace("£", "").strip()
    if _NUMERIC_TEXT_RE.fullmatch(bare):
        return f"{float(bare.replace(',', '')):.2f}"

    text = re.sub(r"[\s,]+", " ", text)
    # strip currency symbols for value comparison
    text = text.replace("$", "").replace("€", "").replace("£", "")
    return text.strip()
37
+
38
+
39
def _tokens(v: Any) -> list[str]:
    """Split the normalized form of *v* on whitespace, dropping empty pieces."""
    normalized = _norm_scalar(v)
    pieces = re.split(r"\s+", normalized)
    return [piece for piece in pieces if piece]
41
+
42
+
43
def field_scores(pred: dict, gt: dict, doc_type: str) -> dict:
    """Score the scalar fields of one prediction against its ground truth.

    Only fields listed in ``SCALAR_FIELDS[doc_type]`` that are present in *gt*
    with a non-empty normalized value are scored. Exact match compares the
    normalized values; precision/recall/F1 are computed over the sets of
    normalized tokens, pooled across all scored fields.

    Returns:
        A dict with ``fields_scored``, ``exact_match``, ``f1``, ``precision``,
        ``recall`` (rounded to 3 decimals) and ``per_field``, which maps each
        scored field to its ``exact`` flag plus the raw ``pred``/``gt`` values.
    """
    per_field = {}
    scored = 0
    exact_hits = 0
    true_pos = false_pos = false_neg = 0

    for name in SCALAR_FIELDS.get(doc_type, []):
        if name not in gt:
            continue
        predicted, expected = pred.get(name), gt.get(name)
        expected_norm = _norm_scalar(expected)
        if expected_norm == "":  # gt empty/None — skip from denominator
            continue
        scored += 1
        matches = _norm_scalar(predicted) == expected_norm
        if matches:
            exact_hits += 1
        per_field[name] = {"exact": matches, "pred": predicted, "gt": expected}

        pred_tokens = set(_tokens(predicted))
        gt_tokens = set(_tokens(expected))
        true_pos += len(pred_tokens & gt_tokens)
        false_pos += len(pred_tokens - gt_tokens)
        false_neg += len(gt_tokens - pred_tokens)

    predicted_total = true_pos + false_pos
    expected_total = true_pos + false_neg
    precision = true_pos / predicted_total if predicted_total else 0.0
    recall = true_pos / expected_total if expected_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "fields_scored": scored,
        "exact_match": round(exact_hits / scored, 3) if scored else 0.0,
        "f1": round(f1, 3),
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "per_field": per_field,
    }
72
+
73
+
74
def line_item_f1(pred: dict, gt: dict) -> dict:
    """Score predicted line items against the ground-truth line items.

    Items are compared by a key made of the first 20 characters of the
    normalized description, the quantity to one decimal and the unit price to
    two decimals. Each ground-truth item can be matched at most once.

    Malformed predicted items never abort scoring: a quantity or price that
    cannot be converted to a number is compared by its normalized text, and an
    item that is not a dict is treated as a description-only item, so such
    rows simply fail to match.

    Returns:
        ``{"applicable": False, "f1": None}`` when *gt* has no line items;
        otherwise ``applicable``, ``f1`` (3 decimals), ``matched``, ``pred_n``
        and ``gt_n``.
    """
    from collections import Counter

    gt_items = gt.get("line_items") or []
    pred_items = pred.get("line_items") or []
    if not gt_items:
        return {"applicable": False, "f1": None}

    def amount(value, places):
        # Missing/empty values count as 0, as before; unparseable ones fall
        # back to their normalized text instead of raising.
        try:
            return f"{float(value or 0):.{places}f}"
        except (TypeError, ValueError):
            return _norm_scalar(value)

    def key(item):
        if not isinstance(item, dict):
            item = {"description": item}
        return (
            _norm_scalar(item.get("description"))[:20],
            amount(item.get("quantity"), 1),
            amount(item.get("unit_price"), 2),
        )

    gt_keys = [key(item) for item in gt_items]
    pred_keys = [key(item) for item in pred_items]
    # Multiset intersection: each ground-truth key is consumed at most once.
    matched = sum((Counter(pred_keys) & Counter(gt_keys)).values())

    prec = matched / len(pred_keys) if pred_keys else 0.0
    rec = matched / len(gt_keys) if gt_keys else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
    return {"applicable": True, "f1": round(f1, 3), "matched": matched,
            "pred_n": len(pred_keys), "gt_n": len(gt_keys)}
100
+
101
+
102
def financial_consistency(pred: dict, doc_type: str) -> bool:
    """Report whether the prediction passes the financial validation checks.

    Runs ``validate_financials`` and combines its ``totals_balance`` and
    ``line_items_sum`` checks; a check missing from the result counts as passed.
    """
    # Imported lazily, inside the function, rather than at module import time.
    from app.schemas import validate_financials

    checks = validate_financials(pred, doc_type).checks
    return checks.get("totals_balance", True) and checks.get("line_items_sum", True)
107
+
108
+
109
def score_document(pred: dict, gt: dict) -> dict:
    """Combine the field, line-item and financial scores for one document.

    The document type comes from ``gt["doc_type"]``; when that is missing or
    falsy it falls back to ``gt["_meta"]["doc_type"]`` and finally "invoice".
    """
    doc_type = gt.get("doc_type")
    if not doc_type:
        doc_type = gt.get("_meta", {}).get("doc_type", "invoice")

    fields = field_scores(pred, gt, doc_type)
    items = line_item_f1(pred, gt)
    consistent = financial_consistency(pred, doc_type)

    summary = {"doc_type": doc_type}
    summary["exact_match"] = fields["exact_match"]
    summary["field_f1"] = fields["f1"]
    summary["precision"] = fields["precision"]
    summary["recall"] = fields["recall"]
    summary["fields_scored"] = fields["fields_scored"]
    summary["line_item_f1"] = items["f1"]
    summary["line_items_applicable"] = items["applicable"]
    summary["financial_consistent"] = consistent
    summary["per_field"] = fields["per_field"]
    return summary
126
+
127
+
128
def aggregate(results: list[dict]) -> dict:
    """Roll per-document results up into overall, per-type and per-difficulty views.

    Averages skip ``None`` values, are rounded to 3 decimals and are ``None``
    when nothing is left to average. Line-item F1 is averaged only over
    documents where line items are applicable.
    """

    def mean(values):
        present = [value for value in values if value is not None]
        if not present:
            return None
        return round(sum(present) / len(present), 3)

    def metric(rows, name):
        return mean([row["score"][name] for row in rows])

    def consistency_rate(rows):
        return mean([1.0 if row["score"]["financial_consistent"] else 0.0 for row in rows])

    def partition(label_of):
        # Insertion-ordered: groups appear in first-seen order.
        buckets: dict[str, list] = {}
        for row in results:
            buckets.setdefault(label_of(row), []).append(row)
        return buckets

    def summarise(rows):
        return {
            "documents": len(rows),
            "exact_match": metric(rows, "exact_match"),
            "field_f1": metric(rows, "field_f1"),
            "financial_consistency_rate": consistency_rate(rows),
        }

    with_line_items = [row for row in results if row["score"]["line_items_applicable"]]
    type_hits = [
        1.0 if row["predicted_type"] == row["score"]["doc_type"] else 0.0
        for row in results
    ]
    overall = {
        "documents": len(results),
        "exact_match": metric(results, "exact_match"),
        "field_f1": metric(results, "field_f1"),
        "line_item_f1": metric(with_line_items, "line_item_f1"),
        "financial_consistency_rate": consistency_rate(results),
        "doc_type_accuracy": mean(type_hits),
    }

    by_type = partition(lambda row: row["score"]["doc_type"])
    by_difficulty = partition(lambda row: row.get("difficulty", "n/a"))

    return {
        "overall": overall,
        "by_type": {label: summarise(rows) for label, rows in by_type.items()},
        "by_difficulty": {label: summarise(rows) for label, rows in by_difficulty.items()},
    }
backend/finetune/erp_finetune_report.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kind": "offline-domain-adaptation",
3
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
4
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
5
+ "dataset_size": 120,
6
+ "train": 96,
7
+ "test": 24,
8
+ "n_classes": 10,
9
+ "trainable_params": 40970,
10
+ "epochs": 400,
11
+ "before_test_accuracy": 0.083,
12
+ "after_test_accuracy": 0.917,
13
+ "accuracy_gain": 0.833,
14
+ "routed_sql_exec_rate": 1.0,
15
+ "loss_curve": [
16
+ 2.3042,
17
+ 2.1773,
18
+ 2.0612,
19
+ 1.9523,
20
+ 1.8493,
21
+ 1.752,
22
+ 1.6601,
23
+ 1.5736,
24
+ 1.4922,
25
+ 1.4159,
26
+ 1.3444,
27
+ 1.2776,
28
+ 1.2152,
29
+ 1.1569,
30
+ 1.1026,
31
+ 1.052,
32
+ 1.0048,
33
+ 0.9607,
34
+ 0.9197,
35
+ 0.8813,
36
+ 0.8456,
37
+ 0.8122,
38
+ 0.7809,
39
+ 0.7517,
40
+ 0.7243,
41
+ 0.6987,
42
+ 0.6746,
43
+ 0.6521,
44
+ 0.6308,
45
+ 0.6109,
46
+ 0.5921,
47
+ 0.5744,
48
+ 0.5577,
49
+ 0.542,
50
+ 0.5271,
51
+ 0.513,
52
+ 0.4997,
53
+ 0.4871,
54
+ 0.4752,
55
+ 0.4638
56
+ ],
57
+ "final_loss": 0.4541,
58
+ "labels": [
59
+ "spend_by_month",
60
+ "top_vendors",
61
+ "late_vendors",
62
+ "late_rate",
63
+ "spend_by_category",
64
+ "why_q2",
65
+ "below_reorder",
66
+ "open_invoices",
67
+ "return_reasons",
68
+ "ap_health"
69
+ ],
70
+ "backend": "local",
71
+ "dataset_jsonl": "backend/finetune/erp_sft.jsonl",
72
+ "production_recipe": {
73
+ "base_model": "openbmb/MiniCPM3-4B",
74
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
75
+ "dataset": "backend/finetune/erp_sft.jsonl",
76
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
77
+ "hyperparams": {
78
+ "lora_r": 16,
79
+ "lora_alpha": 32,
80
+ "lora_dropout": 0.05,
81
+ "target_modules": [
82
+ "q_proj",
83
+ "k_proj",
84
+ "v_proj",
85
+ "o_proj"
86
+ ],
87
+ "learning_rate": 0.0002,
88
+ "num_train_epochs": 3,
89
+ "per_device_train_batch_size": 8,
90
+ "gradient_accumulation_steps": 2,
91
+ "max_seq_length": 1024,
92
+ "bf16": true
93
+ },
94
+ "command": "python scripts/finetune_erp.py --backend hf",
95
+ "requirements": [
96
+ "torch",
97
+ "transformers>=4.44",
98
+ "peft",
99
+ "trl",
100
+ "accelerate",
101
+ "datasets"
102
+ ]
103
+ },
104
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
105
+ "generated_at": 1781324653.616286
106
+ }
backend/finetune/erp_sft.jsonl ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why is Q2 spend up?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
2
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Summarize our AP position.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
3
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Vendors most often paid after due date?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
4
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much do we owe in unpaid invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
5
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much did we invoice each month?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
6
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Spend grouped by category.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
7
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What drove the Q2 spend increase?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
8
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which products are low on stock?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
9
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "State of our AP overall?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
10
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What was total invoiced spend by month?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
11
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show monthly spend.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
12
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Categories ranked by spend.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
13
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Where does spend go by category?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
14
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total open AP balance?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
15
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Share of late payments?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
16
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Rank vendors by spend.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
17
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Overall on-time vs late rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
18
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "List our biggest suppliers.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
19
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Who are our worst late-paying vendors?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
20
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Return reason breakdown by money?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
21
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What should we reorder?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
22
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which items risk stockout?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
23
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors get the most of our money?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
24
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What are the leading causes of returns?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
25
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show spend by category.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
26
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Late-payment ratio across all invoices?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
27
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's running low in inventory?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
28
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Biggest return reasons by refund value?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
29
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How healthy are our payables?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
30
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which suppliers have payment delays?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
31
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which return reasons cost the most?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
32
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Break spend down per month.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
33
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Category spend totals?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
34
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Percentage of overdue payments overall?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
35
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "AP health check please.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
36
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which categories cost the most?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
37
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Monthly AP spend totals?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
38
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What caused the second-quarter cost jump?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
39
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Who keeps getting paid past terms?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
40
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Account for the Q2 2026 spend surge.", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
41
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's the state of accounts payable?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
42
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Our overall late payment percentage?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
43
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Inventory below reorder point?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
44
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What is the total value of open invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
45
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Top suppliers ranked by spend.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
46
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "List vendors by late-payment count.", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
47
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Late payers among our vendors?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
48
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total spend for each category?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
49
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What is the late-payment rate overall?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
50
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How are we doing on accounts payable?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
51
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show returns grouped by reason.", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
52
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Explain the spend spike in Q2.", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
53
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our late payment rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
54
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Break down returns by reason.", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
55
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Outstanding invoice value?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
56
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "List SKUs below reorder threshold.", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
57
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give an executive AP summary.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
58
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total of invoices not yet paid?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
59
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why did spend rise in Q2 2026?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
60
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our outstanding payables total?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
61
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Worst offenders for late payment?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
62
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Our highest-spend vendors?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
63
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much do we spend per product category?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
64
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why are products being returned?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
65
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Reason for higher spending in Q2 2026?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
66
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Vendors with the most overdue payments?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
67
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much is still open in payables?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
68
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Return reasons ranked by refunds?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
69
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Overall accounts payable summary?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
70
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give the global late payment percentage.", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
71
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors paid late most often?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
72
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors do we spend the most with?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
73
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Largest vendors please.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
74
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "SKUs needing replenishment?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
75
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How often do we pay late, as a rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
76
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Value of unpaid invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
77
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much AP is still open?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
78
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Drivers of the Q2 spend rise?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
79
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show the five largest vendors.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
80
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What drives our refunds?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
81
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What percent of invoices are paid late?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
82
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which suppliers cost us the most?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
83
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Explain why Q2 spend went up.", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
84
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Spend by period please.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
85
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Open invoice liability?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
86
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Overview of payables health?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
87
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Summarize payables status.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
88
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Sum of open invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
89
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What needs replenishing?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
90
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Top return reasons by refund amount?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
91
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give me an AP health overview.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
92
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Unpaid invoice amount overall?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
93
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Stock positions under reorder point?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
94
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show vendors with frequent late payments.", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
95
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Top vendors by total spend?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
96
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which SKUs are below reorder point?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
97
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Summarize accounts payable health.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
98
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Spend per category please.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
99
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why did costs climb in Q2 2026?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
100
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Most costly return reasons?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
101
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors are habitually overdue?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
102
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our spend over the months?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
103
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Break down spend across categories.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
104
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our category spend mix?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
105
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How has spend trended month to month?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
106
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Monthly invoiced spend trend?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
107
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which suppliers do we pay late?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
108
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why was Q2 so expensive?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
109
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total spend grouped by month.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
110
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show items under their reorder level.", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
111
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How bad is our late-payment rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
112
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Who are the top 5 vendors by spend?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
113
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's behind the Q2 increase?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
114
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Refund totals per return reason?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
115
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Plot spend per month.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
116
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give me the month-by-month spend.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
117
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which products fell below reorder?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
118
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Category-level spend breakdown?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
119
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Biggest vendors by invoice value?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
120
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What fraction of payments miss terms?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
backend/finetune/runs/hf_20260612T212346.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "hf",
3
+ "ran": false,
4
+ "reason": "training stack unavailable (No module named 'torch')",
5
+ "recipe": {
6
+ "base_model": "openbmb/MiniCPM3-4B",
7
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
8
+ "dataset": "backend/finetune/erp_sft.jsonl",
9
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
10
+ "hyperparams": {
11
+ "lora_r": 16,
12
+ "lora_alpha": 32,
13
+ "lora_dropout": 0.05,
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj"
19
+ ],
20
+ "learning_rate": 0.0002,
21
+ "num_train_epochs": 3,
22
+ "per_device_train_batch_size": 8,
23
+ "gradient_accumulation_steps": 2,
24
+ "max_seq_length": 1024,
25
+ "bf16": true
26
+ },
27
+ "command": "python scripts/finetune_erp.py --backend hf",
28
+ "requirements": [
29
+ "torch",
30
+ "transformers>=4.44",
31
+ "peft",
32
+ "trl",
33
+ "accelerate",
34
+ "datasets"
35
+ ]
36
+ },
37
+ "note": "Dataset + recipe are ready; launch on a GPU box to fine-tune MiniCPM3-4B.",
38
+ "offline_demo": {
39
+ "kind": "offline-domain-adaptation",
40
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
41
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
42
+ "dataset_size": 120,
43
+ "train": 96,
44
+ "test": 24,
45
+ "n_classes": 10,
46
+ "trainable_params": 40970,
47
+ "epochs": 50,
48
+ "before_test_accuracy": 0.083,
49
+ "after_test_accuracy": 0.583,
50
+ "accuracy_gain": 0.5,
51
+ "routed_sql_exec_rate": 1.0,
52
+ "loss_curve": [
53
+ 2.3042,
54
+ 2.2908,
55
+ 2.2776,
56
+ 2.2646,
57
+ 2.2517,
58
+ 2.239,
59
+ 2.2264,
60
+ 2.214,
61
+ 2.2016,
62
+ 2.1894,
63
+ 2.1773,
64
+ 2.1653,
65
+ 2.1534,
66
+ 2.1416,
67
+ 2.1299,
68
+ 2.1183,
69
+ 2.1067,
70
+ 2.0952,
71
+ 2.0838,
72
+ 2.0725,
73
+ 2.0612,
74
+ 2.0501,
75
+ 2.0389,
76
+ 2.0279,
77
+ 2.0169,
78
+ 2.006,
79
+ 1.9951,
80
+ 1.9843,
81
+ 1.9736,
82
+ 1.9629,
83
+ 1.9523,
84
+ 1.9417,
85
+ 1.9312,
86
+ 1.9208,
87
+ 1.9104,
88
+ 1.9001,
89
+ 1.8898,
90
+ 1.8796,
91
+ 1.8695,
92
+ 1.8594,
93
+ 1.8493,
94
+ 1.8394,
95
+ 1.8294,
96
+ 1.8196,
97
+ 1.8097,
98
+ 1.8,
99
+ 1.7903,
100
+ 1.7806,
101
+ 1.771,
102
+ 1.7615
103
+ ],
104
+ "final_loss": 1.7615,
105
+ "labels": [
106
+ "spend_by_month",
107
+ "top_vendors",
108
+ "late_vendors",
109
+ "late_rate",
110
+ "spend_by_category",
111
+ "why_q2",
112
+ "below_reorder",
113
+ "open_invoices",
114
+ "return_reasons",
115
+ "ap_health"
116
+ ]
117
+ },
118
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
119
+ "generated_at": 1781324626.2022731
120
+ }
backend/finetune/runs/local_20260612T212257.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kind": "offline-domain-adaptation",
3
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
4
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
5
+ "dataset_size": 120,
6
+ "train": 96,
7
+ "test": 24,
8
+ "n_classes": 10,
9
+ "trainable_params": 40970,
10
+ "epochs": 250,
11
+ "before_test_accuracy": 0.0,
12
+ "after_test_accuracy": 0.875,
13
+ "accuracy_gain": 0.875,
14
+ "routed_sql_exec_rate": 1.0,
15
+ "loss_curve": [
16
+ 2.3026,
17
+ 2.2249,
18
+ 2.1519,
19
+ 2.0824,
20
+ 2.0155,
21
+ 1.9509,
22
+ 1.8884,
23
+ 1.828,
24
+ 1.7697,
25
+ 1.7132,
26
+ 1.6588,
27
+ 1.6062,
28
+ 1.5555,
29
+ 1.5067,
30
+ 1.4598,
31
+ 1.4146,
32
+ 1.3711,
33
+ 1.3294,
34
+ 1.2893,
35
+ 1.2509,
36
+ 1.2139,
37
+ 1.1785,
38
+ 1.1445,
39
+ 1.112,
40
+ 1.0807,
41
+ 1.0508,
42
+ 1.0221,
43
+ 0.9945,
44
+ 0.9681,
45
+ 0.9428,
46
+ 0.9185,
47
+ 0.8952,
48
+ 0.8729,
49
+ 0.8514,
50
+ 0.8308,
51
+ 0.8111,
52
+ 0.7921,
53
+ 0.7738,
54
+ 0.7563,
55
+ 0.7395,
56
+ 0.7233,
57
+ 0.7077
58
+ ],
59
+ "final_loss": 0.7002,
60
+ "labels": [
61
+ "spend_by_month",
62
+ "top_vendors",
63
+ "late_vendors",
64
+ "late_rate",
65
+ "spend_by_category",
66
+ "why_q2",
67
+ "below_reorder",
68
+ "open_invoices",
69
+ "return_reasons",
70
+ "ap_health"
71
+ ],
72
+ "backend": "local",
73
+ "dataset_jsonl": "backend/finetune/erp_sft.jsonl",
74
+ "production_recipe": {
75
+ "base_model": "openbmb/MiniCPM3-4B",
76
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
77
+ "dataset": "backend/finetune/erp_sft.jsonl",
78
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
79
+ "hyperparams": {
80
+ "lora_r": 16,
81
+ "lora_alpha": 32,
82
+ "lora_dropout": 0.05,
83
+ "target_modules": [
84
+ "q_proj",
85
+ "k_proj",
86
+ "v_proj",
87
+ "o_proj"
88
+ ],
89
+ "learning_rate": 0.0002,
90
+ "num_train_epochs": 3,
91
+ "per_device_train_batch_size": 8,
92
+ "gradient_accumulation_steps": 2,
93
+ "max_seq_length": 1024,
94
+ "bf16": true
95
+ },
96
+ "command": "python scripts/finetune_erp.py --backend hf",
97
+ "requirements": [
98
+ "torch",
99
+ "transformers>=4.44",
100
+ "peft",
101
+ "trl",
102
+ "accelerate",
103
+ "datasets"
104
+ ]
105
+ },
106
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
107
+ "generated_at": 1781324577.1492128
108
+ }
backend/finetune/runs/local_20260612T212332.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kind": "offline-domain-adaptation",
3
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
4
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
5
+ "dataset_size": 120,
6
+ "train": 96,
7
+ "test": 24,
8
+ "n_classes": 10,
9
+ "trainable_params": 40970,
10
+ "epochs": 400,
11
+ "before_test_accuracy": 0.083,
12
+ "after_test_accuracy": 0.917,
13
+ "accuracy_gain": 0.833,
14
+ "routed_sql_exec_rate": 1.0,
15
+ "loss_curve": [
16
+ 2.3042,
17
+ 2.1773,
18
+ 2.0612,
19
+ 1.9523,
20
+ 1.8493,
21
+ 1.752,
22
+ 1.6601,
23
+ 1.5736,
24
+ 1.4922,
25
+ 1.4159,
26
+ 1.3444,
27
+ 1.2776,
28
+ 1.2152,
29
+ 1.1569,
30
+ 1.1026,
31
+ 1.052,
32
+ 1.0048,
33
+ 0.9607,
34
+ 0.9197,
35
+ 0.8813,
36
+ 0.8456,
37
+ 0.8122,
38
+ 0.7809,
39
+ 0.7517,
40
+ 0.7243,
41
+ 0.6987,
42
+ 0.6746,
43
+ 0.6521,
44
+ 0.6308,
45
+ 0.6109,
46
+ 0.5921,
47
+ 0.5744,
48
+ 0.5577,
49
+ 0.542,
50
+ 0.5271,
51
+ 0.513,
52
+ 0.4997,
53
+ 0.4871,
54
+ 0.4752,
55
+ 0.4638
56
+ ],
57
+ "final_loss": 0.4541,
58
+ "labels": [
59
+ "spend_by_month",
60
+ "top_vendors",
61
+ "late_vendors",
62
+ "late_rate",
63
+ "spend_by_category",
64
+ "why_q2",
65
+ "below_reorder",
66
+ "open_invoices",
67
+ "return_reasons",
68
+ "ap_health"
69
+ ],
70
+ "backend": "local",
71
+ "dataset_jsonl": "backend/finetune/erp_sft.jsonl",
72
+ "production_recipe": {
73
+ "base_model": "openbmb/MiniCPM3-4B",
74
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
75
+ "dataset": "backend/finetune/erp_sft.jsonl",
76
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
77
+ "hyperparams": {
78
+ "lora_r": 16,
79
+ "lora_alpha": 32,
80
+ "lora_dropout": 0.05,
81
+ "target_modules": [
82
+ "q_proj",
83
+ "k_proj",
84
+ "v_proj",
85
+ "o_proj"
86
+ ],
87
+ "learning_rate": 0.0002,
88
+ "num_train_epochs": 3,
89
+ "per_device_train_batch_size": 8,
90
+ "gradient_accumulation_steps": 2,
91
+ "max_seq_length": 1024,
92
+ "bf16": true
93
+ },
94
+ "command": "python scripts/finetune_erp.py --backend hf",
95
+ "requirements": [
96
+ "torch",
97
+ "transformers>=4.44",
98
+ "peft",
99
+ "trl",
100
+ "accelerate",
101
+ "datasets"
102
+ ]
103
+ },
104
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
105
+ "generated_at": 1781324612.290891
106
+ }
backend/finetune/runs/local_20260612T212357.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kind": "offline-domain-adaptation",
3
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
4
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
5
+ "dataset_size": 120,
6
+ "train": 96,
7
+ "test": 24,
8
+ "n_classes": 10,
9
+ "trainable_params": 40970,
10
+ "epochs": 250,
11
+ "before_test_accuracy": 0.083,
12
+ "after_test_accuracy": 0.875,
13
+ "accuracy_gain": 0.792,
14
+ "routed_sql_exec_rate": 1.0,
15
+ "loss_curve": [
16
+ 2.3042,
17
+ 2.2264,
18
+ 2.1534,
19
+ 2.0838,
20
+ 2.0169,
21
+ 1.9523,
22
+ 1.8898,
23
+ 1.8294,
24
+ 1.771,
25
+ 1.7146,
26
+ 1.6601,
27
+ 1.6076,
28
+ 1.5569,
29
+ 1.5081,
30
+ 1.4611,
31
+ 1.4159,
32
+ 1.3724,
33
+ 1.3307,
34
+ 1.2906,
35
+ 1.2521,
36
+ 1.2152,
37
+ 1.1798,
38
+ 1.1458,
39
+ 1.1132,
40
+ 1.0819,
41
+ 1.052,
42
+ 1.0232,
43
+ 0.9957,
44
+ 0.9693,
45
+ 0.944,
46
+ 0.9197,
47
+ 0.8964,
48
+ 0.874,
49
+ 0.8525,
50
+ 0.8319,
51
+ 0.8122,
52
+ 0.7932,
53
+ 0.7749,
54
+ 0.7574,
55
+ 0.7405,
56
+ 0.7243,
57
+ 0.7087
58
+ ],
59
+ "final_loss": 0.7012,
60
+ "labels": [
61
+ "spend_by_month",
62
+ "top_vendors",
63
+ "late_vendors",
64
+ "late_rate",
65
+ "spend_by_category",
66
+ "why_q2",
67
+ "below_reorder",
68
+ "open_invoices",
69
+ "return_reasons",
70
+ "ap_health"
71
+ ],
72
+ "backend": "local",
73
+ "dataset_jsonl": "backend/finetune/erp_sft.jsonl",
74
+ "production_recipe": {
75
+ "base_model": "openbmb/MiniCPM3-4B",
76
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
77
+ "dataset": "backend/finetune/erp_sft.jsonl",
78
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
79
+ "hyperparams": {
80
+ "lora_r": 16,
81
+ "lora_alpha": 32,
82
+ "lora_dropout": 0.05,
83
+ "target_modules": [
84
+ "q_proj",
85
+ "k_proj",
86
+ "v_proj",
87
+ "o_proj"
88
+ ],
89
+ "learning_rate": 0.0002,
90
+ "num_train_epochs": 3,
91
+ "per_device_train_batch_size": 8,
92
+ "gradient_accumulation_steps": 2,
93
+ "max_seq_length": 1024,
94
+ "bf16": true
95
+ },
96
+ "command": "python scripts/finetune_erp.py --backend hf",
97
+ "requirements": [
98
+ "torch",
99
+ "transformers>=4.44",
100
+ "peft",
101
+ "trl",
102
+ "accelerate",
103
+ "datasets"
104
+ ]
105
+ },
106
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
107
+ "generated_at": 1781324637.309176
108
+ }
backend/finetune/runs/local_20260612T212413.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kind": "offline-domain-adaptation",
3
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
4
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
5
+ "dataset_size": 120,
6
+ "train": 96,
7
+ "test": 24,
8
+ "n_classes": 10,
9
+ "trainable_params": 40970,
10
+ "epochs": 400,
11
+ "before_test_accuracy": 0.083,
12
+ "after_test_accuracy": 0.917,
13
+ "accuracy_gain": 0.833,
14
+ "routed_sql_exec_rate": 1.0,
15
+ "loss_curve": [
16
+ 2.3042,
17
+ 2.1773,
18
+ 2.0612,
19
+ 1.9523,
20
+ 1.8493,
21
+ 1.752,
22
+ 1.6601,
23
+ 1.5736,
24
+ 1.4922,
25
+ 1.4159,
26
+ 1.3444,
27
+ 1.2776,
28
+ 1.2152,
29
+ 1.1569,
30
+ 1.1026,
31
+ 1.052,
32
+ 1.0048,
33
+ 0.9607,
34
+ 0.9197,
35
+ 0.8813,
36
+ 0.8456,
37
+ 0.8122,
38
+ 0.7809,
39
+ 0.7517,
40
+ 0.7243,
41
+ 0.6987,
42
+ 0.6746,
43
+ 0.6521,
44
+ 0.6308,
45
+ 0.6109,
46
+ 0.5921,
47
+ 0.5744,
48
+ 0.5577,
49
+ 0.542,
50
+ 0.5271,
51
+ 0.513,
52
+ 0.4997,
53
+ 0.4871,
54
+ 0.4752,
55
+ 0.4638
56
+ ],
57
+ "final_loss": 0.4541,
58
+ "labels": [
59
+ "spend_by_month",
60
+ "top_vendors",
61
+ "late_vendors",
62
+ "late_rate",
63
+ "spend_by_category",
64
+ "why_q2",
65
+ "below_reorder",
66
+ "open_invoices",
67
+ "return_reasons",
68
+ "ap_health"
69
+ ],
70
+ "backend": "local",
71
+ "dataset_jsonl": "backend/finetune/erp_sft.jsonl",
72
+ "production_recipe": {
73
+ "base_model": "openbmb/MiniCPM3-4B",
74
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
75
+ "dataset": "backend/finetune/erp_sft.jsonl",
76
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
77
+ "hyperparams": {
78
+ "lora_r": 16,
79
+ "lora_alpha": 32,
80
+ "lora_dropout": 0.05,
81
+ "target_modules": [
82
+ "q_proj",
83
+ "k_proj",
84
+ "v_proj",
85
+ "o_proj"
86
+ ],
87
+ "learning_rate": 0.0002,
88
+ "num_train_epochs": 3,
89
+ "per_device_train_batch_size": 8,
90
+ "gradient_accumulation_steps": 2,
91
+ "max_seq_length": 1024,
92
+ "bf16": true
93
+ },
94
+ "command": "python scripts/finetune_erp.py --backend hf",
95
+ "requirements": [
96
+ "torch",
97
+ "transformers>=4.44",
98
+ "peft",
99
+ "trl",
100
+ "accelerate",
101
+ "datasets"
102
+ ]
103
+ },
104
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
105
+ "generated_at": 1781324653.616286
106
+ }
gradio_app.py CHANGED
@@ -101,6 +101,35 @@ def search(query: str):
101
  for r in RAG.search(query, k=8)]
102
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  def run_complex_web_automation():
105
  """Intricate multi-step browser automation: ERP dashboard → Procurement →
106
  +Create Order → read the complex order-form fields."""
@@ -136,6 +165,21 @@ with gr.Blocks(title="Aperture — Retail Document Intelligence") as demo:
136
  kpis = gr.Markdown(_kpis_md())
137
  run_btn.click(run_sample, [sample, backend], [extracted, summary, kpis])
138
  upload_btn.click(run_upload, [upload, backend], [extracted, summary, kpis])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  with gr.Tab("Search (RAG)"):
140
  q = gr.Textbox(label="Query", placeholder="e.g. POS Cloud subscription renewal")
141
  search_btn = gr.Button("🔍 Search")
 
101
  for r in RAG.search(query, k=8)]
102
 
103
 
104
def erp_ask(question: str):
    """Answer one ERP DocIQ question and shape the reply for the Gradio tab.

    Args:
        question: Free-text question typed by the user; ``None`` or
            whitespace-only input short-circuits to a usage hint.

    Returns:
        A ``(markdown, table)`` pair: the Markdown summary (intent, engine,
        model, latency, answer and, when present, the SQL that was run) and
        up to 12 result rows with every cell converted to ``str``.
    """
    # Imported lazily, and before the empty-input guard, exactly as the
    # original did.
    from app.erp import ErpChat, get_warehouse

    if not (question or "").strip():
        return "Ask about spend, vendors, late payments, inventory or returns.", []

    assistant = ErpChat(S, router=ROUTER, warehouse=get_warehouse(S), metrics=METRICS)
    reply = assistant.answer(question)

    headline = (f"**{reply['intent']}** · {reply['engine']} · {reply['model']} "
                f"· {reply['latency_ms']} ms")
    answer_text = reply['answer']
    # The SQL fence is appended only when the reply carries a non-empty query.
    sql_fence = f"```sql\n{reply['sql']}\n```" if reply.get("sql") else ""
    markdown = f"{headline}\n\n{answer_text}\n\n{sql_fence}"

    result_rows = reply.get("rows") or []
    # Cap the preview at 12 rows; cells are stringified for display.
    preview = [list(map(str, row)) for row in result_rows[:12]]
    return markdown, preview
116
+
117
+
118
def _erp_finetune_md() -> str:
    """Render the ERP fine-tuning summary as Markdown.

    Reads ``backend/finetune/erp_finetune_report.json`` (resolved relative to
    this file) and formats the offline-demo metrics. When the report has an
    ``offline_demo`` section (the ``hf`` backend layout) that section is used;
    otherwise the metrics are read from the top level (the ``local`` layout).

    Returns:
        The Markdown summary, or a short italic hint when the report is
        missing, unreadable, or lacks the expected metric fields.
    """
    import json as _json
    from pathlib import Path

    report_path = (Path(__file__).resolve().parent
                   / "backend" / "finetune" / "erp_finetune_report.json")
    if not report_path.exists():
        return "_Run `python scripts/finetune_erp.py` to populate fine-tune metrics._"

    # This panel is purely informational and is rendered while the UI is being
    # built, so a corrupt or partial report must degrade to a hint instead of
    # raising and taking the whole app down.
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        report = _json.loads(report_path.read_text(encoding="utf-8"))
        metrics = report.get("offline_demo") or report
        return ("### ERP-domain fine-tuning\n"
                f"- **Production target:** OpenBMB **MiniCPM3-4B** (LoRA recipe emitted)\n"
                f"- **Offline demo (CPU):** before **{metrics['before_test_accuracy']*100:.1f}%** → "
                f"after **{metrics['after_test_accuracy']*100:.1f}%** "
                # Signed format: "+83" for a gain, "-5" (not "+-5") for a regression.
                f"(**{metrics['accuracy_gain']*100:+.0f} pts**) on {metrics['dataset_size']} examples; "
                f"routed-SQL exec {metrics['routed_sql_exec_rate']*100:.0f}%")
    except (OSError, ValueError, KeyError, TypeError, AttributeError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError;
        # AttributeError covers a top-level JSON value that is not an object.
        return ("_Fine-tune report is unreadable or incomplete; "
                "re-run `python scripts/finetune_erp.py`._")
131
+
132
+
133
  def run_complex_web_automation():
134
  """Intricate multi-step browser automation: ERP dashboard → Procurement →
135
  +Create Order → read the complex order-form fields."""
 
165
  kpis = gr.Markdown(_kpis_md())
166
  run_btn.click(run_sample, [sample, backend], [extracted, summary, kpis])
167
  upload_btn.click(run_upload, [upload, backend], [extracted, summary, kpis])
168
+ with gr.Tab("ERP DocIQ (chat)"):
169
+ gr.Markdown("### Ask your ERP reports — NLQ · analytics · summary · reasons\n"
170
+ "Natural-language questions over a simulated retail ERP (vendors, POs, invoices, "
171
+ "GL, inventory, returns). Figures come from **real SQL**; OpenBMB **MiniCPM3-4B** "
172
+ "phrases summaries & explanations and never invents numbers.")
173
+ erp_q = gr.Textbox(label="Question",
174
+ placeholder="e.g. Why did spend rise in Q2 2026?")
175
+ gr.Examples(["Who are the top 5 vendors by spend?", "What is the late-payment rate overall?",
176
+ "Why did spend rise in Q2 2026?", "Summarize accounts payable health.",
177
+ "Top return reasons by refund amount?"], inputs=erp_q)
178
+ erp_btn = gr.Button("💬 Ask ERP DocIQ", variant="primary")
179
+ erp_answer = gr.Markdown()
180
+ erp_rows = gr.Dataframe(label="Query result (real SQL over the warehouse)")
181
+ erp_btn.click(erp_ask, [erp_q], [erp_answer, erp_rows])
182
+ gr.Markdown(_erp_finetune_md())
183
  with gr.Tab("Search (RAG)"):
184
  q = gr.Textbox(label="Query", placeholder="e.g. POS Cloud subscription renewal")
185
  search_btn = gr.Button("🔍 Search")
results/erp_finetune_report.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kind": "offline-domain-adaptation",
3
+ "model": "ERP-NLQ-router (softmax over hashed n-grams, numpy)",
4
+ "note": "Offline CPU demo of the training loop + eval on the SAME dataset the MiniCPM3-4B LoRA recipe consumes. Trains the NLQ routing head that sits in front of the small model; production fine-tune = OpenBMB MiniCPM3-4B LoRA.",
5
+ "dataset_size": 120,
6
+ "train": 96,
7
+ "test": 24,
8
+ "n_classes": 10,
9
+ "trainable_params": 40970,
10
+ "epochs": 400,
11
+ "before_test_accuracy": 0.083,
12
+ "after_test_accuracy": 0.917,
13
+ "accuracy_gain": 0.833,
14
+ "routed_sql_exec_rate": 1.0,
15
+ "loss_curve": [
16
+ 2.3042,
17
+ 2.1773,
18
+ 2.0612,
19
+ 1.9523,
20
+ 1.8493,
21
+ 1.752,
22
+ 1.6601,
23
+ 1.5736,
24
+ 1.4922,
25
+ 1.4159,
26
+ 1.3444,
27
+ 1.2776,
28
+ 1.2152,
29
+ 1.1569,
30
+ 1.1026,
31
+ 1.052,
32
+ 1.0048,
33
+ 0.9607,
34
+ 0.9197,
35
+ 0.8813,
36
+ 0.8456,
37
+ 0.8122,
38
+ 0.7809,
39
+ 0.7517,
40
+ 0.7243,
41
+ 0.6987,
42
+ 0.6746,
43
+ 0.6521,
44
+ 0.6308,
45
+ 0.6109,
46
+ 0.5921,
47
+ 0.5744,
48
+ 0.5577,
49
+ 0.542,
50
+ 0.5271,
51
+ 0.513,
52
+ 0.4997,
53
+ 0.4871,
54
+ 0.4752,
55
+ 0.4638
56
+ ],
57
+ "final_loss": 0.4541,
58
+ "labels": [
59
+ "spend_by_month",
60
+ "top_vendors",
61
+ "late_vendors",
62
+ "late_rate",
63
+ "spend_by_category",
64
+ "why_q2",
65
+ "below_reorder",
66
+ "open_invoices",
67
+ "return_reasons",
68
+ "ap_health"
69
+ ],
70
+ "backend": "local",
71
+ "dataset_jsonl": "backend/finetune/erp_sft.jsonl",
72
+ "production_recipe": {
73
+ "base_model": "openbmb/MiniCPM3-4B",
74
+ "method": "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)",
75
+ "dataset": "backend/finetune/erp_sft.jsonl",
76
+ "prompt_template": "{instruction}\n\nERP question: {input}\nSQL:",
77
+ "hyperparams": {
78
+ "lora_r": 16,
79
+ "lora_alpha": 32,
80
+ "lora_dropout": 0.05,
81
+ "target_modules": [
82
+ "q_proj",
83
+ "k_proj",
84
+ "v_proj",
85
+ "o_proj"
86
+ ],
87
+ "learning_rate": 0.0002,
88
+ "num_train_epochs": 3,
89
+ "per_device_train_batch_size": 8,
90
+ "gradient_accumulation_steps": 2,
91
+ "max_seq_length": 1024,
92
+ "bf16": true
93
+ },
94
+ "command": "python scripts/finetune_erp.py --backend hf",
95
+ "requirements": [
96
+ "torch",
97
+ "transformers>=4.44",
98
+ "peft",
99
+ "trl",
100
+ "accelerate",
101
+ "datasets"
102
+ ]
103
+ },
104
+ "base_model_for_production": "openbmb/MiniCPM3-4B",
105
+ "generated_at": 1781324653.616286
106
+ }
results/erp_sft.jsonl ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why is Q2 spend up?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
2
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Summarize our AP position.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
3
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Vendors most often paid after due date?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
4
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much do we owe in unpaid invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
5
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much did we invoice each month?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
6
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Spend grouped by category.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
7
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What drove the Q2 spend increase?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
8
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which products are low on stock?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
9
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "State of our AP overall?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
10
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What was total invoiced spend by month?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
11
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show monthly spend.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
12
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Categories ranked by spend.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
13
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Where does spend go by category?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
14
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total open AP balance?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
15
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Share of late payments?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
16
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Rank vendors by spend.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
17
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Overall on-time vs late rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
18
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "List our biggest suppliers.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
19
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Who are our worst late-paying vendors?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
20
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Return reason breakdown by money?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
21
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What should we reorder?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
22
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which items risk stockout?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
23
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors get the most of our money?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
24
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What are the leading causes of returns?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
25
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show spend by category.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
26
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Late-payment ratio across all invoices?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
27
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's running low in inventory?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
28
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Biggest return reasons by refund value?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
29
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How healthy are our payables?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
30
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which suppliers have payment delays?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
31
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which return reasons cost the most?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
32
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Break spend down per month.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
33
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Category spend totals?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
34
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Percentage of overdue payments overall?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
35
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "AP health check please.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
36
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which categories cost the most?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
37
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Monthly AP spend totals?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
38
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What caused the second-quarter cost jump?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
39
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Who keeps getting paid past terms?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
40
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Account for the Q2 2026 spend surge.", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
41
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's the state of accounts payable?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
42
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Our overall late payment percentage?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
43
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Inventory below reorder point?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
44
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What is the total value of open invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
45
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Top suppliers ranked by spend.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
46
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "List vendors by late-payment count.", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
47
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Late payers among our vendors?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
48
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total spend for each category?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
49
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What is the late-payment rate overall?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
50
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How are we doing on accounts payable?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
51
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show returns grouped by reason.", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
52
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Explain the spend spike in Q2.", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
53
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our late payment rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
54
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Break down returns by reason.", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
55
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Outstanding invoice value?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
56
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "List SKUs below reorder threshold.", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
57
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give an executive AP summary.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
58
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total of invoices not yet paid?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
59
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why did spend rise in Q2 2026?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
60
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our outstanding payables total?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
61
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Worst offenders for late payment?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
62
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Our highest-spend vendors?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
63
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much do we spend per product category?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
64
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why are products being returned?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
65
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Reason for higher spending in Q2 2026?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
66
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Vendors with the most overdue payments?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
67
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much is still open in payables?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
68
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Return reasons ranked by refunds?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
69
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Overall accounts payable summary?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
70
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give the global late payment percentage.", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
71
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors paid late most often?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
72
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors do we spend the most with?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
73
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Largest vendors please.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
74
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "SKUs needing replenishment?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
75
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How often do we pay late, as a rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
76
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Value of unpaid invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
77
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How much AP is still open?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
78
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Drivers of the Q2 spend rise?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
79
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show the five largest vendors.", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
80
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What drives our refunds?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
81
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What percent of invoices are paid late?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
82
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which suppliers cost us the most?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
83
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Explain why Q2 spend went up.", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
84
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Spend by period please.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
85
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Open invoice liability?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
86
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Overview of payables health?", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
87
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Summarize payables status.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
88
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Sum of open invoices?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
89
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What needs replenishing?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
90
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Top return reasons by refund amount?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
91
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give me an AP health overview.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
92
+ {"task": "nlq", "intent": "analytics", "template": "open_invoices", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Unpaid invoice amount overall?", "output": "SELECT ROUND(SUM(total),2) AS open_value, COUNT(*) AS n FROM invoices WHERE status='open'"}
93
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Stock positions under reorder point?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
94
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show vendors with frequent late payments.", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
95
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Top vendors by total spend?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
96
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which SKUs are below reorder point?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
97
+ {"task": "nlq", "intent": "summary", "template": "ap_health", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Summarize accounts payable health.", "output": "SELECT (SELECT COUNT(*) FROM invoices) AS invoices, (SELECT COUNT(*) FROM invoices WHERE status='open') AS open_invoices, (SELECT ROUND(AVG(days_to_pay),1) FROM invoices WHERE status='paid') AS avg_days_to_pay"}
98
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Spend per category please.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
99
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why did costs climb in Q2 2026?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
100
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Most costly return reasons?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
101
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which vendors are habitually overdue?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
102
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our spend over the months?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
103
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Break down spend across categories.", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
104
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's our category spend mix?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
105
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How has spend trended month to month?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
106
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Monthly invoiced spend trend?", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
107
+ {"task": "nlq", "intent": "analytics", "template": "late_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which suppliers do we pay late?", "output": "SELECT v.name, SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END) AS late_invoices, COUNT(i.invoice_id) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid' GROUP BY v.vendor_id HAVING late_invoices > 0 ORDER BY late_invoices DESC LIMIT 5"}
108
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Why was Q2 so expensive?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
109
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Total spend grouped by month.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
110
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Show items under their reorder level.", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
111
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "How bad is our late-payment rate?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
112
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Who are the top 5 vendors by spend?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
113
+ {"task": "nlq", "intent": "reasons", "template": "why_q2", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What's behind the Q2 increase?", "output": "SELECT period, account, ROUND(SUM(amount),2) AS spend FROM gl_entries WHERE period >= '2026-04' AND period <= '2026-06' GROUP BY period, account ORDER BY period, spend DESC"}
114
+ {"task": "nlq", "intent": "analytics", "template": "return_reasons", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Refund totals per return reason?", "output": "SELECT reason, COUNT(*) AS returns, ROUND(SUM(refund_amount),2) AS refunds FROM returns GROUP BY reason ORDER BY refunds DESC"}
115
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Plot spend per month.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
116
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_month", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Give me the month-by-month spend.", "output": "SELECT period, ROUND(SUM(amount),2) AS spend FROM gl_entries GROUP BY period ORDER BY period"}
117
+ {"task": "nlq", "intent": "analytics", "template": "below_reorder", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Which products fell below reorder?", "output": "SELECT i.sku, p.name, i.region, i.on_hand, i.reorder_point FROM inventory i JOIN products p ON p.sku=i.sku WHERE i.on_hand < i.reorder_point ORDER BY (i.reorder_point - i.on_hand) DESC LIMIT 15"}
118
+ {"task": "nlq", "intent": "analytics", "template": "spend_by_category", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Category-level spend breakdown?", "output": "SELECT p.category, ROUND(SUM(l.line_total),2) AS spend FROM po_lines l JOIN products p ON p.sku=l.sku JOIN purchase_orders po ON po.po_id=l.po_id WHERE po.status!='cancelled' GROUP BY p.category ORDER BY spend DESC"}
119
+ {"task": "nlq", "intent": "analytics", "template": "top_vendors", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "Biggest vendors by invoice value?", "output": "SELECT v.name, ROUND(SUM(i.total),2) AS spend, COUNT(*) AS invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id GROUP BY v.vendor_id ORDER BY spend DESC LIMIT 5"}
120
+ {"task": "nlq", "intent": "analytics", "template": "late_rate", "instruction": "Translate this ERP question into one SQLite SELECT over the warehouse schema.", "input": "What fraction of payments miss terms?", "output": "SELECT ROUND(100.0*SUM(CASE WHEN i.days_to_pay > CAST(substr(v.payment_terms,5) AS INT) THEN 1 ELSE 0 END)/COUNT(*),1) AS late_pct, COUNT(*) AS paid_invoices FROM invoices i JOIN vendors v ON v.vendor_id=i.vendor_id WHERE i.status='paid'"}
results/ocr_quality_report.json ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generated_at": 1781326743.1110458,
3
+ "note": "CER/WER vs .txt sidecar reference; field accuracy vs gt.json. sidecar = reference text source (CER\u22480 by construction).",
4
+ "models": [
5
+ {
6
+ "lab": "OpenBMB",
7
+ "name": "MiniCPM-V-4.6",
8
+ "params_b": 8.0,
9
+ "size_gb_int4": 5.5,
10
+ "modality": "vision-language (OCR + reasoning)",
11
+ "role": "OCR backend + LLM extractor",
12
+ "available": true
13
+ },
14
+ {
15
+ "lab": "OpenBMB",
16
+ "name": "MiniCPM-o-4.5",
17
+ "params_b": 8.0,
18
+ "size_gb_int4": 5.5,
19
+ "modality": "omni (vision/audio) VLM",
20
+ "role": "alt OCR/VLM",
21
+ "available": true
22
+ },
23
+ {
24
+ "lab": "OpenBMB",
25
+ "name": "MiniCPM3-4B",
26
+ "params_b": 4.0,
27
+ "size_gb_int4": 2.8,
28
+ "modality": "text LLM (reasoning + function-calling, 32k ctx)",
29
+ "role": "ERP reasoning \u00b7 NLQ\u2192SQL \u00b7 report summarization (fine-tune target)",
30
+ "available": true
31
+ },
32
+ {
33
+ "lab": "Cohere",
34
+ "name": "Aya-Vision-8B",
35
+ "params_b": 8.0,
36
+ "size_gb_int4": 6.0,
37
+ "modality": "vision-language (OCR/VQA, 23 langs)",
38
+ "role": "OCR backend",
39
+ "available": false
40
+ },
41
+ {
42
+ "lab": "Cohere",
43
+ "name": "Aya-Vision-32B",
44
+ "params_b": 32.0,
45
+ "size_gb_int4": 18.0,
46
+ "modality": "vision-language (OCR/VQA)",
47
+ "role": "alt OCR backend (max-quality small)",
48
+ "available": false
49
+ },
50
+ {
51
+ "lab": "Cohere",
52
+ "name": "Command R7B",
53
+ "params_b": 7.0,
54
+ "size_gb_int4": 5.0,
55
+ "modality": "text LLM (RAG + tool-use + reasoning, 128k ctx)",
56
+ "role": "ERP RAG \u00b7 NLQ \u00b7 grounded reasoning",
57
+ "available": false
58
+ },
59
+ {
60
+ "lab": "Black Forest Labs",
61
+ "name": "FLUX.1 [dev]",
62
+ "params_b": 12.0,
63
+ "size_gb_int4": 12.0,
64
+ "modality": "text-to-image GENERATION",
65
+ "role": "synthetic test-document generator (not OCR)",
66
+ "available": false
67
+ },
68
+ {
69
+ "lab": "Black Forest Labs",
70
+ "name": "FLUX.1 [schnell]",
71
+ "params_b": 12.0,
72
+ "size_gb_int4": 12.0,
73
+ "modality": "text-to-image GENERATION (fast)",
74
+ "role": "synthetic test-document generator",
75
+ "available": false
76
+ }
77
+ ],
78
+ "backends": [
79
+ {
80
+ "backend": "minicpm",
81
+ "model": "MiniCPM-V-4.6",
82
+ "params_b": 8.0,
83
+ "size_gb": 5.5,
84
+ "lab": "OpenBMB",
85
+ "is_reference": false,
86
+ "cer": 0.0262,
87
+ "wer": 0.0876,
88
+ "field_exact_match": 0.907,
89
+ "field_f1": 0.9397,
90
+ "avg_latency_ms": 6524.8167,
91
+ "avg_cost_usd": 0.0002,
92
+ "samples_scored": 6,
93
+ "per_sample": [
94
+ {
95
+ "sample": "invoice_scanned_basic",
96
+ "cer": 0.0,
97
+ "wer": 0.0,
98
+ "field_exact": 0.889,
99
+ "field_f1": 0.9,
100
+ "latency_ms": 5560.8,
101
+ "cost_usd": 0.0001952,
102
+ "confidence": 0.7
103
+ },
104
+ {
105
+ "sample": "receipt_scanned",
106
+ "cer": 0.0942,
107
+ "wer": 0.3103,
108
+ "field_exact": 1.0,
109
+ "field_f1": 1.0,
110
+ "latency_ms": 4218.7,
111
+ "cost_usd": 0.0001883,
112
+ "confidence": 0.98
113
+ },
114
+ {
115
+ "sample": "po_scanned",
116
+ "cer": 0.0368,
117
+ "wer": 0.1277,
118
+ "field_exact": 1.0,
119
+ "field_f1": 1.0,
120
+ "latency_ms": 4404.9,
121
+ "cost_usd": 0.0001835,
122
+ "confidence": 0.98
123
+ },
124
+ {
125
+ "sample": "contract_scanned",
126
+ "cer": 0.0,
127
+ "wer": 0.0,
128
+ "field_exact": 0.636,
129
+ "field_f1": 0.8,
130
+ "latency_ms": 6532.2,
131
+ "cost_usd": 0.000166,
132
+ "confidence": 0.98
133
+ },
134
+ {
135
+ "sample": "subscription_memo_scanned",
136
+ "cer": 0.0,
137
+ "wer": 0.0,
138
+ "field_exact": 0.917,
139
+ "field_f1": 0.938,
140
+ "latency_ms": 5010.4,
141
+ "cost_usd": 0.0001924,
142
+ "confidence": 0.98
143
+ },
144
+ {
145
+ "sample": "complex_invoice_messy",
146
+ "cer": null,
147
+ "wer": null,
148
+ "field_exact": 1.0,
149
+ "field_f1": 1.0,
150
+ "latency_ms": 13421.9,
151
+ "cost_usd": 0.0004414,
152
+ "confidence": 0.98
153
+ }
154
+ ]
155
+ },
156
+ {
157
+ "backend": "tesseract",
158
+ "model": "tesseract",
159
+ "params_b": null,
160
+ "size_gb": null,
161
+ "lab": "classic",
162
+ "is_reference": false,
163
+ "cer": 0.1468,
164
+ "wer": 0.1848,
165
+ "field_exact_match": 0.907,
166
+ "field_f1": 0.9397,
167
+ "avg_latency_ms": 3436.8667,
168
+ "avg_cost_usd": 0.0001,
169
+ "samples_scored": 6,
170
+ "per_sample": [
171
+ {
172
+ "sample": "invoice_scanned_basic",
173
+ "cer": 0.1225,
174
+ "wer": 0.1389,
175
+ "field_exact": 0.889,
176
+ "field_f1": 0.9,
177
+ "latency_ms": 3698.6,
178
+ "cost_usd": 0.0001242,
179
+ "confidence": 0.68
180
+ },
181
+ {
182
+ "sample": "receipt_scanned",
183
+ "cer": 0.4555,
184
+ "wer": 0.5172,
185
+ "field_exact": 1.0,
186
+ "field_f1": 1.0,
187
+ "latency_ms": 2861.1,
188
+ "cost_usd": 0.0001207,
189
+ "confidence": 0.96
190
+ },
191
+ {
192
+ "sample": "po_scanned",
193
+ "cer": 0.0951,
194
+ "wer": 0.1489,
195
+ "field_exact": 1.0,
196
+ "field_f1": 1.0,
197
+ "latency_ms": 3390.6,
198
+ "cost_usd": 0.000118,
199
+ "confidence": 0.96
200
+ },
201
+ {
202
+ "sample": "contract_scanned",
203
+ "cer": 0.0,
204
+ "wer": 0.0,
205
+ "field_exact": 0.636,
206
+ "field_f1": 0.8,
207
+ "latency_ms": 2336.4,
208
+ "cost_usd": 7.97e-05,
209
+ "confidence": 0.96
210
+ },
211
+ {
212
+ "sample": "subscription_memo_scanned",
213
+ "cer": 0.061,
214
+ "wer": 0.119,
215
+ "field_exact": 0.917,
216
+ "field_f1": 0.938,
217
+ "latency_ms": 2243.8,
218
+ "cost_usd": 0.0001211,
219
+ "confidence": 0.96
220
+ },
221
+ {
222
+ "sample": "complex_invoice_messy",
223
+ "cer": null,
224
+ "wer": null,
225
+ "field_exact": 1.0,
226
+ "field_f1": 1.0,
227
+ "latency_ms": 6090.7,
228
+ "cost_usd": 0.0002828,
229
+ "confidence": 0.96
230
+ }
231
+ ]
232
+ },
233
+ {
234
+ "backend": "sidecar",
235
+ "model": "sidecar",
236
+ "params_b": null,
237
+ "size_gb": null,
238
+ "lab": "classic",
239
+ "is_reference": true,
240
+ "cer": 0.0,
241
+ "wer": 0.0,
242
+ "field_exact_match": 0.907,
243
+ "field_f1": 0.9397,
244
+ "avg_latency_ms": 3235.3167,
245
+ "avg_cost_usd": 0.0001,
246
+ "samples_scored": 6,
247
+ "per_sample": [
248
+ {
249
+ "sample": "invoice_scanned_basic",
250
+ "cer": 0.0,
251
+ "wer": 0.0,
252
+ "field_exact": 0.889,
253
+ "field_f1": 0.9,
254
+ "latency_ms": 1697.6,
255
+ "cost_usd": 9.45e-05,
256
+ "confidence": 0.66
257
+ },
258
+ {
259
+ "sample": "receipt_scanned",
260
+ "cer": 0.0,
261
+ "wer": 0.0,
262
+ "field_exact": 1.0,
263
+ "field_f1": 1.0,
264
+ "latency_ms": 2126.2,
265
+ "cost_usd": 0.0001212,
266
+ "confidence": 0.94
267
+ },
268
+ {
269
+ "sample": "po_scanned",
270
+ "cer": 0.0,
271
+ "wer": 0.0,
272
+ "field_exact": 1.0,
273
+ "field_f1": 1.0,
274
+ "latency_ms": 2194.4,
275
+ "cost_usd": 0.0001184,
276
+ "confidence": 0.94
277
+ },
278
+ {
279
+ "sample": "contract_scanned",
280
+ "cer": 0.0,
281
+ "wer": 0.0,
282
+ "field_exact": 0.636,
283
+ "field_f1": 0.8,
284
+ "latency_ms": 1522.1,
285
+ "cost_usd": 7.97e-05,
286
+ "confidence": 0.94
287
+ },
288
+ {
289
+ "sample": "subscription_memo_scanned",
290
+ "cer": 0.0,
291
+ "wer": 0.0,
292
+ "field_exact": 0.917,
293
+ "field_f1": 0.938,
294
+ "latency_ms": 1632.7,
295
+ "cost_usd": 9.16e-05,
296
+ "confidence": 0.94
297
+ },
298
+ {
299
+ "sample": "complex_invoice_messy",
300
+ "cer": null,
301
+ "wer": null,
302
+ "field_exact": 1.0,
303
+ "field_f1": 1.0,
304
+ "latency_ms": 10238.9,
305
+ "cost_usd": 0.0002297,
306
+ "confidence": 0.98
307
+ }
308
+ ]
309
+ }
310
+ ],
311
+ "best_ocr_quality": "minicpm",
312
+ "best_document_analysis": "minicpm"
313
+ }
scripts/finetune_erp.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Fine-tune a small model on the simulated ERP domain — two backends, one dataset.
3
+
4
+ python scripts/finetune_erp.py # offline CPU demo (default) — runs anywhere
5
+ python scripts/finetune_erp.py --backend hf # real LoRA on OpenBMB MiniCPM3-4B (needs GPU)
6
+
7
+ Outputs (committed/published):
8
+ backend/finetune/erp_sft.jsonl instruction-tuning dataset from the ERP KB
9
+ backend/finetune/erp_finetune_report.json before→after metrics + loss curve (served at
10
+ /api/erp/finetune-report and shown in the UI)
11
+ backend/finetune/runs/<backend>_<ts>.json per-run snapshot
12
+
13
+ The `hf` backend builds the exact PEFT/TRL SFTTrainer config for MiniCPM3-4B and (if
14
+ torch+peft+trl are installed and a GPU is present) runs it; otherwise it writes the
15
+ ready-to-run recipe so it can be launched on a GPU box / HF Space / Colab unchanged.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import sys
22
+ import time
23
+ from pathlib import Path
24
+
25
+ ROOT = Path(__file__).resolve().parent.parent
26
+ sys.path.insert(0, str(ROOT / "backend"))
27
+
28
+ from app.config import get_settings # noqa: E402
29
+ from app.erp.finetune import build_dataset, run_offline_finetune # noqa: E402
30
+
31
+ FT_DIR = ROOT / "backend" / "finetune"
32
+ BASE_MODEL = "openbmb/MiniCPM3-4B"
33
+
34
+
35
def _lora_recipe(jsonl: Path) -> dict:
    """Describe the production LoRA SFT run of MiniCPM3-4B on the ERP dataset.

    Args:
        jsonl: Path of the instruction-tuning JSONL file. It is recorded
            relative to ``ROOT``, so it must live under the repository root
            (``Path.relative_to`` raises ``ValueError`` otherwise).

    Returns:
        A JSON-serialisable dict holding the base model id, training method,
        dataset path, prompt template, hyperparameters, launch command and
        the package requirements.
    """
    # Adapter (LoRA) knobs come first, trainer knobs second; the merged dict
    # keeps this insertion order when the recipe is serialised.
    adapter_knobs = {
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    }
    trainer_knobs = {
        "learning_rate": 2e-4,
        "num_train_epochs": 3,
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 2,
        "max_seq_length": 1024,
        "bf16": True,
    }
    dataset_path = str(jsonl.relative_to(ROOT))

    recipe = {"base_model": BASE_MODEL}
    recipe["method"] = "LoRA (PEFT) supervised fine-tuning (TRL SFTTrainer)"
    recipe["dataset"] = dataset_path
    recipe["prompt_template"] = "{instruction}\n\nERP question: {input}\nSQL:"
    recipe["hyperparams"] = {**adapter_knobs, **trainer_knobs}
    recipe["command"] = "python scripts/finetune_erp.py --backend hf"
    recipe["requirements"] = [
        "torch", "transformers>=4.44", "peft", "trl", "accelerate", "datasets",
    ]
    return recipe
51
+
52
+
53
def _run_hf(jsonl: Path, settings) -> dict:
    """Run a real LoRA SFT if the training stack imports; else emit the recipe.

    Args:
        jsonl: Path to the instruction-tuning dataset; each record must carry
            ``instruction``, ``input`` and ``output`` keys.
        settings: Application settings. Not read by this function; kept so the
            call signature stays unchanged.

    Returns:
        When any training dependency fails to import:
        ``{"backend", "ran": False, "reason", "recipe", "note"}``.
        After a completed run:
        ``{"backend", "ran": True, "adapter_dir", "train_loss", "recipe"}``.
    """
    recipe = _lora_recipe(jsonl)
    try:
        import torch
        from datasets import load_dataset
        from peft import LoraConfig
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from trl import SFTConfig, SFTTrainer
    except Exception as e:
        # Deliberately broad: a broken native install can fail with errors
        # other than ImportError, and every such case should degrade to
        # "recipe only" rather than crash the script.
        return {"backend": "hf", "ran": False, "reason": f"training stack unavailable ({e})",
                "recipe": recipe,
                "note": "Dataset + recipe are ready; launch on a GPU box to fine-tune MiniCPM3-4B."}

    # Single source of truth: everything below is read from the published
    # recipe so the report always matches what was actually run.
    hp = recipe["hyperparams"]
    has_cuda = torch.cuda.is_available()

    # NOTE(security): trust_remote_code=True executes Python shipped in the
    # model repository; it is only applied to the fixed BASE_MODEL id.
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, trust_remote_code=True,
        torch_dtype=torch.bfloat16 if has_cuda else torch.float32,
        device_map="auto")
    ds = load_dataset("json", data_files=str(jsonl), split="train")

    # A tokenizer without an EOS token would otherwise append the literal
    # text "None" to every training sample.
    eos = tok.eos_token or ""

    def fmt(ex):
        return {"text": f"{ex['instruction']}\n\nERP question: {ex['input']}\nSQL: {ex['output']}{eos}"}

    ds = ds.map(fmt)
    out = FT_DIR / "runs" / f"hf_{time.strftime('%Y%m%dT%H%M%S')}"

    # The recipe stores the rank as "lora_r", but PEFT's LoraConfig names the
    # field "r"; passing lora_r=... raises TypeError.
    lora_cfg = LoraConfig(
        r=hp["lora_r"],
        lora_alpha=hp["lora_alpha"],
        lora_dropout=hp["lora_dropout"],
        target_modules=hp["target_modules"],
        task_type="CAUSAL_LM")
    sft_args = SFTConfig(
        output_dir=str(out),
        num_train_epochs=hp["num_train_epochs"],
        per_device_train_batch_size=hp["per_device_train_batch_size"],
        gradient_accumulation_steps=hp["gradient_accumulation_steps"],
        learning_rate=hp["learning_rate"],
        logging_steps=10,
        max_seq_length=hp["max_seq_length"],
        # bf16 is only enabled when CUDA is available.
        bf16=bool(hp["bf16"]) and has_cuda)

    trainer = SFTTrainer(model=model, train_dataset=ds, peft_config=lora_cfg, args=sft_args)
    res = trainer.train()
    trainer.save_model(str(out))
    return {"backend": "hf", "ran": True, "adapter_dir": str(out),
            "train_loss": float(getattr(res, "training_loss", 0.0)), "recipe": recipe}
98
+
99
+
100
def main() -> None:
    """CLI entry point: build the ERP SFT dataset, train, publish, print a readout.

    Flags:
        --backend {local,hf}: ``local`` (default) runs the offline trainer;
            ``hf`` calls ``_run_hf`` and additionally runs the offline trainer
            so the published report always carries metrics.
        --epochs N: epoch count passed to the offline trainer (default 400).

    Writes ``erp_sft.jsonl``, ``erp_finetune_report.json`` and a per-run
    snapshot ``runs/<backend>_<timestamp>.json`` under ``FT_DIR``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--backend", choices=["local", "hf"], default="local")
    ap.add_argument("--epochs", type=int, default=400)
    args = ap.parse_args()

    settings = get_settings()
    FT_DIR.mkdir(parents=True, exist_ok=True)
    (FT_DIR / "runs").mkdir(exist_ok=True)

    # 1) build + write the shared instruction-tuning dataset
    data = build_dataset()
    jsonl = FT_DIR / "erp_sft.jsonl"
    jsonl.write_text("\n".join(json.dumps(r) for r in data) + "\n", encoding="utf-8")

    # 2) train
    if args.backend == "local":
        result = run_offline_finetune(settings, epochs=args.epochs)
        result["backend"] = "local"
        result["dataset_jsonl"] = str(jsonl.relative_to(ROOT))
        result["production_recipe"] = _lora_recipe(jsonl)
    else:
        result = _run_hf(jsonl, settings)
        # always include the offline metrics too, so the UI has a populated report
        result["offline_demo"] = run_offline_finetune(settings, epochs=args.epochs)

    result["base_model_for_production"] = BASE_MODEL
    result["generated_at"] = time.time()

    # 3) publish: serialise once so the report and the snapshot are identical
    payload = json.dumps(result, indent=2)
    report = FT_DIR / "erp_finetune_report.json"
    report.write_text(payload, encoding="utf-8")
    snap = FT_DIR / "runs" / f"{args.backend}_{time.strftime('%Y%m%dT%H%M%S')}.json"
    snap.write_text(payload, encoding="utf-8")

    # 4) print a readout (offline metrics live under "offline_demo" for hf runs)
    r = result if args.backend == "local" else result.get("offline_demo", {})
    print("\n" + "=" * 78)
    print(f" ERP DOMAIN FINE-TUNE (backend: {args.backend})")
    print("=" * 78)
    print(f" dataset : {len(data)} examples → {jsonl.relative_to(ROOT)}")
    print(f" production target : {BASE_MODEL} (LoRA recipe emitted)")
    if args.backend == "hf":
        # Make it visible whether the LoRA run happened or was skipped.
        if result.get("ran"):
            print(f" hf LoRA run : trained → {result.get('adapter_dir')} "
                  f"(train loss={result.get('train_loss')})")
        else:
            print(f" hf LoRA run : skipped ({result.get('reason')})")
    if r:
        print(f" offline trainer : {r['model']}")
        print(f" classes={r['n_classes']} train={r['train']} test={r['test']} params={r['trainable_params']:,}")
        print(f" BEFORE test-acc : {r['before_test_accuracy']*100:5.1f}%")
        # "+.1f" prints the real sign, so a regression no longer shows as "+-x.x".
        print(f" AFTER test-acc : {r['after_test_accuracy']*100:5.1f}% ({r['accuracy_gain']*100:+.1f} pts)")
        print(f" routed-SQL exec : {r['routed_sql_exec_rate']*100:.1f}% final loss={r['final_loss']}")
    print(f" published : {report.relative_to(ROOT)}")
    print("=" * 78 + "\n")
150
+
151
+
152
+ if __name__ == "__main__":
153
+ main()
scripts/generate_extreme_docs.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Generate EXTREMELY hard OCR documents — embedded images + heavy degradation:
3
+
4
+ 1. extreme_receipt_photo — thermal receipt PHOTOGRAPHED on a desk: perspective
5
+ warp, uneven lighting, shadow, crinkle lines, faded thermal band, printed logo.
6
+ 2. extreme_po_collage — image-heavy purchase order: product THUMBNAIL IMAGES
7
+ in table rows, QR code, barcode, rotated APPROVED stamp over the table,
8
+ signature scribble, misaligned columns.
9
+ 3. extreme_contract_fax — dense two-column contract received BY FAX: low
10
+ contrast, salt-and-pepper noise, skew, scanline streaks, punch-hole shadows,
11
+ handwritten blue margin note, red RECEIVED stamp.
12
+
13
+ Each writes <id>.png + <id>.gt.json + <id>.txt (sidecar reference text, drawn from
14
+ the SAME strings as the image so CER/WER is fair). All are tagged skip_eval so the
15
+ main eval harness is unchanged; the OCR quality benchmark picks them up.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import math
21
+ import random
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ from PIL import Image, ImageDraw, ImageFilter, ImageFont
26
+
27
+ ROOT = Path(__file__).resolve().parent.parent
28
+ OUT = ROOT / "backend" / "evals" / "datasets"
29
+ rng = random.Random(42)
30
+
31
+
32
def font(sz, bold=False, mono=False):
    """Return a TrueType font at size ``sz``, trying several known font files.

    Candidates are tried in order and the first one Pillow can open wins:
    Courier New (only when ``mono`` is set), then the macOS Arial / Helvetica
    files, then the DejaVu files commonly installed on Linux.  The macOS
    entries come first, so machines that already resolved one of them keep
    getting exactly the same face as before.

    Args:
        sz: Font size handed to ``ImageFont.truetype``.
        bold: Prefer the bold sans-serif face.
        mono: Put the monospaced candidates ahead of the sans-serif ones.

    Returns:
        The first font that loads, or ``ImageFont.load_default()`` when none
        of the candidate files can be opened.
    """
    candidates = []
    if mono:
        candidates += [
            "/System/Library/Fonts/Supplemental/Courier New Bold.ttf",
            "/System/Library/Fonts/Supplemental/Courier New.ttf",
        ]
    candidates += [
        "/System/Library/Fonts/Supplemental/Arial Bold.ttf" if bold else "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Helvetica.ttc",
        "/Library/Fonts/Arial.ttf",
    ]
    # Linux fallbacks: without these every call on a non-macOS host dropped to
    # the default font, which ignores ``sz`` and wrecks the document layout.
    if mono:
        candidates += [
            "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
        ]
    candidates.append(
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf" if bold
        else "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    )
    for path in candidates:
        try:
            return ImageFont.truetype(path, sz)
        except (OSError, ImportError):
            # Font file missing/unreadable, or FreeType support unavailable.
            continue
    return ImageFont.load_default()
45
+
46
+
47
+ def _find_coeffs(dst, src):
48
+ """Perspective coefficients so that src corners land on dst corners."""
49
+ A, B = [], []
50
+ for (x, y), (u, v) in zip(dst, src):
51
+ A.append([x, y, 1, 0, 0, 0, -u * x, -u * y])
52
+ A.append([0, 0, 0, x, y, 1, -v * x, -v * y])
53
+ B.extend([u, v])
54
+ res, *_ = np.linalg.lstsq(np.array(A, float), np.array(B, float), rcond=None)
55
+ return res.tolist()
56
+
57
+
58
def stamp(text, color, angle, size=(360, 120), fsz=34):
    """Build a rotated, semi-transparent rubber-stamp overlay.

    Draws a rounded outline and horizontally centred bold ``text`` on a
    transparent RGBA canvas of ``size``, both in ``color`` with alpha 190,
    then rotates the canvas by ``angle`` degrees (expanding it to fit).

    Args:
        text: Caption drawn inside the frame.
        color: RGB tuple; the alpha component is appended here.
        angle: Rotation passed to ``Image.rotate``.
        size: ``(width, height)`` of the canvas before rotation.
        fsz: Font size of the caption.

    Returns:
        The rotated RGBA image.
    """
    ink = color + (190,)
    canvas = Image.new("RGBA", size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    frame = [4, 4, size[0] - 4, size[1] - 4]
    pen.rounded_rectangle(frame, radius=16, outline=ink, width=5)

    face = font(fsz, bold=True)
    text_w = pen.textlength(text, font=face)
    origin = ((size[0] - text_w) / 2, (size[1] - fsz) / 2 - 6)
    pen.text(origin, text, font=face, fill=ink)

    return canvas.rotate(angle, expand=True, resample=Image.BICUBIC)
66
+
67
+
68
def signature(w=220, h=60, color=(25, 30, 120)):
    """Draw a wavy pseudo-signature on a transparent RGBA canvas.

    The stroke is a 60-point damped sine wave running left to right, with a
    small vertical jitter drawn from the module-level ``rng`` for each point
    (so every call advances that generator by 60 draws).

    Args:
        w: Canvas width.
        h: Canvas height.
        color: RGB tuple of the stroke; alpha 230 is appended here.

    Returns:
        The RGBA image containing the scribble.
    """
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))

    def _point(step):
        # Jitter is drawn once per point, in left-to-right order.
        t = step / 59
        px = 8 + t * (w - 16)
        py = h / 2 + math.sin(t * 14 + 1.3) * (h / 3) * (1 - 0.5 * t) + rng.uniform(-2, 2)
        return (px, py)

    stroke = [_point(step) for step in range(60)]
    ImageDraw.Draw(canvas).line(stroke, fill=color + (230,), width=3, joint="curve")
    return canvas
79
+
80
+
81
+ # ── 1. extreme_receipt_photo ──────────────────────────────────────────────────
82
+ R_LINES = [
83
+ "BREW & BEAN COFFEE Co.",
84
+ "412 Harbor Lane, Portland OR",
85
+ "Receipt #R-88341 Reg 02",
86
+ "Date: 2026-06-02 14:37",
87
+ "Currency: USD",
88
+ "--------------------------------",
89
+ "Flat White 2 x 4.75 9.50",
90
+ "Butter Croissant 3 x 3.25 9.75",
91
+ "Cold Brew Growler 1 x 14.00 14.00",
92
+ "Loyalty discount -2.50",
93
+ "--------------------------------",
94
+ "Subtotal 30.75",
95
+ "Tax 8.8% 2.71",
96
+ "TOTAL 33.46",
97
+ "Payment: VISA ****4421",
98
+ "--------------------------------",
99
+ "Thank you! brewandbean.example",
100
+ ]
101
+ R_GT = {
102
+ "doc_type": "receipt",
103
+ "merchant": "BREW & BEAN COFFEE Co.",
104
+ "date": "2026-06-02",
105
+ "currency": "USD",
106
+ "subtotal": 30.75,
107
+ "tax_amount": 2.71,
108
+ "total": 33.46,
109
+ "payment_method": "VISA ****4421",
110
+ "line_items": [
111
+ {"description": "Flat White", "quantity": 2, "unit_price": 4.75, "line_total": 9.50},
112
+ {"description": "Butter Croissant", "quantity": 3, "unit_price": 3.25, "line_total": 9.75},
113
+ {"description": "Cold Brew Growler", "quantity": 1, "unit_price": 14.00, "line_total": 14.00},
114
+ ],
115
+ "_meta": {"doc_type": "receipt", "channel": "photo", "difficulty": "extreme", "skip_eval": True},
116
+ }
117
+
118
+
119
def gen_receipt():
    """Render the photographed-receipt sample and write its three files.

    Draws the receipt (logo + ``R_LINES``), adds crinkle lines and a faded
    band, perspective-warps it onto a wood-grain desk with a drop shadow,
    applies uneven lighting, noise and a slight blur, then writes
    ``extreme_receipt_photo.png``, ``.txt`` (the ``R_LINES`` text) and
    ``.gt.json`` (``R_GT``) into ``OUT``.
    """
    paper_w, paper_h = 560, 1010
    logo_ink = (60, 50, 45)
    receipt = Image.new("RGBA", (paper_w, paper_h), (250, 248, 242, 255))
    pen = ImageDraw.Draw(receipt)

    # Printed logo: filled coffee-cup glyph inside a ring.
    cx, cy = paper_w // 2, 64
    pen.ellipse([cx - 44, cy - 44, cx + 44, cy + 44], outline=logo_ink, width=4)
    pen.rounded_rectangle([cx - 20, cy - 14, cx + 14, cy + 22], radius=5, fill=logo_ink)
    pen.arc([cx + 8, cy - 8, cx + 30, cy + 14], 270, 90, fill=logo_ink, width=4)

    # Item / totals / payment rows start at x=28; every other row is centred.
    mono = font(24, mono=True)
    left_aligned = ("Flat", "Butter", "Cold", "Loyal", "Subt", "Tax", "TOTAL", "Paym")
    for row, text in enumerate(R_LINES):
        text_w = pen.textlength(text, font=mono)
        if text.startswith(left_aligned):
            text_x = 28
        else:
            text_x = (paper_w - text_w) / 2
        pen.text((text_x, 130 + 36 * row), text, font=mono, fill=(40, 38, 36))
    pen.line([(0, paper_h - 14), (paper_w, paper_h - 6)], fill=(250, 248, 242, 0))  # keep bottom edge clean

    # Crinkle lines: seven roughly vertical creases across the paper.
    for _ in range(7):
        top_x = rng.randint(0, paper_w)
        drift = rng.randint(-90, 90)
        pen.line([(top_x, 0), (top_x + drift, paper_h)], fill=(208, 204, 196, 90), width=2)

    # Faded thermal band: blend the colour channels of rows 430..559 toward white.
    pixels = np.asarray(receipt).astype(np.float32)
    band_top, band_bottom = 430, 560
    band = pixels[band_top:band_bottom, :, :3]
    pixels[band_top:band_bottom, :, :3] = band + (255 - band) * 0.55
    receipt = Image.fromarray(pixels.astype(np.uint8))

    # Desk background with wood grain.
    W, H = 1000, 1400
    desk = Image.new("RGB", (W, H), (96, 74, 54))
    grain = ImageDraw.Draw(desk)
    for gy in range(0, H, 7):
        tilt = rng.randint(-3, 3)
        tone = (96 + rng.randint(-10, 8), 74 + rng.randint(-8, 6), 54 + rng.randint(-6, 6))
        grain.line([(0, gy), (W, gy + tilt)], fill=tone, width=3)

    # Blurred shadow under the receipt, pasted as black through an alpha mask.
    shadow = Image.new("RGBA", (W, H), (0, 0, 0, 0))
    ImageDraw.Draw(shadow).polygon([(232, 152), (798, 198), (742, 1292), (172, 1232)], fill=(0, 0, 0, 110))
    soft_shadow = shadow.filter(ImageFilter.GaussianBlur(18))
    desk.paste(Image.new("RGB", (W, H), 0), (0, 0), soft_shadow)

    # Perspective-warp the receipt onto the desk.
    quad = [(248, 138), (786, 186), (730, 1276), (188, 1218)]
    corners = [(0, 0), (paper_w, 0), (paper_w, paper_h), (0, paper_h)]
    coeffs = _find_coeffs(quad, corners)
    warped = receipt.transform((W, H), Image.PERSPECTIVE, coeffs, Image.BICUBIC)
    desk.paste(warped, (0, 0), warped)

    # Uneven lighting (bright top-left, dim bottom-right) plus a vignette.
    photo = np.asarray(desk).astype(np.float32)
    rows_idx, cols_idx = np.mgrid[0:H, 0:W]
    light = 1.12 - 0.32 * ((cols_idx / W) * 0.6 + (rows_idx / H) * 0.4)
    r2 = ((cols_idx - W / 2) / (W / 2)) ** 2 + ((rows_idx - H / 2) / (H / 2)) ** 2
    light *= 1 - 0.18 * np.clip(r2 - 0.45, 0, 1)
    photo *= light[..., None]
    photo += np.random.default_rng(7).normal(0, 4.5, photo.shape)
    final = Image.fromarray(np.clip(photo, 0, 255).astype(np.uint8))
    final = final.filter(ImageFilter.GaussianBlur(0.6))

    final.save(OUT / "extreme_receipt_photo.png")
    (OUT / "extreme_receipt_photo.txt").write_text("\n".join(R_LINES) + "\n")
    (OUT / "extreme_receipt_photo.gt.json").write_text(json.dumps(R_GT, indent=2))
175
+
176
+
177
+ # ── 2. extreme_po_collage ─────────────────────────────────────────────────────
178
+ PO_ITEMS = [
179
+ ("SHELF UNIT S-200 heavy gauge", 24, 189.00, 4536.00),
180
+ ("LED STRIP 2m retail white", 60, 22.40, 1344.00),
181
+ ("ENDCAP DISPLAY birch finish", 12, 310.00, 3720.00),
182
+ ]
183
+ PO_GT = {
184
+ "doc_type": "purchase_order",
185
+ "order_number": "PO-77RX-3309",
186
+ "order_date": "2026-05-21",
187
+ "delivery_date": "2026-06-15",
188
+ "vendor_name": "Nordic Fixture Works AB",
189
+ "buyer_name": "Aperture Retail Group",
190
+ "ship_to": "DC-7, 4420 Logistics Pkwy, Columbus OH",
191
+ "currency": "USD",
192
+ "payment_terms": "Net 45",
193
+ "subtotal": 9600.00,
194
+ "tax_amount": 792.00,
195
+ "total": 10392.00,
196
+ "line_items": [{"description": d_, "quantity": q, "unit_price": u, "line_total": t}
197
+ for d_, q, u, t in PO_ITEMS],
198
+ "_meta": {"doc_type": "purchase_order", "channel": "scanned", "difficulty": "extreme", "skip_eval": True},
199
+ }
200
+
201
+
202
def _thumb(kind):
    """Return a 76x76 RGB product thumbnail.

    ``kind`` 0 draws a shelf unit, 1 an LED strip, and any other value an
    endcap display; every thumbnail gets a thin grey border.
    """
    tile = Image.new("RGB", (76, 76), (235, 238, 242))
    pen = ImageDraw.Draw(tile)
    if kind == 0:  # shelf unit: four shelves between two uprights
        for shelf in range(4):
            top = 12 + shelf * 15
            pen.rectangle([10, top, 66, top + 6], fill=(120, 128, 140))
        for upright_x in (12, 64):
            pen.line([(upright_x, 12), (upright_x, 66)], fill=(80, 86, 96), width=3)
    elif kind == 1:  # LED strip: dark bar with a row of lit dots
        pen.rounded_rectangle([8, 30, 68, 46], radius=8, fill=(60, 64, 70))
        for led_x in range(14, 66, 9):
            pen.ellipse([led_x, 34, led_x + 6, 42], fill=(255, 240, 160))
    else:  # endcap display: tapered body with two shelves
        pen.polygon([(14, 64), (26, 14), (50, 14), (62, 64)], fill=(196, 164, 120))
        for shelf_box in ([20, 40, 56, 46], [24, 26, 52, 32]):
            pen.rectangle(shelf_box, fill=(160, 128, 88))
    pen.rectangle([0, 0, 75, 75], outline=(150, 150, 150))
    return tile
220
+
221
+
222
+ def _qr(d, x, y, n=21, cell=5):
223
+ g = random.Random(9)
224
+ for r in range(n):
225
+ for c in range(n):
226
+ if g.random() < 0.45:
227
+ d.rectangle([x + c * cell, y + r * cell, x + c * cell + cell - 1, y + r * cell + cell - 1], fill=0)
228
+ for fx, fy in [(0, 0), (n - 7, 0), (0, n - 7)]: # finder squares
229
+ d.rectangle([x + fx * cell, y + fy * cell, x + (fx + 7) * cell, y + (fy + 7) * cell], outline=0, width=3)
230
+ d.rectangle([x + (fx + 2) * cell, y + (fy + 2) * cell, x + (fx + 5) * cell, y + (fy + 5) * cell], fill=0)
231
+
232
+
233
def gen_po():
    """Render the image-heavy purchase-order sample and write its three files.

    Draws the header (logo, vendor, QR look-alike, metadata box), the line-item
    table with product thumbnails and per-row column misalignment, totals, a
    barcode, a signature and a rotated APPROVED stamp; adds scan noise and a
    small skew; then writes ``extreme_po_collage.png``, ``.txt`` and
    ``.gt.json`` (``PO_GT``) into ``OUT``.

    The sidecar text contains non-ASCII characters ("·" and "—"), so the text
    files are written as explicit UTF-8 instead of the platform's locale
    encoding.
    """
    W, H = 1240, 1600
    im = Image.new("RGB", (W, H), (252, 252, 250))
    d = ImageDraw.Draw(im)
    h1, h2, h3, body, small = font(40, True), font(22, True), font(18, True), font(19), font(15)

    # Header: drawn logo + vendor (left), meta box (right), QR top-right corner.
    d.rectangle([40, 40, 120, 120], fill=(30, 90, 160))
    d.polygon([(52, 108), (80, 52), (108, 108)], fill=(252, 252, 250))
    d.text((136, 48), "Nordic Fixture Works AB", font=h2, fill=(20, 20, 30))
    d.text((136, 80), "Industrigatan 14, Malmo SE · VAT SE5566778899", font=small, fill=(90, 90, 100))
    d.text((40, 150), "PURCHASE ORDER", font=h1, fill=(30, 90, 160))
    _qr(d, 1060, 40)
    meta = [("PO Number:", "PO-77RX-3309"), ("Order Date:", "2026-05-21"),
            ("Delivery Date:", "2026-06-15"), ("Payment Terms:", "Net 45"), ("Currency:", "USD")]
    d.rounded_rectangle([720, 150, 1200, 320], radius=10, outline=(30, 90, 160), width=2)
    for i, (k, v) in enumerate(meta):
        d.text((740, 165 + i * 30), k, font=h3, fill=(90, 90, 100))
        d.text((920, 165 + i * 30), v, font=body, fill=(20, 20, 30))
    d.text((40, 230), "Buyer: Aperture Retail Group", font=body, fill=(20, 20, 30))
    d.text((40, 260), "Ship To: DC-7, 4420 Logistics Pkwy, Columbus OH", font=body, fill=(20, 20, 30))

    # Table with thumbnails + deliberately misaligned columns.
    d.rectangle([40, 360, 1200, 404], fill=(30, 90, 160))
    for x, t in [(56, "IMG"), (160, "DESCRIPTION"), (700, "QTY"), (840, "UNIT USD"), (1040, "AMOUNT")]:
        d.text((x, 370), t, font=h3, fill=(255, 255, 255))
    y = 420
    for i, (desc, qty, unit, tot) in enumerate(PO_ITEMS):
        off = [-14, 22, 6][i]  # column misalignment per row
        im.paste(_thumb(i), (52, y))
        d.text((160 + off, y + 24), desc, font=body, fill=(25, 25, 30))
        d.text((706 + off // 2, y + 24), str(qty), font=body, fill=(25, 25, 30))
        d.text((846 - off, y + 24), f"{unit:,.2f}", font=body, fill=(25, 25, 30))
        d.text((1042 + off, y + 24), f"{tot:,.2f}", font=body, fill=(25, 25, 30))
        d.line([(40, y + 88), (1200, y + 88)], fill=(210, 210, 215))
        y += 96

    # Totals (right) + barcode (left) + signature.
    d.text((840, y + 24), "Subtotal:", font=h3, fill=(90, 90, 100))
    d.text((1042, y + 24), "9,600.00", font=body, fill=(20, 20, 30))
    d.text((840, y + 58), "Tax 8.25%:", font=h3, fill=(90, 90, 100))
    d.text((1042, y + 58), "792.00", font=body, fill=(20, 20, 30))
    d.rectangle([820, y + 92, 1200, y + 134], fill=(240, 244, 250))
    d.text((840, y + 100), "TOTAL:", font=h2, fill=(30, 90, 160))
    d.text((1042, y + 100), "10,392.00 USD", font=h2, fill=(30, 90, 160))
    bx = 40
    g = random.Random(5)  # locally seeded so the barcode is reproducible
    for _ in range(60):
        wbar = g.choice((2, 2, 3, 5))
        d.rectangle([bx, y + 40, bx + wbar, y + 110], fill=0)
        bx += wbar + g.choice((2, 3))
    d.text((40, y + 116), "*PO77RX3309*", font=small, fill=(60, 60, 60))
    sig = signature()
    im.paste(sig, (760, H - 220), sig)
    d.line([(740, H - 160), (1010, H - 160)], fill=(60, 60, 60), width=2)
    d.text((740, H - 150), "Authorized — K. Lindqvist, Procurement", font=small, fill=(60, 60, 60))

    # Green APPROVED stamp overlapping the table.
    st = stamp("APPROVED · OPS DESK", (20, 130, 60), 12)
    im.paste(st, (430, 560), st)

    # Mild scan noise + tiny skew.
    a = np.asarray(im).astype(np.float32) + np.random.default_rng(3).normal(0, 5, (H, W, 3))
    im = Image.fromarray(np.clip(a, 0, 255).astype(np.uint8)).rotate(-0.7, expand=False, fillcolor=(252, 252, 250))
    im.save(OUT / "extreme_po_collage.png")

    # Sidecar reference text, built from the same strings drawn above.
    txt = ["PURCHASE ORDER", "Nordic Fixture Works AB", "Industrigatan 14, Malmo SE",
           "PO Number: PO-77RX-3309", "Order Date: 2026-05-21", "Delivery Date: 2026-06-15",
           "Payment Terms: Net 45", "Currency: USD",
           "Buyer: Aperture Retail Group", "Ship To: DC-7, 4420 Logistics Pkwy, Columbus OH",
           "IMG DESCRIPTION QTY UNIT USD AMOUNT"] + [
        f"{desc} {q} {u:,.2f} {t:,.2f}" for desc, q, u, t in PO_ITEMS] + [
        "Subtotal: 9,600.00", "Tax 8.25%: 792.00", "TOTAL: 10,392.00 USD",
        "*PO77RX3309*", "APPROVED · OPS DESK", "Authorized — K. Lindqvist, Procurement"]
    (OUT / "extreme_po_collage.txt").write_text("\n".join(txt) + "\n", encoding="utf-8")
    (OUT / "extreme_po_collage.gt.json").write_text(json.dumps(PO_GT, indent=2), encoding="utf-8")
301
+
302
+
303
+ # ── 3. extreme_contract_fax ───────────────────────────────────────────────────
304
+ C_GT = {
305
+ "doc_type": "contract",
306
+ "contract_number": "MSA-2026-0481",
307
+ "title": "Master Services Agreement - Store Fit-Out Program",
308
+ "party_a": "Aperture Retail Group",
309
+ "party_b": "Halcyon Build Partners LLC",
310
+ "effective_date": "2026-03-01",
311
+ "expiration_date": "2029-02-28",
312
+ "contract_value": 1250000.00,
313
+ "currency": "USD",
314
+ "governing_law": "State of Ohio",
315
+ "auto_renew": False,
316
+ "termination_notice_days": 60,
317
+ "_meta": {"doc_type": "contract", "channel": "fax", "difficulty": "extreme", "skip_eval": True},
318
+ }
319
+ C_HEAD = [
320
+ "MASTER SERVICES AGREEMENT - STORE FIT-OUT PROGRAM",
321
+ "Contract No: MSA-2026-0481",
322
+ "Party A: Aperture Retail Group Party B: Halcyon Build Partners LLC",
323
+ "Effective Date: 2026-03-01 Expiration Date: 2029-02-28",
324
+ "Total Contract Value: USD 1,250,000.00 Governing Law: State of Ohio",
325
+ "Auto-Renewal: NO Termination Notice: 60 days written notice",
326
+ ]
327
+ C_BODY = [
328
+ "1. SCOPE. Contractor shall furnish all labor, materials, supervision and",
329
+ "equipment required for the fit-out of retail premises identified in each",
330
+ "Statement of Work executed under this Agreement.",
331
+ "2. TERM. This Agreement commences on the Effective Date and continues",
332
+ "until the Expiration Date unless terminated earlier per Section 9.",
333
+ "3. COMPENSATION. Client shall pay Contractor fees not to exceed the",
334
+ "Total Contract Value, payable per approved milestone invoices Net 30.",
335
+ "4. CHANGE ORDERS. No variation is binding unless documented in a",
336
+ "written change order signed by both parties' authorized representatives.",
337
+ "5. WARRANTIES. Contractor warrants workmanship free of defects for",
338
+ "twenty-four (24) months following practical completion of each site.",
339
+ "6. INSURANCE. Contractor shall maintain commercial general liability",
340
+ "coverage of not less than USD 5,000,000 per occurrence.",
341
+ "7. CONFIDENTIALITY. Each party shall protect Confidential Information",
342
+ "with no less than reasonable care and use it solely for this Agreement.",
343
+ "8. LIABILITY. Neither party is liable for indirect or consequential",
344
+ "damages; aggregate liability is capped at the Total Contract Value.",
345
+ "9. TERMINATION. Either party may terminate for convenience upon sixty",
346
+ "(60) days written notice, or immediately for uncured material breach.",
347
+ "10. GOVERNING LAW. This Agreement is governed by the laws of the",
348
+ "State of Ohio, excluding its conflict of law provisions.",
349
+ ]
350
+
351
+
352
def gen_contract():
    """Render the faxed-contract sample and write its three files.

    Draws a fax header strip, the ``C_HEAD`` title and summary lines, a
    two-column body (``C_BODY`` followed by filler legalese in each column),
    a two-party signature block, a blue margin note and a red RECEIVED stamp;
    converts to greyscale and applies fax-style degradation; then writes
    ``extreme_contract_fax.png``, ``.txt`` (``C_HEAD`` + ``C_BODY``) and
    ``.gt.json`` (``C_GT``) into ``OUT``.
    """
    W, H = 1240, 1600
    page = Image.new("RGB", (W, H), (255, 255, 255))
    pen = ImageDraw.Draw(page)
    title_font, head_font, small_font = font(26, True), font(17), font(14)

    # Fax transmission strip and centred title.
    pen.text((30, 18), "FAX TX 06/12/2026 14:22 FROM HALCYON BUILD +1 614 555 0188 P.01/07",
             font=small_font, fill=(60, 60, 60))
    pen.line([(30, 44), (1210, 44)], fill=(60, 60, 60), width=2)
    title = C_HEAD[0]
    title_w = pen.textlength(title, font=title_font)
    pen.text(((W - title_w) / 2, 70), title, font=title_font, fill=(15, 15, 15))

    # Summary lines under the title, then a rule.
    y = 130
    for summary in C_HEAD[1:]:
        pen.text((80, y), summary, font=head_font, fill=(20, 20, 20))
        y += 30
    pen.line([(60, y + 8), (1180, y + 8)], fill=(120, 120, 120), width=2)

    # Dense two-column body: clauses first, then filler legalese to densify.
    filler = 'WHEREAS the parties acknowledge the recitals set forth herein;'
    split = (len(C_BODY) + 1) // 2
    columns = (C_BODY[:split], C_BODY[split:])
    for col_idx, clauses in enumerate(columns):
        col_x = 70 + col_idx * 590
        line_y = y + 34
        for clause in clauses:
            pen.text((col_x, line_y), clause, font=small_font, fill=(25, 25, 25))
            line_y += 24
        for extra in range(14):
            cut = 58 - (extra % 3) * 4
            pen.text((col_x, line_y), filler[:cut], font=small_font, fill=(45, 45, 45))
            line_y += 24

    # Signature block: one scribble, rule, name and role per party.
    sig_y = H - 300
    signers = [("M. Okafor — Aperture Retail Group", "Chief Procurement Officer"),
               ("D. Reyes — Halcyon Build Partners LLC", "Managing Partner")]
    for col_idx, (name, role) in enumerate(signers):
        sig_x = 90 + col_idx * 600
        scribble = signature(color=(20, 20, 20))
        page.paste(scribble, (sig_x, sig_y), scribble)
        pen.line([(sig_x, sig_y + 70), (sig_x + 420, sig_y + 70)], fill=(40, 40, 40), width=2)
        pen.text((sig_x, sig_y + 80), name, font=small_font, fill=(30, 30, 30))
        pen.text((sig_x, sig_y + 102), role, font=small_font, fill=(90, 90, 90))

    # Handwritten blue margin note + red stamp.
    note = Image.new("RGBA", (430, 60), (0, 0, 0, 0))
    ImageDraw.Draw(note).text((0, 8), "legal OK -> route to CFO (June 5)", font=font(24), fill=(28, 40, 160, 220))
    note = note.rotate(-3, expand=True, resample=Image.BICUBIC)
    page.paste(note, (700, 360), note)
    received = stamp("RECEIVED JUN 05 2026", (180, 30, 30), -14, size=(420, 110), fsz=30)
    page.paste(received, (90, 430), received)

    # Fax degradation: low contrast, salt & pepper, scanline streaks, skew, punch holes.
    grey = np.asarray(page.convert("L")).astype(np.float32)
    grey = 255 - (255 - grey) * 0.62  # washed-out toner
    noise = np.random.default_rng(11)
    grey += noise.normal(0, 9, grey.shape)
    speckle = noise.random(grey.shape)
    grey[speckle < 0.004] = 30  # pepper
    grey[speckle > 0.997] = 245  # salt
    for streak_y in range(0, H, 90):  # scanline streaks
        grey[streak_y:streak_y + 2, :] = np.clip(grey[streak_y:streak_y + 2, :] * 1.25, 0, 255)
    fax = Image.fromarray(np.clip(grey, 0, 255).astype(np.uint8)).rotate(1.3, expand=False, fillcolor=235)
    hole_pen = ImageDraw.Draw(fax)
    for hole_y in (H // 4, 3 * H // 4):  # punch-hole shadows
        hole_pen.ellipse([18, hole_y - 22, 62, hole_y + 22], fill=246, outline=140, width=3)

    fax.convert("RGB").save(OUT / "extreme_contract_fax.png")
    (OUT / "extreme_contract_fax.txt").write_text("\n".join(C_HEAD + C_BODY) + "\n")
    (OUT / "extreme_contract_fax.gt.json").write_text(json.dumps(C_GT, indent=2))
413
+
414
+
415
if __name__ == "__main__":
    # Make sure the dataset directory exists, then build the three documents
    # in a fixed order (they share the module-level ``rng``).
    OUT.mkdir(parents=True, exist_ok=True)
    for generate in (gen_receipt, gen_po, gen_contract):
        generate()
    sample_ids = ("extreme_receipt_photo", "extreme_po_collage", "extreme_contract_fax")
    for sid in sample_ids:
        print(f" wrote {OUT / sid}.png (+ .gt.json + .txt)")
scripts/ocr_quality.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run the OCR output-quality + document-analysis benchmark across all available
3
+ backends (OpenBMB MiniCPM-V, Cohere Aya-Vision, Tesseract, sidecar) and PUBLISH the
4
+ results.
5
+
6
+ python scripts/ocr_quality.py
7
+
8
+ Writes:
9
+ backend/evals/ocr_quality_report.json (committed, tracked)
10
+ <writable>/metrics_snapshots/ocr_quality_<ts>.json (published snapshot)
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import sys
16
+ import time
17
+ from pathlib import Path
18
+
19
+ ROOT = Path(__file__).resolve().parent.parent
20
+ sys.path.insert(0, str(ROOT / "backend"))
21
+
22
+ from app.config import get_settings # noqa: E402
23
+ from app.db import Database # noqa: E402
24
+ from app.metrics import MetricsStore # noqa: E402
25
+ from app.ocr.backends import build_ocr_registry # noqa: E402
26
+ from app.ocr.quality import run_ocr_quality # noqa: E402
27
+ from app.providers import build_registry # noqa: E402
28
+ from app.rag_store import VectorStore # noqa: E402
29
+ from app.router import ModelRouter # noqa: E402
30
+
31
+ REPORT = ROOT / "backend" / "evals" / "ocr_quality_report.json"
32
+
33
+
34
def main() -> None:
    """Run the OCR quality benchmark, persist the report and print a table.

    Builds the metrics store, model router, OCR registry, database and vector
    store from the settings, calls ``run_ocr_quality``, writes the result to
    ``REPORT`` and to a timestamped file under
    ``<writable_dir>/metrics_snapshots``, then prints one row per backend.
    """

    def pct(value):
        # Ratio -> "12.3%"; missing metrics are shown as "n/a".
        if value is None:
            return "n/a"
        return f"{value*100:.1f}%"

    settings = get_settings()
    metrics = MetricsStore(settings.metrics_db_path)
    router = ModelRouter(build_registry(settings), settings, metrics)
    ocr = build_ocr_registry(settings)
    db = Database(settings.app_db_path)
    rag = VectorStore(settings.rag_db_path)

    report = run_ocr_quality(settings, ocr, router, metrics, db=db, rag_store=rag)
    payload = json.dumps(report, indent=2)
    REPORT.write_text(payload)
    snap_dir = settings.writable_dir / "metrics_snapshots"
    snap_dir.mkdir(parents=True, exist_ok=True)
    snapshot = snap_dir / f"ocr_quality_{time.strftime('%Y%m%dT%H%M%S')}.json"
    snapshot.write_text(payload)

    heavy_rule = "=" * 90
    light_rule = "-" * 90
    print("\n" + heavy_rule)
    print(" OCR OUTPUT QUALITY + DOCUMENT ANALYSIS (smaller CER/WER = better; higher field-acc = better)")
    print(heavy_rule)
    print(f" {'backend':<11}{'model':<17}{'params':>7}{'CER':>8}{'WER':>8}{'field-exact':>13}{'F1':>8}{'lat(ms)':>9}{'$/doc':>9}")
    print(light_rule)
    for row in report["backends"]:
        if row.get("params_b"):
            params = f"{row['params_b']}B"
        else:
            params = "—"
        model = (row.get('model') or '')[:16]
        latency = row['avg_latency_ms'] or 0
        cost = row['avg_cost_usd'] or 0
        print(f" {row['backend']:<11}{model:<17}{params:>7}"
              f"{pct(row['cer']):>8}{pct(row['wer']):>8}{pct(row['field_exact_match']):>13}"
              f"{pct(row['field_f1']):>8}{latency:>9.0f}{cost:>9.5f}")
    print(light_rule)
    print(f" best OCR text quality : {report['best_ocr_quality']}")
    print(f" best document analysis : {report['best_document_analysis']}")
    print(f" published → {REPORT}")
    print(heavy_rule + "\n")
64
+
65
+
66
+ if __name__ == "__main__":
67
+ main()
scripts/ocr_smoke.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run every available OCR backend against real scanned samples and write a
3
+ tracked report (backend/evals/ocr_backend_report.json).
4
+
5
+ python scripts/ocr_smoke.py
6
+
7
+ Reads backend/.env, so configured backends (e.g. MiniCPM) are exercised live.
8
+ Unavailable backends (missing deps/keys) are recorded with the reason.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ ROOT = Path(__file__).resolve().parent.parent
17
+ sys.path.insert(0, str(ROOT / "backend"))
18
+
19
+ from app.config import get_settings # noqa: E402
20
+ from app.ocr.backends import build_ocr_registry # noqa: E402
21
+ from app.ocr.backends.healthcheck import run_ocr_backend_tests # noqa: E402
22
+
23
+ REPORT_PATH = ROOT / "backend" / "evals" / "ocr_backend_report.json"
24
+
25
+
26
def main() -> None:
    """Exercise every OCR backend, save the report and print a summary table.

    Calls ``run_ocr_backend_tests`` with the settings and OCR registry, writes
    the result to ``REPORT_PATH`` as JSON, and prints one row per backend:
    available backends show their first test case, unavailable ones show the
    ``requires`` value reported for them.
    """
    settings = get_settings()
    registry = build_ocr_registry(settings)
    report = run_ocr_backend_tests(settings, registry)
    REPORT_PATH.write_text(json.dumps(report, indent=2))

    heavy_rule = "=" * 78
    light_rule = "-" * 78
    print("\n" + heavy_rule)
    print(f" OCR BACKEND REAL-EXTRACTION REPORT (mode={report['mode']})")
    print(heavy_rule)
    print(f" {'backend':<12}{'tier':<8}{'available':<11}{'functional':<11}{'engine / reason'}")
    print(light_rule)
    for backend in report["backends"]:
        if not backend["available"]:
            detail = backend["requires"]
            func = "—"
        else:
            cases = backend["cases"]
            case = cases[0] if cases else {}
            detail = f"{case.get('engine','')} ({case.get('chars',0)} chars, {case.get('latency_ms',0)}ms)"
            func = "✓ yes" if backend["ok"] else "✗ no"
        available = 'yes' if backend['available'] else 'no'
        print(f" {backend['name']:<12}{backend['tier']:<8}{available:<11}{func:<11}{detail[:42]}")
    print(light_rule)
    print(f" available : {report['available_backends']}")
    print(f" functional: {report['functional_backends']}")
    print(f" report → {REPORT_PATH}")
    print(heavy_rule + "\n")
51
+
52
+
53
+ if __name__ == "__main__":
54
+ main()
scripts/run_dev.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the Aperture backend (FastAPI) and frontend (Vite) together.
#
# Every step that can fail (sample generation, npm install) runs BEFORE any
# server is started, and the cleanup trap also fires on EXIT, so an early
# `set -e` abort can no longer leave a background server running.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

BACK=""
FRONT=""

# Stop whichever servers were started. Clears the traps first so it runs once.
cleanup() {
  trap - INT TERM EXIT
  echo
  echo "stopping…"
  for pid in $BACK $FRONT; do
    kill "$pid" 2>/dev/null || true
  done
}
trap cleanup INT TERM EXIT

echo "▶ Aperture dev launcher"

# 1) ensure samples exist
if [ ! -f backend/evals/datasets/invoice_acme_digital.gt.json ]; then
  echo " · generating sample corpus…"
  python3 scripts/generate_samples.py >/dev/null
fi

# 2) frontend deps (installed up front so a failure aborts before anything runs)
if [ ! -d frontend/node_modules ]; then
  echo " · installing frontend deps (first run)…"
  ( cd frontend && npm install --silent )
fi

# 3) backend — `exec` makes $! the server's PID rather than a wrapper subshell's
echo " · starting backend on :8000"
( cd backend && exec uvicorn app.main:app --port 8000 --reload ) &
BACK=$!

# 4) frontend
echo " · starting frontend on :5173"
( cd frontend && exec npm run dev ) &
FRONT=$!

echo
echo " backend: http://localhost:8000/docs"
echo " frontend: http://localhost:5173"
echo " (Ctrl-C to stop)"
wait
scripts/test_ocr.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Quick OCR backend tester.
3
+
4
+ python scripts/test_ocr.py <sample_id_or_path> [--backend auto|minicpm|cohere|llamaparse|tesseract|easyocr|sidecar]
5
+
6
+ Examples:
7
+ python scripts/test_ocr.py invoice_scanned_basic
8
+ python scripts/test_ocr.py invoice_scanned_basic --backend minicpm
9
+ python scripts/test_ocr.py /path/to/receipt.png --backend cohere
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ ROOT = Path(__file__).resolve().parent.parent
18
+ sys.path.insert(0, str(ROOT / "backend"))
19
+
20
+ from app.config import get_settings # noqa: E402
21
+ from app.ocr.backends import build_ocr_registry # noqa: E402
22
+
23
+
24
def resolve(arg: str, settings) -> Path | None:
    """Turn a CLI argument into the path of a document file.

    ``arg`` is first tried as a filesystem path; if it does not name a regular
    file it is treated as a sample id and looked up in
    ``settings.evals_dataset_dir`` with the extensions ``.pdf``, ``.png``,
    ``.jpg`` and ``.jpeg`` (in that order).

    Only regular files are accepted: a directory that happens to share a name
    with a sample id, or an empty argument (which ``Path`` turns into ``.``),
    is no longer returned as if it were a document.

    Args:
        arg: Path or sample id given on the command line.
        settings: Object exposing ``evals_dataset_dir`` (a ``Path``).

    Returns:
        The resolved file path, or ``None`` when nothing matches.
    """
    if not arg:
        return None
    direct = Path(arg)
    if direct.is_file():
        return direct
    for ext in (".pdf", ".png", ".jpg", ".jpeg"):
        candidate = settings.evals_dataset_dir / f"{arg}{ext}"
        if candidate.is_file():
            return candidate
    return None
33
+
34
+
35
def main() -> None:
    """Run one document through an OCR backend and print the outcome.

    Parses ``doc`` (a sample id or path) and ``--backend`` (default ``auto``),
    resolves the document, and exits with status 1 if it cannot be found.
    Otherwise prints the available backends, the attempts made, a one-line
    summary of the result and the first 1200 characters of extracted text.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("doc")
    parser.add_argument("--backend", default="auto")
    args = parser.parse_args()

    settings = get_settings()
    path = resolve(args.doc, settings)
    if path is None:
        print(f"not found: {args.doc}")
        sys.exit(1)

    registry = build_ocr_registry(settings)
    print(f"available backends: {registry.available_names()}")
    result, attempts = registry.extract(path, args.backend)
    print(f"\nattempts: {attempts}")
    summary = (f"\nengine={result.engine} tier={result.tier} pages={result.pages} "
               f"conf={result.confidence} chars={len(result.text)} simulated={result.simulated}")
    print(summary)
    print("\n--- text (first 1200 chars) ---")
    print(result.text[:1200])
55
+
56
+ if __name__ == "__main__":
57
+ main()