""" Hugging Face Spaces deployment profile. PRIVATE — this module is intentionally NOT part of the public repository whitelist (sync_to_clean.sh). It lives only in the GitLab repo and is copied into the Space deployment by scripts/publish_hf_space.ps1. It is loaded at runtime via the generic ``POLYSCRIPTOR_PROFILE=hf_space_demo`` environment variable (see ``htr_engine_base.load_runtime_profile``) and constrains the app to the hosted CPU demo: - CRNN-CTC, TrOCR, Kraken, Commercial APIs, OpenWebUI engines. - Four public ``achimrabus/*`` CRNN-CTC presets (auto-downloaded from HF). - CPU-only inference (no GPU on Spaces). It implements the optional runtime-profile hooks consumed by the core modules: discover_engines(registry) -> bool register_pylaia_models(models) -> bool segmentation_overrides(method, device) -> (method, device) run_segmentation(img_data, method, device) -> Optional[dict] """ from types import SimpleNamespace from typing import Optional import numpy as np # Public CRNN-CTC presets, downloaded from Hugging Face model repos on demand # (handled generically by PyLaiaEngine when a preset carries a ``repo_id``). 
_DEMO_MODELS = {
    "Ukrainian (HF, 4.76% CER)": {
        "repo_id": "achimrabus/crnn-ctc-ukrainian",
        "checkpoint": "best_model.pt",
        "syms": "symbols.txt",
        "description": "Public Hugging Face CRNN-CTC model for Ukrainian HTR",
    },
    "Prosta Mova (HF, 3.77% CER)": {
        "repo_id": "achimrabus/crnn-ctc-prosta-mova",
        "checkpoint": "best_model.pt",
        "syms": "symbols.txt",
        "description": "Public Hugging Face CRNN-CTC model for Prosta Mova HTR",
    },
    "Church Slavonic (HF, 2.89% CER)": {
        "repo_id": "achimrabus/crnn-ctc-church-slavonic",
        "checkpoint": "best_model.pt",
        "syms": "symbols.txt",
        "description": "Public Hugging Face CRNN-CTC model for Church Slavonic HTR",
    },
    "Glagolitic (HF, 5.33% CER)": {
        "repo_id": "achimrabus/crnn-ctc-glagolitic",
        "checkpoint": "best_model.pt",
        "syms": "symbols.txt",
        "description": "Public Hugging Face CRNN-CTC model for Glagolitic HTR",
    },
}

# Engines offered by the demo, as (dotted class path, label used in warnings).
_DEMO_ENGINES = (
    ("engines.pylaia_engine.PyLaiaEngine", "CRNN-CTC"),
    ("engines.trocr_engine.TrOCREngine", "TrOCR"),
    ("engines.kraken_engine.KrakenEngine", "Kraken"),
    ("engines.commercial_api_engine.CommercialAPIEngine", "Commercial APIs"),
    ("engines.openwebui_engine.OpenWebUIEngine", "OpenWebUI"),
)

# Tuning constants of the horizontal-projection (HPP) line segmenter.
_INK_PERCENTILE = 42          # grey-level percentile used as the ink threshold
_INK_THRESHOLD_RANGE = (90, 220)  # clamp for the ink threshold
_SMOOTH_WINDOW = 9            # rows in the moving-average window
_ACTIVE_FRACTION = 0.13       # a row is "text" above this share of the peak
_MIN_ACTIVE_THRESHOLD = 0.01  # floor for the activity threshold
_MIN_BAND_HEIGHT = 10         # absolute minimum band height in rows
_MIN_BAND_HEIGHT_RATIO = 0.008  # minimum band height relative to image height
_MAX_LINES = 100              # only the first bands (top to bottom) are kept
_PAD_Y_RATIO = 0.25           # vertical padding relative to band height
_MIN_PAD_Y = 3                # minimum vertical padding in rows
_PAD_X = 8                    # horizontal padding in columns


def discover_engines(registry) -> bool:
    """Register engines available in the HF Space demo.

    Every engine class listed in ``_DEMO_ENGINES`` is imported, instantiated
    and passed to ``registry.register``. Loading is best-effort: an engine
    that fails to import, instantiate or register is reported with a printed
    warning and skipped, so one broken engine does not take the others down.

    Args:
        registry: Object exposing ``register(engine)``.

    Returns:
        Always True, to claim discovery.
    """
    import importlib

    for cls_path, label in _DEMO_ENGINES:
        module_path, cls_name = cls_path.rsplit(".", 1)
        try:
            mod = importlib.import_module(module_path)
            registry.register(getattr(mod, cls_name)())
        except Exception as e:  # deliberate: demo must start with partial engines
            print(f"Warning: Could not load {label} engine: {e}")
    return True


def register_pylaia_models(models: dict) -> bool:
    """Replace the preset registry with the public demo models.

    ``models`` is emptied and refilled in place. Each preset is handed out as
    its own copy, so later in-place edits of a registered preset cannot leak
    into the module-level ``_DEMO_MODELS`` table (and from there into
    subsequent calls).

    Returns True so the core skips scanning the local ``models/`` directory
    (the Space ships no local checkpoints, but this keeps the list exact even
    when the profile is exercised on a machine that does).
    """
    models.clear()
    models.update({name: dict(preset) for name, preset in _DEMO_MODELS.items()})
    return True


def segmentation_overrides(method: str, device: str):
    """Force CPU — HF Space has no GPU.

    Args:
        method: Requested segmentation method; passed through unchanged.
        device: Requested device; ignored.

    Returns:
        The tuple ``(method, "cpu")``.
    """
    return method, "cpu"


def _find_active_bands(profile, active_threshold, min_height):
    """Return inclusive ``(start, end)`` row spans where *profile* is active.

    A row is active when its value is strictly above *active_threshold*.
    Runs shorter than *min_height* rows are dropped. A run that is still open
    at the last row is closed there.
    """
    bands = []
    start = None
    for y, value in enumerate(profile):
        if value > active_threshold:
            if start is None:
                start = y
        elif start is not None:
            # The run ended on the previous row; the current row is inactive
            # and must not be counted, even when it is the last row.
            if y - start >= min_height:
                bands.append((start, y - 1))
            start = None
    if start is not None and len(profile) - start >= min_height:
        bands.append((start, len(profile) - 1))
    return bands


def _crop_line(pil_image, ink, y1, y2):
    """Build one line record for the band spanning rows *y1*..*y2* (inclusive).

    The band is padded vertically, then trimmed horizontally to the columns
    that contain ink (plus a small margin). When the padded band holds no ink
    at all, the full image width is used.
    """
    pad_y = max(_MIN_PAD_Y, int((y2 - y1 + 1) * _PAD_Y_RATIO))
    top = max(0, y1 - pad_y)
    bottom = min(pil_image.height, y2 + pad_y + 1)

    cols = np.where(ink[top:bottom, :].any(axis=0))[0]
    if cols.size:
        left = max(0, int(cols[0]) - _PAD_X)
        # +1 because the right edge of a crop box is exclusive.
        right = min(pil_image.width, int(cols[-1]) + _PAD_X + 1)
    else:
        left, right = 0, pil_image.width

    bbox = (left, top, right, bottom)
    return SimpleNamespace(image=pil_image.crop(bbox), bbox=bbox, coords=None)


def _segment_lines(pil_image):
    """Split *pil_image* into text-line records using a row-ink projection."""
    gray = np.array(pil_image.convert("L"))
    if gray.size == 0:
        return []

    low, high = _INK_THRESHOLD_RANGE
    threshold = min(high, max(low, float(np.percentile(gray, _INK_PERCENTILE))))
    ink = gray < threshold

    # Share of ink pixels per row, smoothed with a moving average so that
    # small gaps inside a text line do not split it.
    row_density = ink.mean(axis=1)
    kernel = np.ones(_SMOOTH_WINDOW, dtype=np.float32) / float(_SMOOTH_WINDOW)
    # NOTE: with mode="same" the result has max(len(row_density), window)
    # entries, so it can be longer than the image for very short images; the
    # minimum band height (>= 10 rows) rules out bands in that case.
    smooth = np.convolve(row_density, kernel, mode="same")

    active_threshold = max(_MIN_ACTIVE_THRESHOLD, float(smooth.max()) * _ACTIVE_FRACTION)
    min_height = max(_MIN_BAND_HEIGHT, int(pil_image.height * _MIN_BAND_HEIGHT_RATIO))
    bands = _find_active_bands(smooth, active_threshold, min_height)

    return [_crop_line(pil_image, ink, y1, y2) for y1, y2 in bands[:_MAX_LINES]]


def run_segmentation(img_data: dict, method: str, device: str) -> Optional[dict]:
    """Dependency-light horizontal-projection line segmenter for the CPU demo.

    Only HPP without a PAGE XML is handled here; for anything else the
    function returns None to let the core handle it.

    Args:
        img_data: Per-image state. ``img_data["pil_image"]`` is read, and
            ``img_data.get("xml_path")`` decides whether this hook applies.
            On success the keys ``"lines"``, ``"line_regions"``,
            ``"seg_source"`` and ``"seg_regions"`` are written in place.
        method: Requested segmentation method; only ``"hpp"`` is handled.
        device: Unused; the segmenter runs on NumPy only.

    Returns:
        A serialisable dict with ``"num_lines"``, ``"bboxes"`` (one
        ``[left, top, right, bottom]`` list per line) and ``"source"``, or
        None when the request is not handled here.
    """
    if method != "hpp" or img_data.get("xml_path") is not None:
        return None

    lines = _segment_lines(img_data["pil_image"])

    img_data["lines"] = lines
    img_data["line_regions"] = [0] * len(lines)
    img_data["seg_source"] = "hpp"
    img_data["seg_regions"] = []
    return {
        "num_lines": len(lines),
        "bboxes": [list(line.bbox) for line in lines],
        "source": "hpp",
    }