"""
Universal Parser — routes file bytes to the correct handler based on
category + file_type provided by the user via the UI.
"""

import re

from file_handlers.unstructured_handlers import UNSTRUCTURED_PARSERS
from file_handlers.semi_structured_handlers import SEMI_STRUCTURED_PARSERS
from file_handlers.structured_handlers import STRUCTURED_PARSERS

# Master registry: file_type → parser function.
# The three handler registries are merged left to right, so on a key
# collision the entry from the later registry wins.
_ALL_PARSERS = {
    **UNSTRUCTURED_PARSERS,
    **SEMI_STRUCTURED_PARSERS,
    **STRUCTURED_PARSERS,
}

# Category → allowed file types (matches frontend UI options)
CATEGORY_FILE_TYPES = {
    "unstructured": [
        {"ext": "txt", "label": "Plain Text (.txt)"},
        {"ext": "pdf", "label": "PDF Document (.pdf)"},
        {"ext": "docx", "label": "Word Document (.docx)"},
        {"ext": "rtf", "label": "Rich Text Format (.rtf)"},
        {"ext": "odt", "label": "OpenDocument Text (.odt)"},
        {"ext": "md", "label": "Markdown (.md)"},
        {"ext": "log", "label": "Log File (.log)"},
        {"ext": "eml", "label": "Email (.eml)"},
        {"ext": "epub", "label": "eBook (.epub)"},
        {"ext": "pptx", "label": "PowerPoint (.pptx)"},
        # Video formats — processed via async job queue, not universal_parser
        {"ext": "mp4", "label": "🎬 MP4 Video (.mp4)", "async": True},
        {"ext": "mkv", "label": "🎬 MKV Video (.mkv)", "async": True},
        {"ext": "avi", "label": "🎬 AVI Video (.avi)", "async": True},
        {"ext": "mov", "label": "🎬 MOV Video (.mov)", "async": True},
        {"ext": "webm", "label": "🎬 WebM Video (.webm)", "async": True},
    ],
    "semi_structured": [
        {"ext": "json", "label": "JSON (.json)"},
        {"ext": "xml", "label": "XML (.xml)"},
        {"ext": "yaml", "label": "YAML (.yaml / .yml)"},
        {"ext": "html", "label": "HTML (.html)"},
        {"ext": "csv", "label": "CSV (.csv)"},
        {"ext": "tsv", "label": "TSV (.tsv)"},
        {"ext": "ini", "label": "INI Config (.ini)"},
        {"ext": "toml", "label": "TOML (.toml)"},
        {"ext": "config", "label": "Config File (.config / .cfg)"},
        {"ext": "edifact", "label": "EDIFACT (.edi)"},
    ],
    "structured": [
        {"ext": "sqlite", "label": "SQLite Database (.sqlite / .db)"},
        {"ext": "sql", "label": "SQL Script (.sql)"},
        {"ext": "parquet", "label": "Parquet (.parquet)"},
        {"ext": "avro", "label": "Avro (.avro)"},
        {"ext": "xlsx", "label": "Excel 2007+ (.xlsx)"},
        {"ext": "xls", "label": "Excel Legacy (.xls)"},
        {"ext": "orc", "label": "ORC (.orc)"},
        {"ext": "hdf5", "label": "HDF5 (.hdf5 / .h5)"},
        {"ext": "feather", "label": "Feather / Arrow (.feather)"},
        {"ext": "dta", "label": "Stata Dataset (.dta)"},
    ],
}

# Zero-width characters, the BOM, and non-printable control characters.
# \t (0x09), \n (0x0a) and \r (0x0d) are deliberately left out of the class.
_INVISIBLE_CHARS_RE = re.compile(
    r'[\u200b\u200c\u200d\uFEFF\x00-\x08\x0b\x0c\x0e-\x1f\x7f]'
)

# Runs of spaces and/or tabs (newlines are not touched).
_HORIZONTAL_WS_RE = re.compile(r'[ \t]+')

# A newline sitting between a lowercase letter and any ASCII letter.
# Look-around is used so the neighbouring letters are not consumed: that
# lets back-to-back one-letter lines ("is\na\ntest") all be joined, which a
# consuming pattern such as ([a-z])\n([A-Za-z]) would only do for every
# other newline.
_SOFT_LINE_BREAK_RE = re.compile(r'(?<=[a-z])\n(?=[A-Za-z])')


def normalize_text(text: str) -> str:
    """
    Clean extracted text to improve NLP processing accuracy.

    Steps, applied in order:
      1. Remove zero-width characters and invisible control characters
         (tab, newline and carriage return are kept).
      2. Collapse each run of spaces/tabs into a single space.
      3. Repair line breaks that split a sentence (typical of PDF layout
         extraction): a newline that directly follows a lowercase letter
         and is directly followed by a letter is replaced by a space.
      4. Strip leading and trailing whitespace.

    Args:
        text: Raw extracted text. Any falsy value (e.g. "" or None)
            yields an empty string.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""

    # 1. Remove zero-width characters and non-printable control chars
    #    (excluding \n, \t, \r).
    text = _INVISIBLE_CHARS_RE.sub('', text)

    # 2. Collapse multiple spaces/tabs into a single space.
    text = _HORIZONTAL_WS_RE.sub(' ', text)

    # 3. Join lines broken mid-sentence. Only the newline itself is replaced;
    #    a newline preceded by punctuation, a digit, an uppercase letter or
    #    whitespace is kept as a real line break.
    text = _SOFT_LINE_BREAK_RE.sub(' ', text)

    return text.strip()


def parse_file(file_bytes: bytes, file_type: str) -> str:
    """
    Parse uploaded file bytes into plain text.

    The parser is looked up in the master registry by the lower-cased
    file type with any leading dots removed. When no parser is registered
    for the type, the bytes are decoded as UTF-8 on a best-effort basis
    (undecodable bytes become U+FFFD). The resulting text is always passed
    through normalize_text().

    Args:
        file_bytes: Raw bytes from the uploaded file.
        file_type: Extension key e.g. 'docx', 'parquet', 'yaml'.

    Returns:
        Extracted plain text string (may be long; caller truncates if
        needed). If there is no registered parser and the fallback decode
        itself fails, a "[No parser for file type: ...]" marker string is
        returned instead (and is not normalized).
    """
    file_type = file_type.lower().lstrip(".")
    parser_fn = _ALL_PARSERS.get(file_type)

    if parser_fn is None:
        # Best-effort: try UTF-8 text decode.
        # With errors="replace" real bytes never fail to decode, so this
        # handler only fires when file_bytes is not bytes-like (e.g. None).
        try:
            raw_text = file_bytes.decode("utf-8", errors="replace")
        except Exception:
            return f"[No parser for file type: {file_type}]"
    else:
        raw_text = parser_fn(file_bytes)

    return normalize_text(raw_text)


def get_all_categories() -> dict:
    """
    Return category → file type options for the frontend UI.

    Note: this returns the module-level CATEGORY_FILE_TYPES object itself,
    not a copy, so callers should treat the result as read-only.
    """
    return CATEGORY_FILE_TYPES