"""
Universal Parser — routes file bytes to the correct handler
based on category + file_type provided by the user via the UI.
"""
from file_handlers.unstructured_handlers import UNSTRUCTURED_PARSERS
from file_handlers.semi_structured_handlers import SEMI_STRUCTURED_PARSERS
from file_handlers.structured_handlers import STRUCTURED_PARSERS

# Master registry: file_type → parser function.
# The three handler registries are unpacked in order, so if the same key
# appears in more than one of them, the entry from the registry listed last
# (structured) replaces the earlier ones.
_ALL_PARSERS = {
    **UNSTRUCTURED_PARSERS,
    **SEMI_STRUCTURED_PARSERS,
    **STRUCTURED_PARSERS,
}

# Category → allowed file types (matches frontend UI options).
# Each entry is a dict with:
#   "ext"   — short type key (presumably the value passed to parse_file as
#             file_type; verify against the caller)
#   "label" — human-readable text for the UI
#   "async" — present (True) only on the video entries below
# NOTE(review): several labels advertise alternate extensions (.yml, .cfg,
# .edi, .db, .h5) but only one "ext" key is listed per entry here; whether the
# alternates are registered in the parser registries cannot be told from this
# table — confirm in the handler modules.
CATEGORY_FILE_TYPES = {
    "unstructured": [
        {"ext": "txt",  "label": "Plain Text (.txt)"},
        {"ext": "pdf",  "label": "PDF Document (.pdf)"},
        {"ext": "docx", "label": "Word Document (.docx)"},
        {"ext": "rtf",  "label": "Rich Text Format (.rtf)"},
        {"ext": "odt",  "label": "OpenDocument Text (.odt)"},
        {"ext": "md",   "label": "Markdown (.md)"},
        {"ext": "log",  "label": "Log File (.log)"},
        {"ext": "eml",  "label": "Email (.eml)"},
        {"ext": "epub", "label": "eBook (.epub)"},
        {"ext": "pptx", "label": "PowerPoint (.pptx)"},
        # Video formats — processed via async job queue, not universal_parser
        {"ext": "mp4",  "label": "🎬 MP4 Video (.mp4)",  "async": True},
        {"ext": "mkv",  "label": "🎬 MKV Video (.mkv)",  "async": True},
        {"ext": "avi",  "label": "🎬 AVI Video (.avi)",  "async": True},
        {"ext": "mov",  "label": "🎬 MOV Video (.mov)",  "async": True},
        {"ext": "webm", "label": "🎬 WebM Video (.webm)", "async": True},
    ],
    "semi_structured": [
        {"ext": "json",    "label": "JSON (.json)"},
        {"ext": "xml",     "label": "XML (.xml)"},
        {"ext": "yaml",    "label": "YAML (.yaml / .yml)"},
        {"ext": "html",    "label": "HTML (.html)"},
        {"ext": "csv",     "label": "CSV (.csv)"},
        {"ext": "tsv",     "label": "TSV (.tsv)"},
        {"ext": "ini",     "label": "INI Config (.ini)"},
        {"ext": "toml",    "label": "TOML (.toml)"},
        {"ext": "config",  "label": "Config File (.config / .cfg)"},
        {"ext": "edifact", "label": "EDIFACT (.edi)"},
    ],
    "structured": [
        {"ext": "sqlite",  "label": "SQLite Database (.sqlite / .db)"},
        {"ext": "sql",     "label": "SQL Script (.sql)"},
        {"ext": "parquet", "label": "Parquet (.parquet)"},
        {"ext": "avro",    "label": "Avro (.avro)"},
        {"ext": "xlsx",    "label": "Excel 2007+ (.xlsx)"},
        {"ext": "xls",     "label": "Excel Legacy (.xls)"},
        {"ext": "orc",     "label": "ORC (.orc)"},
        {"ext": "hdf5",    "label": "HDF5 (.hdf5 / .h5)"},
        {"ext": "feather", "label": "Feather / Arrow (.feather)"},
        {"ext": "dta",     "label": "Stata Dataset (.dta)"},
    ],
}

import re

def normalize_text(text: str) -> str:
    """
    Clean extracted text to improve NLP processing accuracy.

    Steps, applied in this order:
    - Convert Windows (CRLF) and bare CR line endings to LF.
    - Remove zero-width characters and non-printable control characters
      (tab and newline are kept).
    - Collapse runs of spaces/tabs into a single space (newlines untouched).
    - Repair lines broken mid-sentence (typical of PDF layout extraction):
      a newline directly between a lowercase ASCII letter and an ASCII
      letter is replaced by a space.
    - Strip leading/trailing whitespace from the whole result.

    Args:
        text: Raw extracted text. Any falsy value (empty string, None)
            yields an empty string.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""

    # 0. Unify line endings first. Without this a "\r" sits between the
    #    letter and the "\n", so the line-repair step below never fires on
    #    CRLF input and stray carriage returns leak into the output.
    text = re.sub(r'\r\n?', '\n', text)

    # 1. Remove zero-width characters and non-printable control chars (excluding \n, \t)
    text = re.sub(r'[\u200b\u200c\u200d\uFEFF\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    # 2. Collapse multiple spaces/tabs into a single space
    text = re.sub(r'[ \t]+', ' ', text)

    # 3. Repair broken lines: a newline preceded by a lowercase letter and
    #    followed by a letter becomes a space. Lookarounds are used so the
    #    neighbouring letters are not consumed by the match; otherwise two
    #    consecutive breaks around a one-letter line ("is\na\ntest") would
    #    only have the first break repaired.
    text = re.sub(r'(?<=[a-z])\n(?=[A-Za-z])', ' ', text)

    return text.strip()

def parse_file(file_bytes: bytes, file_type: str) -> str:
    """
    Turn the bytes of an uploaded file into normalized plain text.

    The file type is lower-cased and stripped of leading dots, then looked
    up in the master parser registry. A registered parser is called with the
    raw bytes; for an unregistered type the bytes are decoded as UTF-8 on a
    best-effort basis. In both cases the text is passed through
    normalize_text before being returned.

    Args:
        file_bytes: Raw bytes from the uploaded file.
        file_type:  Extension key e.g. 'docx', 'parquet', 'yaml'.

    Returns:
        Extracted plain text string (may be long; caller truncates if needed),
        or a "[No parser for file type: ...]" placeholder when the fallback
        decode itself fails.
    """
    ext = file_type.lower().lstrip(".")
    handler = _ALL_PARSERS.get(ext)

    # Registered type: delegate to its parser, then clean up the result.
    if handler is not None:
        return normalize_text(handler(file_bytes))

    # Unregistered type: best-effort UTF-8 decode. With errors="replace"
    # undecodable bytes are substituted rather than raising, so the except
    # path is only reached when the decode call itself cannot be made
    # (e.g. file_bytes is not a bytes-like object with .decode).
    try:
        decoded = file_bytes.decode("utf-8", errors="replace")
    except Exception:
        return f"[No parser for file type: {ext}]"

    return normalize_text(decoded)


def get_all_categories() -> dict:
    """
    Expose the category → file type option table for the frontend UI.

    Returns:
        The module-level CATEGORY_FILE_TYPES mapping itself (not a copy).
    """
    options = CATEGORY_FILE_TYPES
    return options