"""
Semi-Structured File Handlers
Extracts plain text from: json, xml, yaml, html, csv, tsv, ini, toml, config, edifact
"""
import io
import json as json_lib


def parse_json(file_bytes: bytes) -> str:
    """JSON — flatten every scalar value into a ``[path]: value`` line.

    The payload is decoded as UTF-8 with undecodable bytes replaced. A leading
    byte order mark is stripped first, because ``json.loads`` rejects text
    that starts with a BOM and such files would otherwise never be parsed.

    Args:
        file_bytes: Raw content of the JSON file.

    Returns:
        The flattened lines joined with newlines, or the decoded text
        unchanged when the content cannot be parsed as JSON.
    """
    # "utf-8-sig" decodes like "utf-8" but drops a leading BOM if present.
    text = file_bytes.decode("utf-8-sig", errors="replace")
    try:
        data = json_lib.loads(text)
        lines = []
        _flatten_json(data, lines)
        return "\n".join(lines)
    except Exception:
        # Best effort: unparseable content is still handed back as plain text.
        return text


def _flatten_json(obj, lines: list, path: str = "", depth: int = 0):
    """Recursively append ``[path]: value`` lines for every scalar in *obj*.

    Dict keys are joined with ``.`` and list positions are written as
    ``[index]``, so each emitted value carries the path that leads to it.
    ``None`` and values whose string form is blank are skipped.

    Args:
        obj: Parsed document (dict, list or scalar) to walk.
        lines: Output list; it is extended in place.
        path: Path accumulated so far (empty for the root).
        depth: Current nesting level; anything nested deeper than 10 levels
            is silently ignored.
    """
    if depth > 10:
        return

    if isinstance(obj, dict):
        for key, value in obj.items():
            child_path = f"{path}.{key}" if path else str(key)
            _flatten_json(value, lines, child_path, depth + 1)
        return

    if isinstance(obj, list):
        for index, element in enumerate(obj):
            _flatten_json(element, lines, f"{path}[{index}]", depth + 1)
        return

    if obj is None or not str(obj).strip():
        return

    # Context injection: bind the key path directly to the value.
    lines.append(f"[{path}]: {obj}")


def parse_xml(file_bytes: bytes) -> str:
    """XML — extract all text nodes.

    BeautifulSoup is tried with the ``lxml-xml`` parser first and with the
    ``lxml`` parser second. If both attempts fail for any reason (including
    BeautifulSoup being unavailable), the bytes are returned decoded as UTF-8
    with undecodable bytes replaced.

    Args:
        file_bytes: Raw content of the XML file.

    Returns:
        Text nodes separated by newlines, or the decoded raw content.
    """
    for parser_name in ("lxml-xml", "lxml"):
        try:
            from bs4 import BeautifulSoup
            tree = BeautifulSoup(file_bytes, parser_name)
            return tree.get_text(separator="\n")
        except Exception:
            # Move on to the next parser, or to the raw-text fallback.
            continue
    return file_bytes.decode("utf-8", errors="replace")


def parse_yaml(file_bytes: bytes) -> str:
    """YAML — flatten all scalar values.

    The content is loaded with ``yaml.safe_load`` and walked by
    ``_flatten_json``. Any failure along the way returns the bytes decoded as
    UTF-8 (undecodable bytes replaced) instead.

    Args:
        file_bytes: Raw content of the YAML file.

    Returns:
        Flattened lines joined with newlines, or the decoded raw content.
    """
    raw_text = file_bytes.decode("utf-8", errors="replace")
    try:
        import yaml
        document = yaml.safe_load(raw_text)
        flattened = []
        _flatten_json(document, flattened)
    except Exception:
        return raw_text
    return "\n".join(flattened)


def parse_html(file_bytes: bytes) -> str:
    """HTML — extract visible text.

    With BeautifulSoup (``lxml`` parser) the ``script``, ``style``, ``meta``
    and ``link`` elements are removed and the remaining text is returned,
    newline-separated. If that path fails for any reason, the bytes are
    decoded as UTF-8 and every ``<...>`` tag is replaced by a single space.

    Args:
        file_bytes: Raw content of the HTML file.

    Returns:
        Extracted text.
    """
    try:
        from bs4 import BeautifulSoup
        document = BeautifulSoup(file_bytes, "lxml")
        # Drop elements whose content is not rendered as page text.
        non_visible = document(["script", "style", "meta", "link"])
        for element in non_visible:
            element.decompose()
        return document.get_text(separator="\n")
    except Exception:
        import re
        markup = file_bytes.decode("utf-8", errors="replace")
        tag_pattern = re.compile(r"<[^>]+>")
        return tag_pattern.sub(" ", markup)


def parse_csv(file_bytes: bytes) -> str:
    """CSV — use structured context injection.

    The bytes are read with ``pandas.read_csv`` and the resulting frame is
    rendered by ``_df_to_text`` from ``file_handlers.structured_handlers``.
    If importing, parsing or rendering fails, the bytes are returned decoded
    as UTF-8 with undecodable bytes replaced.

    Args:
        file_bytes: Raw content of the CSV file.

    Returns:
        The rendered frame, or the decoded raw content on failure.
    """
    # NOTE(review): read_csv infers column dtypes by default, so values such
    # as identifiers with leading zeros may be rewritten — confirm whether
    # that is acceptable for the text produced here.
    try:
        import pandas as pd
        from file_handlers.structured_handlers import _df_to_text

        frame = pd.read_csv(io.BytesIO(file_bytes))
        rendered = _df_to_text(frame)
    except Exception:
        rendered = file_bytes.decode("utf-8", errors="replace")
    return rendered


def parse_tsv(file_bytes: bytes) -> str:
    """TSV — use structured context injection.

    Same as the CSV handler but with a tab separator: the bytes are read with
    ``pandas.read_csv(sep="\\t")`` and rendered by ``_df_to_text`` from
    ``file_handlers.structured_handlers``. Any failure returns the bytes
    decoded as UTF-8 with undecodable bytes replaced.

    Args:
        file_bytes: Raw content of the TSV file.

    Returns:
        The rendered frame, or the decoded raw content on failure.
    """
    # NOTE(review): read_csv infers column dtypes by default, so values such
    # as identifiers with leading zeros may be rewritten — confirm whether
    # that is acceptable for the text produced here.
    try:
        import pandas as pd
        from file_handlers.structured_handlers import _df_to_text

        frame = pd.read_csv(io.BytesIO(file_bytes), sep="\t")
        rendered = _df_to_text(frame)
    except Exception:
        rendered = file_bytes.decode("utf-8", errors="replace")
    return rendered


def parse_ini(file_bytes: bytes) -> str:
    """INI/Config — extract all key-value pairs as text.

    Each section is emitted as a ``[section]`` line followed by one
    ``key = value`` line per option. Values are reported verbatim:
    interpolation is disabled so that a literal ``%`` in a value (for example
    in a password or URL) does not abort the parse.

    Args:
        file_bytes: Raw content of the INI file.

    Returns:
        The section/option lines joined with newlines. When the file has no
        sections, or cannot be parsed, the decoded text is returned instead.
    """
    import configparser

    # "utf-8-sig" strips a leading BOM, which would otherwise hide the first
    # section header from configparser.
    text = file_bytes.decode("utf-8-sig", errors="replace")
    try:
        config = configparser.ConfigParser(interpolation=None)
        config.read_string(text)
        lines = []
        for section in config.sections():
            lines.append(f"[{section}]")
            lines.extend(f"{key} = {val}" for key, val in config.items(section))
        return "\n".join(lines) if lines else text
    except Exception:
        # Best effort: malformed files are still returned as plain text.
        return text


def parse_toml(file_bytes: bytes) -> str:
    """TOML — flatten all values to text.

    The decoded content is parsed with ``tomllib`` on Python 3.11 and newer,
    or with ``tomli`` on older interpreters, and then walked by
    ``_flatten_json``. Any failure returns the bytes decoded as UTF-8 with
    undecodable bytes replaced.

    Args:
        file_bytes: Raw content of the TOML file.

    Returns:
        Flattened lines joined with newlines, or the decoded raw content.
    """
    try:
        import sys
        toml_source = file_bytes.decode("utf-8", errors="replace")
        if sys.version_info < (3, 11):
            import tomli as toml_reader
        else:
            import tomllib as toml_reader
        flattened = []
        _flatten_json(toml_reader.loads(toml_source), flattened)
        return "\n".join(flattened)
    except Exception:
        return file_bytes.decode("utf-8", errors="replace")


def parse_config(file_bytes: bytes) -> str:
    """Config files — treat as ini first, fallback to plain text.

    The bytes are passed to ``parse_ini``. Its result is returned when it
    begins with ``[``; otherwise the bytes are returned decoded as UTF-8 with
    undecodable bytes replaced.

    Args:
        file_bytes: Raw content of the configuration file.

    Returns:
        The ``parse_ini`` output or the decoded raw content.
    """
    ini_text = parse_ini(file_bytes)
    looks_like_ini = ini_text.startswith("[")
    return ini_text if looks_like_ini else file_bytes.decode("utf-8", errors="replace")


def parse_edifact(file_bytes: bytes) -> str:
    """EDIFACT — try pydifact, fallback to raw text extraction.

    With pydifact, each segment becomes one line made of its tag followed by
    the string form of each element, separated by spaces. If that path fails
    for any reason, the decoded text is returned with every ``'`` replaced by
    a newline and every ``+`` replaced by a space.

    Args:
        file_bytes: Raw content of the EDIFACT file.

    Returns:
        One line per segment, or the loosely reformatted raw text.
    """
    try:
        from pydifact.segmentcollection import RawSegmentCollection
        raw = file_bytes.decode("utf-8", errors="replace")
        rendered = []
        for seg in RawSegmentCollection.from_str(raw):
            # Tag first, then every element converted to its string form.
            fields = [seg.tag]
            fields.extend(str(item) for item in (seg.elements or []))
            rendered.append(" ".join(fields))
        return "\n".join(rendered)
    except Exception:
        # Fallback: raw text — EDI has PII in fields like NAD segments.
        raw = file_bytes.decode("utf-8", errors="replace")
        # Segment terminators become newlines and element separators become
        # spaces, for readability.
        return raw.replace("'", "\n").replace("+", " ")


# Dispatch table: extension string -> parser function taking the file's bytes
# and returning extracted text. Aliases of one format are grouped so that they
# always share the same handler.
SEMI_STRUCTURED_PARSERS = {
    extension: handler
    for handler, extensions in (
        (parse_json, ("json",)),
        (parse_xml, ("xml",)),
        (parse_yaml, ("yaml", "yml")),
        (parse_html, ("html", "htm")),
        (parse_csv, ("csv",)),
        (parse_tsv, ("tsv",)),
        (parse_ini, ("ini",)),
        (parse_toml, ("toml",)),
        (parse_config, ("config", "cfg", "conf")),
        (parse_edifact, ("edifact", "edi")),
    )
    for extension in extensions
}