""" Semi-Structured File Handlers Extracts plain text from: json, xml, yaml, html, csv, tsv, ini, toml, config, edifact """ import io import json as json_lib def parse_json(file_bytes: bytes) -> str: """JSON — flatten all string values into lines.""" try: data = json_lib.loads(file_bytes.decode("utf-8", errors="replace")) lines = [] _flatten_json(data, lines) return "\n".join(lines) except Exception as e: return file_bytes.decode("utf-8", errors="replace") def _flatten_json(obj, lines: list, path: str = "", depth: int = 0): """Recursively extract all key-value pairs from JSON with context injection.""" if depth > 10: return if isinstance(obj, dict): for k, v in obj.items(): new_path = f"{path}.{k}" if path else str(k) _flatten_json(v, lines, new_path, depth + 1) elif isinstance(obj, list): for i, item in enumerate(obj): new_path = f"{path}[{i}]" _flatten_json(item, lines, new_path, depth + 1) elif obj is not None and str(obj).strip(): # Context Injection: Bind the JSON key path directly to the value lines.append(f"[{path}]: {obj}") def parse_xml(file_bytes: bytes) -> str: """XML — extract all text nodes.""" try: from bs4 import BeautifulSoup soup = BeautifulSoup(file_bytes, "lxml-xml") return soup.get_text(separator="\n") except Exception: try: from bs4 import BeautifulSoup soup = BeautifulSoup(file_bytes, "lxml") return soup.get_text(separator="\n") except Exception as e: return file_bytes.decode("utf-8", errors="replace") def parse_yaml(file_bytes: bytes) -> str: """YAML — flatten all scalar values.""" try: import yaml data = yaml.safe_load(file_bytes.decode("utf-8", errors="replace")) lines = [] _flatten_json(data, lines) return "\n".join(lines) except Exception: return file_bytes.decode("utf-8", errors="replace") def parse_html(file_bytes: bytes) -> str: """HTML — extract visible text.""" try: from bs4 import BeautifulSoup soup = BeautifulSoup(file_bytes, "lxml") # Remove script and style elements for tag in soup(["script", "style", "meta", "link"]): tag.decompose() return soup.get_text(separator="\n") except Exception as e: import re text = file_bytes.decode("utf-8", errors="replace") return re.sub(r"<[^>]+>", " ", text) def parse_csv(file_bytes: bytes) -> str: """CSV — use structured context injection.""" try: import pandas as pd from file_handlers.structured_handlers import _df_to_text df = pd.read_csv(io.BytesIO(file_bytes)) return _df_to_text(df) except Exception as e: return file_bytes.decode("utf-8", errors="replace") def parse_tsv(file_bytes: bytes) -> str: """TSV — use structured context injection.""" try: import pandas as pd from file_handlers.structured_handlers import _df_to_text df = pd.read_csv(io.BytesIO(file_bytes), sep="\t") return _df_to_text(df) except Exception as e: return file_bytes.decode("utf-8", errors="replace") def parse_ini(file_bytes: bytes) -> str: """INI/Config — extract all key-value pairs as text.""" try: import configparser config = configparser.ConfigParser() text = file_bytes.decode("utf-8", errors="replace") config.read_string(text) lines = [] for section in config.sections(): lines.append(f"[{section}]") for key, val in config.items(section): lines.append(f"{key} = {val}") return "\n".join(lines) if lines else text except Exception: return file_bytes.decode("utf-8", errors="replace") def parse_toml(file_bytes: bytes) -> str: """TOML — flatten all values to text.""" try: import sys text = file_bytes.decode("utf-8", errors="replace") if sys.version_info >= (3, 11): import tomllib data = tomllib.loads(text) else: import tomli data = tomli.loads(text) lines = [] _flatten_json(data, lines) return "\n".join(lines) except Exception: return file_bytes.decode("utf-8", errors="replace") def parse_config(file_bytes: bytes) -> str: """Config files — treat as ini first, fallback to plain text.""" result = parse_ini(file_bytes) if result.startswith("["): return result return file_bytes.decode("utf-8", errors="replace") def parse_edifact(file_bytes: bytes) -> str: """EDIFACT — try pydifact, fallback to raw text extraction.""" try: from pydifact.segmentcollection import RawSegmentCollection text = file_bytes.decode("utf-8", errors="replace") collection = RawSegmentCollection.from_str(text) lines = [] for segment in collection: # Each segment: tag + elements parts = [segment.tag] + [ str(e) for e in (segment.elements or []) ] lines.append(" ".join(parts)) return "\n".join(lines) except Exception: # Fallback: raw text — EDI has PII in fields like NAD segments text = file_bytes.decode("utf-8", errors="replace") import re # Replace segment terminators with newlines for readability text = re.sub(r"'", "\n", text) text = re.sub(r"\+", " ", text) return text SEMI_STRUCTURED_PARSERS = { "json": parse_json, "xml": parse_xml, "yaml": parse_yaml, "yml": parse_yaml, "html": parse_html, "htm": parse_html, "csv": parse_csv, "tsv": parse_tsv, "ini": parse_ini, "toml": parse_toml, "config": parse_config, "cfg": parse_config, "conf": parse_config, "edifact": parse_edifact, "edi": parse_edifact, }