"""Legal-Hebrew chunker. Splits on common Hebrew legal section boundaries: סעיף N, פרק N, תת-סעיף N, (א)/(ב)/(ג). Keeps each numbered unit as its own chunk and attaches the section label to metadata — so the retriever can later filter by section and the generator can cite "סעיף 5". Fallback: if no legal markers are found, falls back to sentence_chunker. """ from __future__ import annotations import re from typing import List, Tuple from ..core.types import Chunk, Document from .sentence import sentence_chunker # Match "סעיף 5", "סעיף 5א", "פרק ב", "(א)", "(1)", "תת-סעיף 3", # "תקנה 3", "פקודה 7", "חוק 12" — all common legal reference units. _SECTION_RE = re.compile( r"(?m)^\s*(?:" r"(?:סעיף|פרק|תת-סעיף|סימן|תקנה|פקודה|תקנות)\s+" r"[\u0590-\u05FF0-9\"'\.]+|" # Hebrew unit + id r"\([\u0590-\u05FF0-9]+\)|" # (א) / (1) r"\d+\.\d+\.?\s|" # 1.2 or 1.2. r"\[\d+\]" # [1] r")\s*" ) def _split_by_sections(text: str) -> List[Tuple[str, str, int]]: """Return [(label, body, start_offset), ...]. label is the section header.""" matches = list(_SECTION_RE.finditer(text)) if not matches: return [] out: List[Tuple[str, str, int]] = [] for i, m in enumerate(matches): start = m.start() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) header = m.group().strip() body = text[m.end():end].strip() if body: out.append((header, body, start)) return out def legal_hebrew_chunker( doc: Document, chunk_size: int = 800, min_section_chars: int = 8, ) -> List[Chunk]: """Primary: section-aware. Fallback: sentence_chunker.""" sections = _split_by_sections(doc.text) if not sections: return sentence_chunker(doc, chunk_size=chunk_size) chunks: List[Chunk] = [] idx = 0 for label, body, offset in sections: # If a section is very long, split it into sub-chunks at sentence boundaries if len(body) > chunk_size: # Reuse sentence_chunker on a temp Document sub_doc = Document(id=doc.id, text=body, metadata={ **doc.metadata, "section": label, }) sub_chunks = sentence_chunker(sub_doc, chunk_size=chunk_size) for sc in sub_chunks: chunks.append(Chunk( doc_id=doc.id, chunk_id=f"{doc.id}::{idx}", text=sc.text, start=offset + sc.start, end=offset + sc.end, metadata={**doc.metadata, "section": label}, )) idx += 1 else: if len(body) < min_section_chars and chunks: # tiny section — append to previous chunk prev = chunks[-1] prev.text = (prev.text + "\n" + label + " " + body).strip() prev.end = offset + len(body) continue chunks.append(Chunk( doc_id=doc.id, chunk_id=f"{doc.id}::{idx}", text=body, start=offset, end=offset + len(body), metadata={**doc.metadata, "section": label}, )) idx += 1 return chunks or sentence_chunker(doc, chunk_size=chunk_size)