| """Legal-Hebrew chunker. |
| |
| Splits on common Hebrew legal section boundaries: סעיף N, פרק N, תת-סעיף N, |
| (א)/(ב)/(ג). Keeps each numbered unit as its own chunk and attaches the |
| section label to metadata — so the retriever can later filter by section |
| and the generator can cite "סעיף 5". |
| |
| Fallback: if no legal markers are found, falls back to sentence_chunker. |
| """ |
| from __future__ import annotations |
|
|
| import re |
| from typing import List, Tuple |
|
|
| from ..core.types import Chunk, Document |
| from .sentence import sentence_chunker |
|
|
|
|
| |
| |
| _SECTION_RE = re.compile( |
| r"(?m)^\s*(?:" |
| r"(?:סעיף|פרק|תת-סעיף|סימן|תקנה|פקודה|תקנות)\s+" |
| r"[\u0590-\u05FF0-9\"'\.]+|" |
| r"\([\u0590-\u05FF0-9]+\)|" |
| r"\d+\.\d+\.?\s|" |
| r"\[\d+\]" |
| r")\s*" |
| ) |
|
|
|
|
| def _split_by_sections(text: str) -> List[Tuple[str, str, int]]: |
| """Return [(label, body, start_offset), ...]. label is the section header.""" |
| matches = list(_SECTION_RE.finditer(text)) |
| if not matches: |
| return [] |
| out: List[Tuple[str, str, int]] = [] |
| for i, m in enumerate(matches): |
| start = m.start() |
| end = matches[i + 1].start() if i + 1 < len(matches) else len(text) |
| header = m.group().strip() |
| body = text[m.end():end].strip() |
| if body: |
| out.append((header, body, start)) |
| return out |
|
|
|
|
def legal_hebrew_chunker(
    doc: Document,
    chunk_size: int = 800,
    min_section_chars: int = 8,
) -> List[Chunk]:
    """Chunk *doc* along its legal section markers.

    The document text is split with ``_split_by_sections``. Each section
    becomes one chunk whose metadata is ``doc.metadata`` plus a ``"section"``
    key holding the section label. Two exceptions apply:

    * A section longer than ``chunk_size`` characters is handed to
      ``sentence_chunker`` and every resulting piece becomes its own chunk
      carrying the same section label.
    * A section shorter than ``min_section_chars`` characters is appended
      (label and body) to the text of the previous chunk instead of becoming
      a chunk of its own, unless there is no previous chunk.

    Text that appears before the first section marker is kept as a leading
    chunk whose ``"section"`` label is the empty string, so no document text
    is lost.

    Args:
        doc: Document to chunk; ``doc.id``, ``doc.text`` and ``doc.metadata``
            are read.
        chunk_size: Maximum section length before sentence-level splitting;
            also forwarded to ``sentence_chunker``.
        min_section_chars: Sections shorter than this are merged into the
            preceding chunk.

    Returns:
        The chunks in document order, with ids of the form
        ``"{doc.id}::{position}"``. If no sections are found, or no chunk is
        produced, the result of ``sentence_chunker`` on the whole document.
    """
    text = doc.text
    sections = _split_by_sections(text)
    if not sections:
        return sentence_chunker(doc, chunk_size=chunk_size)

    # Keep whatever precedes the first marker (title, preamble, ...) instead
    # of silently dropping it. The marker is located directly with the regex
    # so this does not rely on the offsets reported for the sections.
    first_marker = _SECTION_RE.search(text)
    if first_marker is not None:
        lead = text[:first_marker.start()]
        preamble = lead.strip()
        if preamble:
            preamble_start = len(lead) - len(lead.lstrip())
            sections = [("", preamble, preamble_start), *sections]

    chunks: List[Chunk] = []
    for label, body, offset in sections:
        # `offset` is the section offset reported by _split_by_sections (or
        # computed above for the preamble); start/end are derived from it.
        if len(body) > chunk_size:
            # Oversized section: split by sentences, keep the section label.
            sub_doc = Document(id=doc.id, text=body, metadata={
                **doc.metadata, "section": label,
            })
            # NOTE(review): assumes sentence_chunker reports start/end
            # relative to sub_doc.text; confirm against its implementation.
            for piece in sentence_chunker(sub_doc, chunk_size=chunk_size):
                chunks.append(Chunk(
                    doc_id=doc.id,
                    # One id per appended chunk, so the running index is
                    # simply the current list length.
                    chunk_id=f"{doc.id}::{len(chunks)}",
                    text=piece.text,
                    start=offset + piece.start,
                    end=offset + piece.end,
                    metadata={**doc.metadata, "section": label},
                ))
            continue

        if len(body) < min_section_chars and chunks:
            # Tiny section: fold it into the previous chunk (mutated in
            # place). The label is kept inline in the text because the
            # previous chunk's metadata still names its own section.
            prev = chunks[-1]
            prev.text = (prev.text + "\n" + label + " " + body).strip()
            prev.end = offset + len(body)
            continue

        chunks.append(Chunk(
            doc_id=doc.id,
            chunk_id=f"{doc.id}::{len(chunks)}",
            text=body,
            start=offset,
            end=offset + len(body),
            metadata={**doc.metadata, "section": label},
        ))

    # Defensive fallback in case the sentence chunker produced nothing.
    return chunks or sentence_chunker(doc, chunk_size=chunk_size)
|
|