legal-eye / tau_rag /chunking /sentence.py
Legal-i's picture
Initial deploy: legal-eye Hebrew legal RAG (17K corpus, verbatim-from-precedent)
3be54c6 verified
raw
history blame contribute delete
1.45 kB
"""Sentence-aware chunker. Packs sentences until chunk_size is reached."""
from __future__ import annotations
import re
from typing import List
from ..core.types import Chunk, Document
# Sentence separator: a run of whitespace that directly follows one of
# ".", "!", "?" or "…", or any run of one or more newlines.
_SENT = re.compile(r"(?<=[\.\!\?…])\s+|\n+")
def sentence_chunker(doc: Document, chunk_size: int = 600) -> List[Chunk]:
    """Split ``doc.text`` into chunks made of whole sentences.

    Sentences are found by splitting on ``_SENT``; blank pieces are dropped
    and the remaining ones are stripped. Consecutive sentences are packed
    into one chunk, joined by single spaces, until adding the next sentence
    would push the joined text past ``chunk_size`` characters.

    Args:
        doc: Source document; its ``id``, ``text`` and ``metadata`` are read.
        chunk_size: Soft upper bound on the chunk text length in characters.
            A single sentence longer than this is emitted unsplit as its
            own chunk.

    Returns:
        Chunks in document order with ids ``"<doc.id>::<n>"`` (n from 0).
        ``start``/``end`` are the offsets in ``doc.text`` of the first
        character of the chunk's first sentence and one past the last
        character of its last sentence. Because sentences are re-joined with
        a single space, ``text`` may differ from ``doc.text[start:end]``
        when the original separator was not exactly one space. If the text
        holds no sentence at all, a single chunk covering the whole text is
        returned. Every chunk gets its own shallow copy of ``doc.metadata``.
    """
    source = doc.text

    # Collect (sentence, start, end) for every non-blank piece between
    # separators. Walking the separator matches (instead of re.split) keeps
    # the true position of each sentence inside the source text.
    boundaries = [(m.start(), m.end()) for m in _SENT.finditer(source)]
    boundaries.append((len(source), len(source)))  # closes the last piece
    spans = []
    piece_start = 0
    for piece_end, next_start in boundaries:
        raw = source[piece_start:piece_end]
        sentence = raw.strip()
        if sentence:
            # Skip whatever leading whitespace strip() removed.
            begin = piece_start + (len(raw) - len(raw.lstrip()))
            spans.append((sentence, begin, begin + len(sentence)))
        piece_start = next_start

    if not spans:
        # Empty or whitespace-only text: hand back the document as one chunk.
        return [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0",
                      text=source, start=0, end=len(source),
                      metadata=dict(doc.metadata))]

    chunks: List[Chunk] = []
    pending = []      # spans waiting to be emitted as the next chunk
    pending_len = 0   # sum of pending sentence lengths, +1 each for the join

    def flush() -> None:
        """Emit the pending sentences as one chunk and reset the buffer."""
        nonlocal pending, pending_len
        if not pending:
            return
        chunks.append(Chunk(
            doc_id=doc.id,
            chunk_id=f"{doc.id}::{len(chunks)}",
            text=" ".join(sentence for sentence, _, _ in pending),
            start=pending[0][1],
            end=pending[-1][2],
            metadata=dict(doc.metadata),
        ))
        pending = []
        pending_len = 0

    for span in spans:
        sentence_len = len(span[0])
        # Close the current chunk first if this sentence would overflow it;
        # an empty buffer always accepts the sentence, however long.
        if pending and pending_len + sentence_len > chunk_size:
            flush()
        pending.append(span)
        pending_len += sentence_len + 1
    flush()
    return chunks