"""Sentence-aware chunker. Packs sentences until chunk_size is reached.""" from __future__ import annotations import re from typing import List from ..core.types import Chunk, Document _SENT = re.compile(r"(?<=[\.\!\?…])\s+|\n+") def sentence_chunker(doc: Document, chunk_size: int = 600) -> List[Chunk]: sents = [s.strip() for s in _SENT.split(doc.text) if s.strip()] if not sents: return [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0", text=doc.text, start=0, end=len(doc.text), metadata=dict(doc.metadata))] chunks: List[Chunk] = [] buf: List[str] = [] buf_len = 0 cursor = 0 idx = 0 def flush(): nonlocal buf, buf_len, idx, cursor if not buf: return text = " ".join(buf) end = cursor + len(text) chunks.append(Chunk( doc_id=doc.id, chunk_id=f"{doc.id}::{idx}", text=text, start=cursor, end=end, metadata=dict(doc.metadata), )) cursor = end + 1 idx += 1 buf = [] buf_len = 0 for s in sents: if buf_len + len(s) > chunk_size and buf: flush() buf.append(s) buf_len += len(s) + 1 flush() return chunks or [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0", text=doc.text, start=0, end=len(doc.text), metadata=dict(doc.metadata))]