| """Sentence-aware chunker. Packs sentences until chunk_size is reached.""" |
| from __future__ import annotations |
|
|
| import re |
| from typing import List |
|
|
| from ..core.types import Chunk, Document |
|
|
|
|
| _SENT = re.compile(r"(?<=[\.\!\?…])\s+|\n+") |
|
|
|
|
def _sentence_spans(text: str) -> List[tuple[str, int, int]]:
    """Return ``(sentence, start, end)`` for every non-blank sentence in *text*.

    ``sentence`` is the stripped sentence and ``text[start:end] == sentence``,
    so the offsets always refer to positions in the original string.
    """
    # Separator matches are never empty (both alternatives need >= 1 char),
    # so the pieces between them are exactly what ``_SENT.split`` would return.
    boundaries = [(m.start(), m.end()) for m in _SENT.finditer(text)]
    boundaries.append((len(text), len(text)))  # sentinel closing the last piece

    spans: List[tuple[str, int, int]] = []
    pos = 0
    for sep_start, sep_end in boundaries:
        raw = text[pos:sep_start]
        sentence = raw.strip()
        if sentence:
            # Skip whitespace that ``strip`` removed from the front.
            start = pos + len(raw) - len(raw.lstrip())
            spans.append((sentence, start, start + len(sentence)))
        pos = sep_end
    return spans


def sentence_chunker(doc: Document, chunk_size: int = 600) -> List[Chunk]:
    """Split *doc* into chunks made of whole sentences.

    Sentences are packed greedily: a sentence joins the current chunk as long
    as the space-joined text stays within ``chunk_size`` characters; otherwise
    it starts a new chunk. A single sentence longer than ``chunk_size`` is
    never split and is emitted as its own oversized chunk.

    Args:
        doc: Document to chunk; ``doc.id``, ``doc.text`` and ``doc.metadata``
            are read.
        chunk_size: Maximum length, in characters, of a chunk's joined text.

    Returns:
        Chunks in document order with ids ``"{doc.id}::{n}"``. Each chunk's
        ``text`` is its sentences joined by single spaces, and ``start`` /
        ``end`` are the offsets in ``doc.text`` of the first sentence's start
        and the last sentence's end. Because whitespace between sentences is
        normalised in ``text``, ``end - start`` may differ from ``len(text)``.
        If the document holds no sentences (empty or whitespace only), a
        single chunk covering the whole text is returned. Every chunk gets
        its own shallow copy of ``doc.metadata``.
    """
    text = doc.text
    spans = _sentence_spans(text)
    if not spans:
        return [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0",
                      text=text, start=0, end=len(text),
                      metadata=dict(doc.metadata))]

    # Greedily group sentence spans. ``joined_len`` counts one separator per
    # buffered sentence, so ``joined_len + len(next)`` is exactly the length
    # the joined chunk text would have with the next sentence appended.
    groups: List[List[tuple[str, int, int]]] = []
    joined_len = 0
    for span in spans:
        sentence_len = len(span[0])
        if groups and joined_len + sentence_len <= chunk_size:
            groups[-1].append(span)
        else:
            groups.append([span])
            joined_len = 0
        joined_len += sentence_len + 1

    return [
        Chunk(
            doc_id=doc.id,
            chunk_id=f"{doc.id}::{idx}",
            text=" ".join(sentence for sentence, _, _ in group),
            # Real offsets into doc.text, not a running count of joined text.
            start=group[0][1],
            end=group[-1][2],
            metadata=dict(doc.metadata),
        )
        for idx, group in enumerate(groups)
    ]
|
|