Spaces:

Legal-i
/

legal-eye

Running

File size: 1,449 Bytes

3be54c6

"""Sentence-aware chunker. Packs sentences until chunk_size is reached."""
from __future__ import annotations

import re
from typing import List

from ..core.types import Chunk, Document


_SENT = re.compile(r"(?<=[\.\!\?…])\s+|\n+")


def sentence_chunker(doc: Document, chunk_size: int = 600) -> List[Chunk]:
    """Pack the sentences of ``doc.text`` into chunks of about ``chunk_size`` characters.

    The text is split on ``_SENT``; each piece is stripped and blank pieces
    are dropped. Sentences are then packed greedily: a chunk is closed as
    soon as adding the next sentence would push its running length (sentence
    lengths plus one separator each) past ``chunk_size``. A single sentence
    longer than ``chunk_size`` is never split; it becomes a chunk on its own.

    Args:
        doc: Document to split. Only ``id``, ``text`` and ``metadata`` are read.
        chunk_size: Soft upper bound on the chunk length, in characters.

    Returns:
        Chunks in document order with ids ``"{doc.id}::{n}"``. ``text`` is the
        chunk's sentences joined by single spaces. ``start`` / ``end`` are the
        offsets in ``doc.text`` of the first character of the first sentence
        and one past the last character of the last sentence, so
        ``end - start`` can differ from ``len(text)`` when the original
        separators were not single spaces. Each chunk gets its own shallow
        copy of ``doc.metadata``. If the text holds no non-blank sentence, a
        single chunk covering the whole of ``doc.text`` is returned.
    """
    source = doc.text

    # Walk the separators to recover every sentence together with its real
    # position in the source text. The pieces are the same ones
    # ``_SENT.split`` would produce, since the pattern cannot match empty.
    boundaries = [(m.start(), m.end()) for m in _SENT.finditer(source)]
    boundaries.append((len(source), len(source)))  # closes the final piece

    spans = []  # (sentence, start, end) with offsets into ``source``
    piece_start = 0
    for sep_start, sep_end in boundaries:
        piece = source[piece_start:sep_start]
        sentence = piece.strip()
        if sentence:
            # Skip whatever leading whitespace ``strip`` removed.
            begin = piece_start + len(piece) - len(piece.lstrip())
            spans.append((sentence, begin, begin + len(sentence)))
        piece_start = sep_end

    if not spans:
        return [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0",
                      text=source, start=0, end=len(source),
                      metadata=dict(doc.metadata))]

    # Greedy packing: start a new group when the next sentence would overflow.
    groups = []
    current = []
    current_len = 0
    for span in spans:
        if current and current_len + len(span[0]) > chunk_size:
            groups.append(current)
            current = []
            current_len = 0
        current.append(span)
        current_len += len(span[0]) + 1  # +1 for the joining space
    groups.append(current)  # never empty: ``spans`` has at least one entry

    return [
        Chunk(
            doc_id=doc.id,
            chunk_id=f"{doc.id}::{idx}",
            text=" ".join(sentence for sentence, _, _ in group),
            start=group[0][1],
            end=group[-1][2],
            metadata=dict(doc.metadata),
        )
        for idx, group in enumerate(groups)
    ]