File size: 1,449 Bytes
3be54c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""Sentence-aware chunker. Packs sentences until chunk_size is reached."""
from __future__ import annotations

import re
from typing import List

from ..core.types import Chunk, Document


# Sentence boundary used for splitting: either a run of whitespace that
# directly follows ".", "!", "?" or "…", or any run of newlines (so a line
# break splits the text even when no terminal punctuation precedes it).
_SENT = re.compile(r"(?<=[\.\!\?…])\s+|\n+")


def sentence_chunker(doc: Document, chunk_size: int = 600) -> List[Chunk]:
    """Pack consecutive sentences of ``doc.text`` into chunks.

    The text is split on ``_SENT`` boundaries; each piece is stripped and
    empty pieces are dropped. Sentences are then appended greedily to the
    current chunk, and a new chunk is started as soon as adding the next
    sentence would make the space-joined chunk text longer than
    ``chunk_size`` characters.

    Args:
        doc: Source document. Its ``id``, ``text`` and ``metadata``
            attributes are read.
        chunk_size: Upper bound, in characters, for the joined text of a
            chunk. A single sentence longer than this is still emitted as
            a chunk of its own (sentences are never cut), so with a
            non-positive value every sentence becomes its own chunk.

    Returns:
        A non-empty list of chunks with ids ``"<doc.id>::<n>"`` numbered
        from 0. ``text`` is the chunk's sentences joined by single spaces.
        ``start``/``end`` are the offsets in ``doc.text`` of the first
        character of the chunk's first sentence and one past the last
        character of its last sentence; because ``text`` normalises the
        separators, ``end - start`` may differ from ``len(text)``. When the
        document contains no sentence at all (empty or whitespace-only
        text), a single chunk covering the whole of ``doc.text`` is
        returned. Each chunk receives its own shallow copy of
        ``doc.metadata``.
    """
    text = doc.text

    # Locate every sentence together with its true position in ``text``.
    # Walking the separator matches yields the same pieces as
    # ``_SENT.split(text)`` (the pattern has no groups and never matches an
    # empty string) while keeping the offsets that ``split`` discards.
    separators = [(m.start(), m.end()) for m in _SENT.finditer(text)]
    separators.append((len(text), len(text)))  # sentinel: flush the tail piece
    spans = []  # (start, end, stripped sentence)
    pos = 0
    for sep_start, sep_end in separators:
        piece = text[pos:sep_start]
        sentence = piece.strip()
        if sentence:
            begin = pos + len(piece) - len(piece.lstrip())
            spans.append((begin, begin + len(sentence), sentence))
        pos = sep_end

    if not spans:
        return [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0",
                      text=text, start=0, end=len(text),
                      metadata=dict(doc.metadata))]

    # Greedy packing. ``current_len`` counts one joining space per buffered
    # sentence, so ``current_len + len(sentence)`` is exactly the length the
    # joined chunk text would have after adding ``sentence``.
    groups = []
    current = []
    current_len = 0
    for span in spans:
        sentence_len = len(span[2])
        if current and current_len + sentence_len > chunk_size:
            groups.append(current)
            current = []
            current_len = 0
        current.append(span)
        current_len += sentence_len + 1
    groups.append(current)  # never empty: ``spans`` has at least one item

    return [
        Chunk(
            doc_id=doc.id,
            chunk_id=f"{doc.id}::{idx}",
            text=" ".join(sentence for _, _, sentence in group),
            start=group[0][0],
            end=group[-1][1],
            metadata=dict(doc.metadata),
        )
        for idx, group in enumerate(groups)
    ]