Spaces:

Legal-i
/

legal-eye

Running

Initial deploy: legal-eye Hebrew legal RAG (17K corpus, verbatim-from-precedent)

3be54c6 verified about 1 month ago

1.45 kB

	"""Sentence-aware chunker. Packs sentences until chunk_size is reached."""
	from __future__ import annotations

	import re
	from typing import List

	from ..core.types import Chunk, Document


	# Sentence separator: either whitespace that follows sentence-ending
	# punctuation (. ! ? …) or a run of one or more newlines. The "|" must stay
	# unescaped so it acts as alternation; escaped, it would require a literal
	# pipe character in the text and the pattern would practically never match.
	_SENT = re.compile(r"(?<=[.!?…])\s+|\n+")


	def sentence_chunker(doc: Document, chunk_size: int = 600) -> List[Chunk]:
	    """Split ``doc.text`` into sentences and pack them into chunks.

	    Sentences are the non-blank, stripped pieces of ``doc.text`` found
	    between matches of the module-level ``_SENT`` pattern. Consecutive
	    sentences are joined with a single space and packed greedily: a chunk
	    is closed as soon as appending the next sentence would make its joined
	    text longer than ``chunk_size``.

	    Args:
	        doc: Source document; its ``id``, ``text`` and ``metadata`` are read.
	        chunk_size: Maximum length, in characters, of a chunk's joined text.
	            A single sentence longer than this is still emitted whole, as
	            a chunk of its own.

	    Returns:
	        Chunks in document order with ids ``"<doc.id>::<n>"``. ``start`` and
	        ``end`` are offsets into ``doc.text`` spanning from the chunk's first
	        sentence to its last. ``text`` is the space-joined sentences, so its
	        length can differ from ``end - start`` when the original separators
	        were not a single character. Every chunk receives its own shallow
	        copy of ``doc.metadata``. If the text contains no non-blank
	        sentence, a single chunk covering the whole text is returned.
	    """
	    text = doc.text

	    # Walk the separator matches to recover each piece's position in the
	    # original text; this yields the same pieces as ``_SENT.split(text)``
	    # while keeping track of where they came from.
	    separators = [(m.start(), m.end()) for m in _SENT.finditer(text)]
	    piece_starts = [0] + [sep_end for _, sep_end in separators]
	    piece_ends = [sep_start for sep_start, _ in separators] + [len(text)]

	    sentences = []  # (stripped sentence, start offset, end offset)
	    for lo, hi in zip(piece_starts, piece_ends):
	        raw = text[lo:hi]
	        sentence = raw.strip()
	        if not sentence:
	            continue
	        # Skip the leading whitespace that strip() removed so the offsets
	        # point at the sentence itself.
	        begin = lo + len(raw) - len(raw.lstrip())
	        sentences.append((sentence, begin, begin + len(sentence)))

	    if not sentences:
	        # Empty or whitespace-only text: return it unchanged as one chunk.
	        return [Chunk(doc_id=doc.id, chunk_id=f"{doc.id}::0",
	                      text=text, start=0, end=len(text),
	                      metadata=dict(doc.metadata))]

	    groups = [[]]
	    packed = 0  # length of the current group's joined text plus one space
	    for item in sentences:
	        size = len(item[0])
	        # Never close an empty group: an oversized sentence stands alone.
	        if groups[-1] and packed + size > chunk_size:
	            groups.append([])
	            packed = 0
	        groups[-1].append(item)
	        packed += size + 1

	    return [
	        Chunk(
	            doc_id=doc.id, chunk_id=f"{doc.id}::{n}",
	            text=" ".join(sentence for sentence, _, _ in group),
	            start=group[0][1], end=group[-1][2],
	            metadata=dict(doc.metadata),
	        )
	        for n, group in enumerate(groups)
	    ]