Coverage for app \ processing \ chunker.py: 100%
15 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-24 13:18 +0530
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-24 13:18 +0530
1from typing import List
3from langchain_text_splitters import RecursiveCharacterTextSplitter
5from app.config import settings
6from app.utils.logger import get_logger
8logger = get_logger(__name__)
def simple_chunk(
    text: str,
    chunk_size: int | None = None,
    overlap: int | None = None,
) -> List[str]:
    """
    Split text into overlapping chunks suitable for embeddings and NER.

    A new splitter is built on every call, so no splitter state is shared
    between calls.

    Args:
        text: The text to split. Empty or non-string values are logged at
            warning level and produce an empty list.
        chunk_size: Passed to the splitter as ``chunk_size``. ``None`` (or
            ``0``, which is not a usable size) falls back to
            ``settings.CHUNK_SIZE``.
        overlap: Passed to the splitter as ``chunk_overlap``. ``None`` falls
            back to ``settings.CHUNK_OVERLAP``; an explicit ``0`` is honoured
            and requests chunks without overlap.

    Returns:
        The list returned by ``RecursiveCharacterTextSplitter.split_text``,
        or ``[]`` when ``text`` is empty or not a string.
    """
    if not text or not isinstance(text, str):
        logger.warning("Chunking skipped: invalid or empty text")
        return []

    # A zero chunk size is meaningless, so any falsy value uses the default.
    size = chunk_size or settings.CHUNK_SIZE
    # Zero overlap is a legitimate request; only None means "use the default".
    # (`overlap or default` would silently discard an explicit 0.)
    ovlp = overlap if overlap is not None else settings.CHUNK_OVERLAP

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=ovlp,
        # Tried in order: paragraph, line, sentence, word, then character.
        separators=["\n\n", "\n", ".", " ", ""],
    )

    chunks = splitter.split_text(text)

    logger.debug(
        "Text chunked",
        extra={
            "chunks": len(chunks),
            "chunk_size": size,
            "overlap": ovlp,
        },
    )

    return chunks