Coverage for app \ processing \ chunker.py: 100%

15 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-24 13:18 +0530

1from typing import List 

2 

3from langchain_text_splitters import RecursiveCharacterTextSplitter 

4 

5from app.config import settings 

6from app.utils.logger import get_logger 

7 

8logger = get_logger(__name__) 

9 

10 

def simple_chunk(
    text: str,
    chunk_size: int | None = None,
    overlap: int | None = None,
) -> List[str]:
    """
    Split text into overlapping chunks suitable for embeddings and NER.

    A new ``RecursiveCharacterTextSplitter`` is built on every call, so
    calls do not share splitter state.

    Args:
        text: The text to split. Empty or non-string input is skipped with
            a warning.
        chunk_size: Value handed to the splitter as ``chunk_size``. ``None``
            (or any other falsy value such as ``0``) falls back to
            ``settings.CHUNK_SIZE``.
        overlap: Value handed to the splitter as ``chunk_overlap``. Only
            ``None`` falls back to ``settings.CHUNK_OVERLAP``; an explicit
            ``0`` is honoured and requests no overlap.

    Returns:
        The list of chunks returned by the splitter, or an empty list when
        ``text`` is empty or not a string.
    """
    if not text or not isinstance(text, str):
        logger.warning("Chunking skipped: invalid or empty text")
        return []

    # Falsy chunk sizes (None or 0) select the configured default.
    size = chunk_size or settings.CHUNK_SIZE
    # Compare against None rather than truthiness: 0 is a legitimate overlap
    # and must not be replaced by the configured default.
    ovlp = settings.CHUNK_OVERLAP if overlap is None else overlap

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=ovlp,
        # Separators are listed from coarsest (paragraph break) to finest
        # (empty string).
        separators=["\n\n", "\n", ".", " ", ""],
    )

    chunks = splitter.split_text(text)

    logger.debug(
        "Text chunked",
        extra={
            "chunks": len(chunks),
            "chunk_size": size,
            "overlap": ovlp,
        },
    )

    return chunks