Coverage for app/processing/embedding.py: 100%

23 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-24 13:18 +0530

1from typing import List 

2 

3from sentence_transformers import SentenceTransformer 

4 

5from app.config import settings 

6from app.utils.logger import get_logger 

7 

# Module-scoped structured logger, named after this module's import path.
logger = get_logger(__name__)

# Process-wide cached SentenceTransformer instance.
# Starts as None and is populated lazily by _get_model() on first use,
# so importing this module does not trigger a (slow) model download/load.
# NOTE(review): no lock around the lazy init — assumes single-threaded
# first access; confirm if this runs under a threaded server.
_model: SentenceTransformer | None = None

11 

12 

def _get_model() -> SentenceTransformer:
    """
    Return the shared embedding model, loading it on first use.

    The loaded instance is cached in the module-level ``_model`` so the
    (expensive) model load happens at most once per process. Model name
    and device come from application settings.
    """
    global _model

    # Fast path: model already loaded — hand back the cached instance.
    if _model is not None:
        return _model

    # First call: log what we are about to load, then build and cache it.
    logger.info(
        "Loading embedding model",
        extra={
            "model_name": settings.EMBEDDING_MODEL_NAME,
            "device": settings.EMBEDDING_DEVICE,
        },
    )
    _model = SentenceTransformer(
        settings.EMBEDDING_MODEL_NAME,
        device=settings.EMBEDDING_DEVICE,
    )
    return _model

33 

34 

def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Generate L2-normalized embeddings for a list of texts.

    Non-string and whitespace-only entries are silently dropped before
    encoding, so the output may be shorter than the input. Invalid or
    effectively-empty input is logged and yields an empty list rather
    than raising. Safe for batch processing.

    :param texts: list of input strings to embed.
    :return: one embedding vector (list of floats) per surviving text.
    """
    # Guard: reject non-list input and the empty list up front.
    if not isinstance(texts, list) or not texts:
        logger.warning("Embedding skipped: invalid input")
        return []

    # Keep only real, non-blank strings.
    clean_texts = []
    for candidate in texts:
        if isinstance(candidate, str) and candidate.strip():
            clean_texts.append(candidate)

    if not clean_texts:
        logger.warning("Embedding skipped: empty text list")
        return []

    # Encode in batches; normalization makes vectors unit-length so that
    # dot product equals cosine similarity downstream.
    embeddings = _get_model().encode(
        clean_texts,
        batch_size=settings.EMBEDDING_BATCH_SIZE,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    logger.debug(
        "Embeddings generated",
        extra={
            "input_texts": len(clean_texts),
            "embedding_dim": embeddings.shape[1],
        },
    )

    # encode() returns an ndarray; convert to plain Python lists.
    return embeddings.tolist()