Coverage for app/processing/embedding.py: 100%
23 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-24 13:18 +0530
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-24 13:18 +0530
1from typing import List
3from sentence_transformers import SentenceTransformer
5from app.config import settings
6from app.utils.logger import get_logger
8logger = get_logger(__name__)
10_model: SentenceTransformer | None = None
def _get_model() -> SentenceTransformer:
    """
    Return the shared embedding model, loading it on first use.

    The loaded instance is cached in the module-level ``_model`` so
    subsequent calls are cheap.
    """
    global _model

    # Fast path: model already loaded.
    if _model is not None:
        return _model

    logger.info(
        "Loading embedding model",
        extra={
            "model_name": settings.EMBEDDING_MODEL_NAME,
            "device": settings.EMBEDDING_DEVICE,
        },
    )
    _model = SentenceTransformer(
        settings.EMBEDDING_MODEL_NAME,
        device=settings.EMBEDDING_DEVICE,
    )
    return _model
def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Generate normalized embeddings for a list of texts.
    Safe for batch processing.

    Non-list input, an empty list, and entries that are not non-blank
    strings are all filtered out; if nothing usable remains, an empty
    list is returned and a warning is logged.
    """
    # Guard: anything that is not a non-empty list is rejected outright.
    if not isinstance(texts, list) or not texts:
        logger.warning("Embedding skipped: invalid input")
        return []

    # Keep only genuine, non-blank strings.
    usable = [item for item in texts if isinstance(item, str) and item.strip()]
    if not usable:
        logger.warning("Embedding skipped: empty text list")
        return []

    # encode() returns a 2-D array because normalize_embeddings=True
    # keeps the numpy output; .shape[1] below relies on that.
    vectors = _get_model().encode(
        usable,
        batch_size=settings.EMBEDDING_BATCH_SIZE,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    logger.debug(
        "Embeddings generated",
        extra={
            "input_texts": len(usable),
            "embedding_dim": vectors.shape[1],
        },
    )
    return vectors.tolist()