Coverage for app/fetchers/pubmed_fetcher.py: 97%
67 statements
« prev ^ index » next — coverage.py v7.13.4, created at 2026-02-24 13:18 +0530
1import time
2from typing import List, Dict, Optional
4from Bio import Entrez
5from bs4 import BeautifulSoup
7from app.config import settings
8from app.utils.logger import get_logger
logger = get_logger(__name__)

# NCBI requires a contact e-mail and a tool name on every Entrez request;
# configure them once at import time so all esearch/efetch calls below comply.
Entrez.email = settings.NCBI_EMAIL
Entrez.tool = "health_assistant_graph_rag"
def search_pmc_articles(query: str, max_results: int) -> List[str]:
    """Search PubMed Central (PMC) for open-access full-text articles.

    Args:
        query: Free-text search expression forwarded to Entrez.
        max_results: Upper bound on the number of IDs to return.

    Returns:
        A list of PMC ID strings; empty when the search fails.
    """
    # Restrict hits to open-access articles so full text can be fetched later.
    search_term = f"{query} AND open access[filter]"
    logger.info("Searching PMC", extra={"query": query, "max_results": max_results})

    try:
        handle = Entrez.esearch(db="pmc", term=search_term, retmax=max_results)
        result = Entrez.read(handle)
        handle.close()

        id_list = result.get("IdList", [])
        logger.info("PMC search complete", extra={"results": len(id_list)})
        return id_list
    except Exception:
        # Best-effort: any network/NCBI failure degrades to an empty result set.
        logger.exception("PMC search failed")
        return []
def _extract_full_text(soup: BeautifulSoup) -> str:
    """Extract readable full text from a PMC XML ``<body>``.

    Each ``<sec>`` becomes a markdown-style section: a ``## title`` heading
    followed by its paragraphs, sections separated by blank lines.

    Args:
        soup: Parsed PMC article XML.

    Returns:
        The concatenated section text, or "" when no ``<body>`` is present.
    """
    body = soup.find("body")
    if not body:
        return ""

    sections: List[str] = []
    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        section_title = title_tag.get_text(strip=True) if title_tag else "Section"
        # recursive=False: body.find_all("sec") already yields nested <sec>
        # elements as their own iterations, so a recursive paragraph search
        # here would emit nested-section paragraphs twice.
        paragraphs = [
            p.get_text(strip=True) for p in sec.find_all("p", recursive=False)
        ]
        if paragraphs:
            sections.append(f"## {section_title}\n" + "\n".join(paragraphs))

    return "\n\n".join(sections)
def fetch_pmc_details(pmc_id: str) -> Optional[Dict]:
    """Fetch and parse a single PMC full-text article.

    Args:
        pmc_id: PMC identifier to fetch via Entrez efetch.

    Returns:
        A dict with keys ``pmid``, ``title``, ``journal``, ``year``,
        ``abstract`` (holds the extracted full text) and ``study_type``,
        or ``None`` when the fetch/parse fails entirely.
    """
    try:
        handle = Entrez.efetch(
            db="pmc",
            id=pmc_id,
            rettype="full",
            retmode="xml",
        )
        xml_data = handle.read()
        handle.close()

        soup = BeautifulSoup(xml_data, "lxml-xml")

        # Defaults used when the corresponding XML element is absent.
        article: Dict = {
            # NOTE: key name kept for downstream compatibility; value is a PMC ID.
            "pmid": pmc_id,
            "title": "No Title",
            "journal": "Unknown Journal",
            "year": 0,
            "abstract": "",
            "study_type": "Full Text Article",
        }

        title_tag = soup.find("article-title")
        if title_tag:
            article["title"] = title_tag.get_text(strip=True)

        journal_tag = soup.find("journal-title")
        if journal_tag:
            article["journal"] = journal_tag.get_text(strip=True)

        # Prefer the electronic publication date; fall back to any pub-date.
        pub_date = soup.find("pub-date", {"pub-type": "epub"}) or soup.find("pub-date")
        if pub_date:
            year_tag = pub_date.find("year")
            if year_tag:
                try:
                    article["year"] = int(year_tag.get_text(strip=True))
                except ValueError:
                    # A malformed <year> must not discard the whole article;
                    # keep the default year of 0 instead.
                    logger.warning(
                        "Unparseable publication year", extra={"pmc_id": pmc_id}
                    )

        # The full body text is stored under "abstract" for downstream use.
        article["abstract"] = _extract_full_text(soup)

        return article
    except Exception:
        logger.exception("Failed to fetch PMC article", extra={"pmc_id": pmc_id})
        return None
def fetch_all_pmc_articles(query: str, max_results: int = 5) -> List[Dict]:
    """End-to-end PMC fetch pipeline.

    Searches PMC, fetches each hit's full text, and keeps only articles whose
    extracted text exceeds ``settings.MIN_TEXT_LENGTH``.

    Args:
        query: Free-text search expression.
        max_results: Maximum number of PMC hits to process.

    Returns:
        The list of accepted article dicts (possibly empty).
    """
    articles: List[Dict] = []

    for pmc_id in search_pmc_articles(query, max_results):
        article = fetch_pmc_details(pmc_id)

        # Keep only articles with enough extracted text to be useful.
        if article and len(article.get("abstract", "")) > settings.MIN_TEXT_LENGTH:
            articles.append(article)
            logger.info("PMC article processed", extra={"pmc_id": pmc_id})

        # Throttle between consecutive efetch calls per NCBI rate limits.
        time.sleep(settings.NCBI_REQUEST_DELAY)

    logger.info("PMC ingestion complete", extra={"articles": len(articles)})
    return articles