Coverage for app \ fetchers \ pubmed_fetcher.py: 97%

67 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-24 13:18 +0530

1import time 

2from typing import List, Dict, Optional 

3 

4from Bio import Entrez 

5from bs4 import BeautifulSoup 

6 

7from app.config import settings 

8from app.utils.logger import get_logger 

9 

# Module-level logger, named after this module for filterable log output.
logger = get_logger(__name__)

# NCBI requires every Entrez client to identify itself with a contact
# email and a tool name; anonymous requests may be throttled or blocked.
Entrez.email = settings.NCBI_EMAIL
Entrez.tool = "health_assistant_graph_rag"

15 

16 

def search_pmc_articles(query: str, max_results: int) -> List[str]:
    """
    Search PubMed Central (PMC) for open-access full-text articles.

    The query is restricted with PMC's ``open access[filter]`` so that
    only articles with retrievable full text are returned.

    Returns a list of PMC ID strings, or an empty list if the search
    fails for any reason (the failure is logged, not raised).
    """
    full_query = f"{query} AND open access[filter]"
    logger.info("Searching PMC", extra={"query": query, "max_results": max_results})

    try:
        handle = Entrez.esearch(db="pmc", term=full_query, retmax=max_results)
        record = Entrez.read(handle)
        handle.close()
    except Exception:
        # Best-effort: network/API failures degrade to "no results".
        logger.exception("PMC search failed")
        return []

    pmc_ids = record.get("IdList", [])
    logger.info("PMC search complete", extra={"results": len(pmc_ids)})
    return pmc_ids

40 

41 

def _extract_full_text(soup: BeautifulSoup) -> str:
    """
    Extract readable full text from PMC XML <body>.

    Each <sec> element becomes a markdown-style section: a ``##`` heading
    (the <title> text, or "Section" when absent) followed by its
    paragraph texts. Sections without paragraphs are skipped. Returns an
    empty string when the document has no <body>.
    """
    body = soup.find("body")
    if not body:
        return ""

    rendered_sections = []
    for section in body.find_all("sec"):
        heading_tag = section.find("title")
        heading = heading_tag.get_text(strip=True) if heading_tag else "Section"

        paragraph_texts = [node.get_text(strip=True) for node in section.find_all("p")]
        if not paragraph_texts:
            continue

        section_body = "\n".join(paragraph_texts)
        rendered_sections.append(f"## {heading}\n{section_body}")

    return "\n\n".join(rendered_sections)

60 

61 

def fetch_pmc_details(pmc_id: str) -> Optional[Dict]:
    """
    Fetch and parse a single PMC full-text article.

    Args:
        pmc_id: The PMC identifier to fetch.

    Returns:
        A dict with keys ``pmid``, ``title``, ``journal``, ``year``,
        ``abstract`` (holds the extracted full text) and ``study_type``,
        or None if the fetch or parse fails (failure is logged).
    """
    try:
        handle = Entrez.efetch(
            db="pmc",
            id=pmc_id,
            rettype="full",
            retmode="xml",
        )
        xml_data = handle.read()
        handle.close()

        soup = BeautifulSoup(xml_data, "lxml-xml")

        # Defaults used when the corresponding XML element is missing.
        article = {
            "pmid": pmc_id,
            "title": "No Title",
            "journal": "Unknown Journal",
            "year": 0,
            "abstract": "",
            "study_type": "Full Text Article",
        }

        title_tag = soup.find("article-title")
        if title_tag:
            article["title"] = title_tag.get_text(strip=True)

        journal_tag = soup.find("journal-title")
        if journal_tag:
            article["journal"] = journal_tag.get_text(strip=True)

        # Prefer the electronic publication date; fall back to any pub-date.
        pub_date = soup.find("pub-date", {"pub-type": "epub"}) or soup.find("pub-date")
        year_tag = pub_date.find("year") if pub_date else None
        if year_tag:
            year_text = year_tag.get_text(strip=True)
            # Bug fix: a non-numeric <year> previously raised ValueError in
            # int(...), fell into the broad except below, and discarded the
            # whole article. Keep the default year of 0 instead.
            if year_text.isdigit():
                article["year"] = int(year_text)

        # Store the full text under "abstract" for compatibility with
        # abstract-only article records elsewhere in the pipeline.
        article["abstract"] = _extract_full_text(soup)

        return article

    except Exception:
        # Best-effort pipeline: a single bad article is logged and skipped.
        logger.exception("Failed to fetch PMC article", extra={"pmc_id": pmc_id})
        return None

107 

108 

def fetch_all_pmc_articles(query: str, max_results: int = 5) -> List[Dict]:
    """
    End-to-end PMC fetch pipeline.

    Searches PMC for *query*, fetches each hit's full text, and keeps
    only articles whose extracted text exceeds
    ``settings.MIN_TEXT_LENGTH`` characters. Sleeps between requests to
    respect NCBI rate limits.
    """
    articles: List[Dict] = []

    for pmc_id in search_pmc_articles(query, max_results):
        article = fetch_pmc_details(pmc_id)

        keep = bool(article) and len(article.get("abstract", "")) > settings.MIN_TEXT_LENGTH
        if keep:
            articles.append(article)
            logger.info("PMC article processed", extra={"pmc_id": pmc_id})

        # Throttle every iteration (even failures) to stay within NCBI limits.
        time.sleep(settings.NCBI_REQUEST_DELAY)

    logger.info("PMC ingestion complete", extra={"articles": len(articles)})
    return articles