Coverage for app \ fetchers \ pubmed_fetcher.py: 97%

67 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-24 13:18 +0530

1import time 

2from typing import List, Dict, Optional 

3 

4from Bio import Entrez 

5from bs4 import BeautifulSoup 

6 

7from app.config import settings 

8from app.utils.logger import get_logger 

9 

# Module-level logger, named after this module for filterable log output.
logger = get_logger(__name__)

# NCBI requires every Entrez client to identify itself with a contact
# email and a tool name; anonymous requests may be throttled or blocked.
Entrez.email = settings.NCBI_EMAIL
Entrez.tool = "health_assistant_graph_rag"

15 

16 

def search_pmc_articles(query: str, max_results: int) -> List[str]:
    """
    Search PubMed Central (PMC) for open-access full-text articles.

    The query is restricted with PMC's ``open access[filter]`` so that
    only articles with retrievable full text are returned.

    Returns a list of PMC ID strings, or an empty list if the search
    fails for any reason (the failure is logged, not raised).
    """
    full_query = f"{query} AND open access[filter]"
    logger.info("Searching PMC", extra={"query": query, "max_results": max_results})

    try:
        handle = Entrez.esearch(db="pmc", term=full_query, retmax=max_results)
        record = Entrez.read(handle)
        handle.close()
    except Exception:
        # Best-effort: network/API failures degrade to "no results".
        logger.exception("PMC search failed")
        return []

    pmc_ids = record.get("IdList", [])
    logger.info("PMC search complete", extra={"results": len(pmc_ids)})
    return pmc_ids

40 

41 

def _extract_full_text(soup: BeautifulSoup) -> str:
    """
    Extract readable full text from PMC XML <body>.

    Each <sec> element becomes a markdown-style section: a ``##`` heading
    (the <title> text, or "Section" when absent) followed by its
    paragraph texts. Sections without paragraphs are skipped. Returns an
    empty string when the document has no <body>.
    """
    body = soup.find("body")
    if not body:
        return ""

    rendered_sections = []
    for section in body.find_all("sec"):
        heading_tag = section.find("title")
        heading = heading_tag.get_text(strip=True) if heading_tag else "Section"

        paragraph_texts = [node.get_text(strip=True) for node in section.find_all("p")]
        if not paragraph_texts:
            continue

        section_body = "\n".join(paragraph_texts)
        rendered_sections.append(f"## {heading}\n{section_body}")

    return "\n\n".join(rendered_sections)

60 

61 

def fetch_pmc_details(pmc_id: str) -> Optional[Dict]:
    """
    Fetch and parse a single PMC full-text article.

    Args:
        pmc_id: The PMC identifier to fetch.

    Returns:
        A dict with keys ``pmid``, ``title``, ``journal``, ``year``,
        ``abstract`` (holds the extracted full text) and ``study_type``,
        or None if the fetch or parse fails (failure is logged).
    """
    try:
        handle = Entrez.efetch(
            db="pmc",
            id=pmc_id,
            rettype="full",
            retmode="xml",
        )
        xml_data = handle.read()
        handle.close()

        soup = BeautifulSoup(xml_data, "lxml-xml")

        # Defaults used when the corresponding XML element is missing.
        article = {
            "pmid": pmc_id,
            "title": "No Title",
            "journal": "Unknown Journal",
            "year": 0,
            "abstract": "",
            "study_type": "Full Text Article",
        }

        title_tag = soup.find("article-title")
        if title_tag:
            article["title"] = title_tag.get_text(strip=True)

        journal_tag = soup.find("journal-title")
        if journal_tag:
            article["journal"] = journal_tag.get_text(strip=True)

        # Prefer the electronic publication date; fall back to any pub-date.
        pub_date = soup.find("pub-date", {"pub-type": "epub"}) or soup.find("pub-date")
        year_tag = pub_date.find("year") if pub_date else None
        if year_tag:
            year_text = year_tag.get_text(strip=True)
            # Bug fix: a non-numeric <year> previously raised ValueError in
            # int(...), fell into the broad except below, and discarded the
            # whole article. Keep the default year of 0 instead.
            if year_text.isdigit():
                article["year"] = int(year_text)

        # Store the full text under "abstract" for compatibility with
        # abstract-only article records elsewhere in the pipeline.
        article["abstract"] = _extract_full_text(soup)

        return article

    except Exception:
        # Best-effort pipeline: a single bad article is logged and skipped.
        logger.exception("Failed to fetch PMC article", extra={"pmc_id": pmc_id})
        return None

107 

108 

def fetch_all_pmc_articles(query: str, max_results: int = 5) -> List[Dict]:
    """
    End-to-end PMC fetch pipeline.

    Searches PMC for *query*, fetches each hit's full text, and keeps
    only articles whose extracted text exceeds
    ``settings.MIN_TEXT_LENGTH`` characters. Sleeps between requests to
    respect NCBI rate limits.
    """
    articles: List[Dict] = []

    for pmc_id in search_pmc_articles(query, max_results):
        article = fetch_pmc_details(pmc_id)

        keep = bool(article) and len(article.get("abstract", "")) > settings.MIN_TEXT_LENGTH
        if keep:
            articles.append(article)
            logger.info("PMC article processed", extra={"pmc_id": pmc_id})

        # Throttle every iteration (even failures) to stay within NCBI limits.
        time.sleep(settings.NCBI_REQUEST_DELAY)

    logger.info("PMC ingestion complete", extra={"articles": len(articles)})
    return articles