Source code for haive.core.engine.retriever.providers.PubMedRetrieverConfig

"""PubMed Retriever implementation for the Haive framework.

This module provides a configuration class for the PubMed retriever,
which retrieves biomedical and life science literature from the PubMed database.
PubMed is a free search engine accessing primarily the MEDLINE database of references
and abstracts on life sciences and biomedical topics.

The PubMedRetriever works by:
1. Connecting to the PubMed API (via NCBI E-utilities)
2. Executing search queries against the PubMed database
3. Retrieving article abstracts and metadata
4. Returning formatted documents with biomedical literature

This retriever is particularly useful when:
- Building medical or healthcare applications
- Researching biomedical topics and treatments
- Creating evidence-based medicine tools
- Developing clinical decision support systems
- Building scientific literature review applications

The implementation integrates with LangChain's PubMedRetriever while providing
a consistent Haive configuration interface.
"""

from typing import Any

from langchain_core.documents import Document
from pydantic import Field

from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType


@BaseRetrieverConfig.register(RetrieverType.PUBMED)
class PubMedRetrieverConfig(BaseRetrieverConfig):
    """Configuration for PubMed retriever in the Haive framework.

    This retriever searches the PubMed database for biomedical literature
    and returns article abstracts and metadata as documents.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always PUBMED).
        top_k_results (int): Number of articles to retrieve (default: 3,
            allowed range 1-100).
        load_max_docs (int): Maximum number of documents to load (default: 25,
            allowed range 1-200).
        load_all_available_meta (bool): Whether to load all available metadata
            (default: False).
        doc_content_chars_max (int): Maximum characters per document
            (default: 4000, allowed range 500-10000).
        email (str | None): Email for NCBI API (recommended for higher rate
            limits). Only forwarded to the retriever when non-empty.
        min_year (int | None): Minimum publication year filter (1900-2030).
            Stored on the config only; ``instantiate`` does not forward it.
        max_year (int | None): Maximum publication year filter (1900-2030).
            Stored on the config only; ``instantiate`` does not forward it.

    Examples:
        >>> from haive.core.engine.retriever import PubMedRetrieverConfig
        >>>
        >>> # Create the PubMed retriever config
        >>> config = PubMedRetrieverConfig(
        ...     name="pubmed_retriever",
        ...     top_k_results=5,
        ...     load_max_docs=20,
        ...     load_all_available_meta=True,
        ...     email="researcher@university.edu"  # Optional but recommended
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.get_relevant_documents("COVID-19 vaccine effectiveness")
        >>>
        >>> # Example with specific medical query
        >>> docs = retriever.get_relevant_documents("CRISPR gene editing cancer treatment")
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.PUBMED, description="The type of retriever"
    )

    # Search parameters
    top_k_results: int = Field(
        default=3,
        ge=1,
        le=100,
        description="Number of articles to retrieve from PubMed",
    )
    load_max_docs: int = Field(
        default=25, ge=1, le=200, description="Maximum number of documents to load"
    )

    # Content parameters
    load_all_available_meta: bool = Field(
        default=False, description="Whether to load all available metadata fields"
    )
    doc_content_chars_max: int = Field(
        default=4000,
        ge=500,
        le=10000,
        description="Maximum characters per document content",
    )

    # API configuration
    email: str | None = Field(
        default=None,
        description="Email address for NCBI API (recommended for higher rate limits)",
    )

    # Search filters (declared for callers; not consumed by instantiate below)
    min_year: int | None = Field(
        default=None, ge=1900, le=2030, description="Minimum publication year filter"
    )
    max_year: int | None = Field(
        default=None, ge=1900, le=2030, description="Maximum publication year filter"
    )

    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for PubMed retriever.

        Returns:
            A mapping with a single ``"query"`` entry typed as ``str``.
        """
        return {
            "query": (str, Field(description="Biomedical search query for PubMed")),
        }

    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for PubMed retriever.

        Returns:
            A mapping with a single ``"documents"`` entry typed as
            ``list[Document]`` that defaults to an empty list.
        """
        return {
            "documents": (
                list[Document],
                Field(
                    default_factory=list,
                    description="Biomedical articles from PubMed",
                ),
            ),
        }

    def instantiate(self) -> Any:
        """Create a PubMed retriever from this configuration.

        Returns:
            PubMedRetriever: Instantiated retriever ready for biomedical
            literature search.

        Raises:
            ImportError: If required packages are not available. The original
                import failure is attached as ``__cause__``.
        """
        # Imported lazily so the optional dependency is only needed when a
        # retriever is actually built.
        try:
            from langchain_community.retrievers import PubMedRetriever
        except ImportError as err:
            raise ImportError(
                "PubMedRetriever requires langchain-community package. "
                "Install with: pip install langchain-community"
            ) from err

        # Prepare configuration parameters
        config_params: dict[str, Any] = {
            "top_k_results": self.top_k_results,
            "load_max_docs": self.load_max_docs,
            "load_all_available_meta": self.load_all_available_meta,
            "doc_content_chars_max": self.doc_content_chars_max,
        }

        # Add optional parameters; an unset or empty email is left out so the
        # retriever falls back to its own default.
        if self.email:
            config_params["email"] = self.email

        # Note: PubMed date filtering is typically done in the query string
        # e.g., "COVID-19 AND 2020:2024[dp]" for date range, so min_year and
        # max_year are intentionally not passed here.
        return PubMedRetriever(**config_params)