# Source code for haive.core.engine.retriever.providers.BM25RetrieverConfig

"""BM25 Retriever implementation for the Haive framework.

This module provides a configuration class for the BM25 (Best Matching 25) retriever,
which uses the BM25 ranking function for text retrieval. BM25 is a probabilistic
ranking function used by search engines to estimate the relevance of documents
to a given search query.

The BM25Retriever works by:
1. Tokenizing and preprocessing documents and queries
2. Computing BM25 scores for each document-query pair
3. Ranking documents by their BM25 scores
4. Returning the top-k most relevant documents

This retriever is particularly useful when:
- Working with text-based document collections
- Need precise keyword matching and term frequency analysis
- Want interpretable ranking scores
- Building traditional information retrieval systems
- Combining with vector search in hybrid approaches

The implementation integrates with LangChain's BM25Retriever while providing
a consistent Haive configuration interface.
"""

from typing import Any

from langchain_core.documents import Document
from pydantic import Field

from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType



@BaseRetrieverConfig.register(RetrieverType.BM25)
class BM25RetrieverConfig(BaseRetrieverConfig):
    """Configuration for BM25 retriever in the Haive framework.

    This retriever uses the BM25 ranking function to score documents based on
    term frequency, inverse document frequency, and document length normalization.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always BM25).
        documents (List[Document]): Documents to index for retrieval.
        k (int): Number of documents to retrieve (default: 4, allowed 1-100).
        k1 (float): BM25 parameter controlling term frequency saturation
            (default: 1.2, allowed 0.0-3.0).
        b (float): BM25 parameter controlling document length normalization
            (default: 0.75, allowed 0.0-1.0).
        epsilon (float): BM25 parameter for IDF calculation
            (default: 0.25, allowed 0.0-1.0).

    Examples:
        >>> from haive.core.engine.retriever import BM25RetrieverConfig
        >>> from langchain_core.documents import Document
        >>>
        >>> # Create documents
        >>> docs = [
        ...     Document(page_content="Machine learning is a subset of AI"),
        ...     Document(page_content="Deep learning uses neural networks"),
        ...     Document(page_content="Natural language processing handles text")
        ... ]
        >>>
        >>> # Create the BM25 retriever config
        >>> config = BM25RetrieverConfig(
        ...     name="bm25_retriever",
        ...     documents=docs,
        ...     k=2,
        ...     k1=1.5,  # Higher term frequency saturation
        ...     b=0.8    # More document length normalization
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> results = retriever.invoke("machine learning algorithms")
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.BM25, description="The type of retriever"
    )

    # Documents to index
    documents: list[Document] = Field(
        default_factory=list, description="Documents to index for BM25 retrieval"
    )

    # Retrieval parameters
    k: int = Field(
        default=4, ge=1, le=100, description="Number of documents to retrieve"
    )

    # BM25 algorithm parameters
    k1: float = Field(
        default=1.2,
        ge=0.0,
        le=3.0,
        description="BM25 k1 parameter controlling term frequency saturation",
    )

    b: float = Field(
        default=0.75,
        ge=0.0,
        le=1.0,
        description="BM25 b parameter controlling document length normalization",
    )

    epsilon: float = Field(
        default=0.25,
        ge=0.0,
        le=1.0,
        description="BM25 epsilon parameter for IDF calculation",
    )

    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for BM25 retriever.

        Returns:
            A mapping with a single ``"query"`` entry: a ``(str, Field)`` pair
            describing the text query to rank documents against.
        """
        return {
            "query": (str, Field(description="Text query for BM25 ranking")),
        }

    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for BM25 retriever.

        Returns:
            A mapping with a single ``"documents"`` entry: a
            ``(list[Document], Field)`` pair whose default is an empty list.
        """
        return {
            "documents": (
                list[Document],
                Field(
                    default_factory=list, description="Documents ranked by BM25 scores"
                ),
            ),
        }

    def instantiate(self) -> Any:
        """Create a BM25 retriever from this configuration.

        Returns:
            BM25Retriever: Instantiated retriever ready for text ranking.

        Raises:
            ImportError: If required packages are not available.
            ValueError: If documents list is empty.
        """
        # Imported lazily so that langchain-community stays an optional dependency.
        try:
            from langchain_community.retrievers import BM25Retriever
        except ImportError as err:
            # Chain the original error so the root cause remains visible.
            raise ImportError(
                "BM25Retriever requires langchain-community package. "
                "Install with: pip install langchain-community"
            ) from err

        if not self.documents:
            raise ValueError(
                "BM25Retriever requires a non-empty list of documents. "
                "Provide documents in the configuration."
            )

        # The scoring parameters must travel through ``bm25_params``: that is the
        # only argument ``from_documents`` hands to the BM25 vectorizer. Extra
        # keyword arguments go to the retriever constructor instead, where
        # k1/b/epsilon are not fields and would not influence ranking.
        return BM25Retriever.from_documents(
            documents=self.documents,
            bm25_params={
                "k1": self.k1,
                "b": self.b,
                "epsilon": self.epsilon,
            },
            k=self.k,
        )