Source code for haive.core.engine.retriever.providers.BM25RetrieverConfig
"""BM25 Retriever implementation for the Haive framework.from typing import AnyThis module provides a configuration class for the BM25 (Best Matching 25) retriever,which uses the BM25 ranking function for text retrieval. BM25 is a probabilisticranking function used by search engines to estimate the relevance of documentsto a given search query.The BM25Retriever works by:1. Tokenizing and preprocessing documents and queries2. Computing BM25 scores for each document-query pair3. Ranking documents by their BM25 scores4. Returning the top-k most relevant documentsThis retriever is particularly useful when:- Working with text-based document collections- Need precise keyword matching and term frequency analysis- Want interpretable ranking scores- Building traditional information retrieval systems- Combining with vector search in hybrid approachesThe implementation integrates with LangChain's BM25Retriever while providinga consistent Haive configuration interface."""fromtypingimportAnyfromlangchain_core.documentsimportDocumentfrompydanticimportFieldfromhaive.core.engine.retriever.retrieverimportBaseRetrieverConfigfromhaive.core.engine.retriever.typesimportRetrieverType
@BaseRetrieverConfig.register(RetrieverType.BM25)
class BM25RetrieverConfig(BaseRetrieverConfig):
    """Configuration for BM25 retriever in the Haive framework.

    This retriever uses the BM25 ranking function to score documents based on
    term frequency, inverse document frequency, and document length
    normalization.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always BM25).
        documents (list[Document]): Documents to index for retrieval.
        k (int): Number of documents to retrieve (default: 4).
        k1 (float): BM25 parameter controlling term frequency saturation
            (default: 1.2).
        b (float): BM25 parameter controlling document length normalization
            (default: 0.75).
        epsilon (float): BM25 parameter for IDF calculation (default: 0.25).

    Examples:
        >>> from haive.core.engine.retriever import BM25RetrieverConfig
        >>> from langchain_core.documents import Document
        >>>
        >>> # Create documents
        >>> docs = [
        ...     Document(page_content="Machine learning is a subset of AI"),
        ...     Document(page_content="Deep learning uses neural networks"),
        ...     Document(page_content="Natural language processing handles text")
        ... ]
        >>>
        >>> # Create the BM25 retriever config
        >>> config = BM25RetrieverConfig(
        ...     name="bm25_retriever",
        ...     documents=docs,
        ...     k=2,
        ...     k1=1.5,  # Higher term frequency saturation
        ...     b=0.8  # More document length normalization
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.invoke("machine learning algorithms")
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.BM25, description="The type of retriever"
    )

    # Documents to index
    documents: list[Document] = Field(
        default_factory=list, description="Documents to index for BM25 retrieval"
    )

    # Retrieval parameters
    k: int = Field(
        default=4, ge=1, le=100, description="Number of documents to retrieve"
    )

    # BM25 algorithm parameters
    k1: float = Field(
        default=1.2,
        ge=0.0,
        le=3.0,
        description="BM25 k1 parameter controlling term frequency saturation",
    )
    b: float = Field(
        default=0.75,
        ge=0.0,
        le=1.0,
        description="BM25 b parameter controlling document length normalization",
    )
    epsilon: float = Field(
        default=0.25,
        ge=0.0,
        le=1.0,
        description="BM25 epsilon parameter for IDF calculation",
    )

    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for BM25 retriever.

        Returns:
            A mapping with a single ``"query"`` entry: a ``(str, Field)`` pair
            describing the text query.
        """
        return {
            "query": (str, Field(description="Text query for BM25 ranking")),
        }

    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for BM25 retriever.

        Returns:
            A mapping with a single ``"documents"`` entry: a
            ``(list[Document], Field)`` pair whose default is an empty list.
        """
        return {
            "documents": (
                list[Document],
                Field(
                    default_factory=list,
                    description="Documents ranked by BM25 scores",
                ),
            ),
        }

    def instantiate(self) -> Any:
        """Create a BM25 retriever from this configuration.

        Returns:
            BM25Retriever: Instantiated retriever ready for text ranking.

        Raises:
            ImportError: If required packages are not available.
            ValueError: If documents list is empty.
        """
        # Imported lazily so langchain-community is only required when a BM25
        # retriever is actually built.
        try:
            from langchain_community.retrievers import BM25Retriever
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "BM25Retriever requires langchain-community package. "
                "Install with: pip install langchain-community"
            ) from err

        if not self.documents:
            raise ValueError(
                "BM25Retriever requires a non-empty list of documents. "
                "Provide documents in the configuration."
            )

        # The scoring parameters must travel in ``bm25_params``: per the
        # LangChain API, that dict is what gets handed to the underlying BM25
        # scorer, while other keyword arguments (such as ``k``) configure the
        # retriever object itself. Passing k1/b/epsilon as plain keyword
        # arguments would leave the scorer on its library defaults.
        return BM25Retriever.from_documents(
            documents=self.documents,
            bm25_params={"k1": self.k1, "b": self.b, "epsilon": self.epsilon},
            k=self.k,
        )