Source code for haive.core.engine.retriever.providers.KNNRetrieverConfig

"""K-Nearest Neighbors Retriever implementation for the Haive framework.

This module provides a configuration class for the KNN (K-Nearest Neighbors) retriever,
which uses k-nearest neighbors algorithm for document retrieval based on vector similarity.
KNN finds the k most similar documents to a query by computing distances in the embedding space.

The KNNRetriever works by:
1. Embedding documents and queries using a specified embedding model
2. Computing similarity/distance metrics between query and document embeddings
3. Finding the k nearest neighbors based on the distance metric
4. Returning the k most similar documents

This retriever is particularly useful when:
- Working with small to medium-sized document collections
- Requiring simple but effective similarity-based retrieval
- Wanting interpretable distance-based ranking
- Building baseline vector retrieval systems
- Comparing against more complex vector databases

The implementation integrates with LangChain's KNNRetriever while providing
a consistent Haive configuration interface.
"""

from typing import Any

from langchain_core.documents import Document
from pydantic import Field

from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType


@BaseRetrieverConfig.register(RetrieverType.KNN)
class KNNRetrieverConfig(BaseRetrieverConfig):
    """Configuration for K-Nearest Neighbors retriever in the Haive framework.

    This retriever uses the KNN algorithm to find the most similar documents
    based on vector embeddings and distance metrics.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always KNN).
        documents (list[Document]): Documents to index for retrieval.
        k (int): Number of nearest neighbors to retrieve, constrained to
            the range 1-100 (default: 4).
        distance_metric (str): Distance metric to use ("cosine", "euclidean",
            "manhattan", "hamming"; default: "cosine"). Declared as a plain
            string; no check against the listed values is visible in this class.
        embedding_model (str | None): Embedding model to use for vectorization
            (default: None).
        algorithm (str): KNN algorithm ("auto", "ball_tree", "kd_tree",
            "brute"; default: "auto"). Declared as a plain string; no check
            against the listed values is visible in this class.

    Examples:
        >>> from haive.core.engine.retriever import KNNRetrieverConfig
        >>> from langchain_core.documents import Document
        >>>
        >>> # Create documents
        >>> docs = [
        ...     Document(page_content="Machine learning trains models on data"),
        ...     Document(page_content="Deep learning uses neural network architectures"),
        ...     Document(page_content="Natural language processing analyzes text patterns")
        ... ]
        >>>
        >>> # Create the KNN retriever config
        >>> config = KNNRetrieverConfig(
        ...     name="knn_retriever",
        ...     documents=docs,
        ...     k=2,
        ...     distance_metric="cosine",
        ...     embedding_model="sentence-transformers/all-MiniLM-L6-v2"
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.get_relevant_documents("machine learning training process")
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.KNN, description="The type of retriever"
    )

    # Documents to index
    documents: list[Document] = Field(
        default_factory=list, description="Documents to index for KNN retrieval"
    )

    # Retrieval parameters
    k: int = Field(
        default=4, ge=1, le=100, description="Number of nearest neighbors to retrieve"
    )

    # KNN algorithm parameters
    distance_metric: str = Field(
        default="cosine",
        description="Distance metric: 'cosine', 'euclidean', 'manhattan', 'hamming'",
    )

    embedding_model: str | None = Field(
        default=None,
        description="Embedding model for vectorization (e.g., 'sentence-transformers/all-MiniLM-L6-v2')",
    )

    # Additional KNN parameters
    algorithm: str = Field(
        default="auto",
        description="KNN algorithm: 'auto', 'ball_tree', 'kd_tree', 'brute'",
    )

    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for KNN retriever.

        Returns:
            A mapping with a single ``"query"`` entry: a ``(str, Field)``
            pair describing the text query.
        """
        return {
            "query": (str, Field(description="Text query for KNN similarity search")),
        }

    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for KNN retriever.

        Returns:
            A mapping with a single ``"documents"`` entry: a
            ``(list[Document], Field)`` pair whose default is an empty list.
        """
        return {
            "documents": (
                list[Document],
                Field(default_factory=list, description="K nearest neighbor documents"),
            ),
        }

    def instantiate(self) -> Any:
        """Create a KNN retriever from this configuration.

        Returns:
            KNNRetriever: The object returned by
            ``KNNRetriever.from_documents`` for the configured documents
            and parameters.

        Raises:
            ImportError: If ``langchain_community`` cannot be imported. The
                original import failure is attached as ``__cause__``.
            ValueError: If documents list is empty.
        """
        # Imported lazily so this module can be loaded without the optional
        # langchain-community dependency installed.
        try:
            from langchain_community.retrievers import KNNRetriever
        except ImportError as err:
            # Chain the original error so the underlying reason (missing
            # package vs. a broken transitive import) stays in the traceback.
            raise ImportError(
                "KNNRetriever requires langchain-community package. "
                "Install with: pip install langchain-community"
            ) from err

        if not self.documents:
            raise ValueError(
                "KNNRetriever requires a non-empty list of documents. "
                "Provide documents in the configuration."
            )

        # NOTE(review): LangChain documents `KNNRetriever.from_documents` as
        # taking an `embeddings` object; confirm that the installed version
        # accepts `distance_metric`, `embedding_model` and `algorithm` as
        # keyword arguments, or build an embeddings instance from
        # `embedding_model` here. TODO confirm against the pinned release.
        return KNNRetriever.from_documents(
            documents=self.documents,
            k=self.k,
            distance_metric=self.distance_metric,
            embedding_model=self.embedding_model,
            algorithm=self.algorithm,
        )