Source code for haive.core.engine.retriever.providers.VespaRetrieverConfig

"""Vespa Retriever implementation for the Haive framework.

from typing import Any
This module provides a configuration class for the Vespa retriever,
which uses Vespa search engine for advanced search and retrieval capabilities.
Vespa is a fully featured search engine and vector database which supports
vector search, lexical search, and hybrid ranking in a single query.

The VespaRetriever works by:
1. Connecting to a Vespa application
2. Supporting both vector and text search simultaneously
3. Providing advanced ranking and filtering capabilities
4. Enabling real-time search and content updates

This retriever is particularly useful when:
- Need hybrid search combining vector and text search
- Require real-time search with continuous updates
- Want advanced ranking and relevance tuning
- Building large-scale search applications
- Need both structured and unstructured data search

The implementation integrates with LangChain's Vespa retriever while
providing a consistent Haive configuration interface.
"""

from typing import Any

from langchain_core.documents import Document
from pydantic import Field

from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType



[docs]
@BaseRetrieverConfig.register(RetrieverType.VESPA)
class VespaRetrieverConfig(BaseRetrieverConfig):
    """Configuration for Vespa retriever in the Haive framework.

    This retriever uses Vespa search engine to perform hybrid search
    combining vector similarity and text search capabilities.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always VESPA).
        url (str): Vespa application URL.
        content_field (str): Field containing document content.
        k (int): Number of documents to retrieve.
        metadata_fields (List[str]): Fields to include in metadata.
        vespa_query_body (Optional[Dict]): Custom Vespa query configuration.

    Examples:
        >>> from haive.core.engine.retriever import VespaRetrieverConfig
        >>>
        >>> # Create the Vespa retriever config
        >>> config = VespaRetrieverConfig(
        ...     name="vespa_retriever",
        ...     url="http://localhost:8080",
        ...     content_field="content",
        ...     k=10,
        ...     metadata_fields=["title", "author", "category"],
        ...     vespa_query_body={
        ...         "yql": "select * from sources * where userQuery()",
        ...         "hits": 10,
        ...         "ranking": "bm25"
        ...     }
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.get_relevant_documents("machine learning neural networks")
        >>>
        >>> # Example with hybrid search
        >>> hybrid_config = VespaRetrieverConfig(
        ...     name="vespa_hybrid_retriever",
        ...     url="http://localhost:8080",
        ...     content_field="content",
        ...     vespa_query_body={
        ...         "yql": "select * from sources * where ({targetHits:10}nearestNeighbor(embedding,q)) or userQuery()",
        ...         "ranking": "hybrid",
        ...         "input.query(q)": "embed(@query)"
        ...     }
        ... )
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.VESPA, description="The type of retriever"
    )

    # Vespa connection configuration
    url: str = Field(
        ..., description="Vespa application URL (e.g., 'http://localhost:8080')"
    )

    content_field: str = Field(
        default="content", description="Field containing document content"
    )

    # Search parameters
    k: int = Field(
        default=10, ge=1, le=100, description="Number of documents to retrieve"
    )

    metadata_fields: list[str] = Field(
        default_factory=list,
        description="List of fields to include in document metadata",
    )

    # Vespa query configuration
    vespa_query_body: dict[str, Any] | None = Field(
        default=None, description="Custom Vespa query body configuration"
    )

    # Advanced parameters
    ranking_profile: str = Field(
        default="default", description="Vespa ranking profile to use"
    )

    query_model: str = Field(
        default="simple", description="Query model: 'simple', 'all', 'any', 'weakAnd'"
    )

    timeout: float = Field(
        default=30.0, ge=0.1, le=300.0, description="Query timeout in seconds"
    )


[docs]
    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for Vespa retriever."""
        return {
            "query": (str, Field(description="Search query for Vespa")),
        }



[docs]
    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for Vespa retriever."""
        return {
            "documents": (
                list[Document],
                Field(default_factory=list, description="Documents from Vespa search"),
            ),
        }



[docs]
    def instantiate(self) -> Any:
        """Create a Vespa retriever from this configuration.

        Returns:
            VespaRetriever: Instantiated retriever ready for hybrid search.

        Raises:
            ImportError: If required packages are not available.
            ValueError: If configuration is invalid.
        """
        try:
            from langchain_community.retrievers import VespaRetriever
        except ImportError:
            raise ImportError(
                "VespaRetriever requires langchain-community and pyvespa packages. "
                "Install with: pip install langchain-community pyvespa"
            )

        # Prepare configuration
        config = {"url": self.url, "content_field": self.content_field, "k": self.k}

        # Add metadata fields if specified
        if self.metadata_fields:
            config["metadata_fields"] = self.metadata_fields

        # Configure query body
        if self.vespa_query_body:
            query_body = self.vespa_query_body.copy()
        else:
            # Default query body
            query_body = {
                "yql": "select * from sources * where userQuery()",
                "hits": self.k,
                "ranking": self.ranking_profile,
                "type": self.query_model,
                "timeout": f"{int(self.timeout * 1000)}ms",
            }

        config["vespa_query_body"] = query_body

        return VespaRetriever(**config)