"""Vespa Retriever implementation for the Haive framework.
from typing import Any
This module provides a configuration class for the Vespa retriever,
which uses Vespa search engine for advanced search and retrieval capabilities.
Vespa is a fully featured search engine and vector database which supports
vector search, lexical search, and hybrid ranking in a single query.
The VespaRetriever works by:
1. Connecting to a Vespa application
2. Supporting both vector and text search simultaneously
3. Providing advanced ranking and filtering capabilities
4. Enabling real-time search and content updates
This retriever is particularly useful when:
- Need hybrid search combining vector and text search
- Require real-time search with continuous updates
- Want advanced ranking and relevance tuning
- Building large-scale search applications
- Need both structured and unstructured data search
The implementation integrates with LangChain's Vespa retriever while
providing a consistent Haive configuration interface.
"""
from typing import Any
from langchain_core.documents import Document
from pydantic import Field
from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType
@BaseRetrieverConfig.register(RetrieverType.VESPA)
class VespaRetrieverConfig(BaseRetrieverConfig):
    """Configuration for Vespa retriever in the Haive framework.

    This retriever uses Vespa search engine to perform hybrid search
    combining vector similarity and text search capabilities.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always VESPA).
        url (str): Vespa application URL.
        content_field (str): Field containing document content.
        k (int): Number of documents to retrieve (1-100).
        metadata_fields (list[str]): Fields to include in metadata.
        vespa_query_body (dict[str, Any] | None): Custom Vespa query body.
            When set, it replaces the generated default body.
        ranking_profile (str): Ranking profile used in the default body.
        query_model (str): Query model ("type") used in the default body.
        timeout (float): Query timeout in seconds used in the default body.

    Examples:
        >>> from haive.core.engine.retriever import VespaRetrieverConfig
        >>>
        >>> # Create the Vespa retriever config
        >>> config = VespaRetrieverConfig(
        ...     name="vespa_retriever",
        ...     url="http://localhost:8080",
        ...     content_field="content",
        ...     k=10,
        ...     metadata_fields=["title", "author", "category"],
        ...     vespa_query_body={
        ...         "yql": "select * from sources * where userQuery()",
        ...         "hits": 10,
        ...         "ranking": "bm25"
        ...     }
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.invoke("machine learning neural networks")
        >>>
        >>> # Example with hybrid search
        >>> hybrid_config = VespaRetrieverConfig(
        ...     name="vespa_hybrid_retriever",
        ...     url="http://localhost:8080",
        ...     content_field="content",
        ...     vespa_query_body={
        ...         "yql": (
        ...             "select * from sources * where "
        ...             "({targetHits:10}nearestNeighbor(embedding,q)) or userQuery()"
        ...         ),
        ...         "ranking": "hybrid",
        ...         "input.query(q)": "embed(@query)"
        ...     }
        ... )
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.VESPA, description="The type of retriever"
    )

    # Vespa connection configuration
    url: str = Field(
        ..., description="Vespa application URL (e.g., 'http://localhost:8080')"
    )
    content_field: str = Field(
        default="content", description="Field containing document content"
    )

    # Search parameters
    k: int = Field(
        default=10, ge=1, le=100, description="Number of documents to retrieve"
    )
    metadata_fields: list[str] = Field(
        default_factory=list,
        description="List of fields to include in document metadata",
    )

    # Vespa query configuration
    vespa_query_body: dict[str, Any] | None = Field(
        default=None, description="Custom Vespa query body configuration"
    )

    # Advanced parameters (only applied when no custom query body is given)
    ranking_profile: str = Field(
        default="default", description="Vespa ranking profile to use"
    )
    query_model: str = Field(
        default="simple", description="Query model: 'simple', 'all', 'any', 'weakAnd'"
    )
    timeout: float = Field(
        default=30.0, ge=0.1, le=300.0, description="Query timeout in seconds"
    )

    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for Vespa retriever.

        Returns:
            A mapping with a single ``"documents"`` entry: a
            ``(list[Document], Field)`` pair whose default is an empty list.
        """
        return {
            "documents": (
                list[Document],
                Field(default_factory=list, description="Documents from Vespa search"),
            ),
        }

    def _build_query_body(self) -> dict[str, Any]:
        """Build the Vespa query body sent with every search.

        Returns:
            A copy of ``vespa_query_body`` when one is configured (with
            ``hits`` filled in from ``k`` if the custom body does not set
            it), otherwise a default ``userQuery()`` body built from ``k``,
            ``ranking_profile``, ``query_model`` and ``timeout``.
        """
        if self.vespa_query_body:
            # Copy so the stored configuration is never mutated.
            body = dict(self.vespa_query_body)
            # An explicit "hits" in the custom body wins; otherwise honor k.
            body.setdefault("hits", self.k)
            return body

        return {
            "yql": "select * from sources * where userQuery()",
            "hits": self.k,
            "ranking": self.ranking_profile,
            "type": self.query_model,
            # round() rather than int(): float error such as
            # 0.29 * 1000 == 289.99999999999994 must not drop a millisecond.
            "timeout": f"{round(self.timeout * 1000)}ms",
        }

    def instantiate(self) -> Any:
        """Create a Vespa retriever from this configuration.

        LangChain's ``VespaRetriever`` is constructed from a pyvespa
        application client plus a query body, so a ``Vespa`` client for
        ``url`` is created here and handed over together with the body from
        ``_build_query_body``.

        Returns:
            VespaRetriever: Instantiated retriever ready for hybrid search.

        Raises:
            ImportError: If langchain-community or pyvespa is not installed.
        """
        try:
            from langchain_community.retrievers import VespaRetriever
            from vespa.application import Vespa
        except ImportError as err:
            # Chain the cause so the name of the missing module stays visible.
            raise ImportError(
                "VespaRetriever requires langchain-community and pyvespa packages. "
                "Install with: pip install langchain-community pyvespa"
            ) from err

        return VespaRetriever(
            app=Vespa(url=self.url),
            body=self._build_query_body(),
            content_field=self.content_field,
            # Always passed (possibly empty): the retriever reads it for
            # every hit when assembling document metadata.
            metadata_fields=list(self.metadata_fields),
        )