"""LlamaIndex Retriever implementation for the Haive framework.
from typing import Any
This module provides a configuration class for the LlamaIndex retriever,
which integrates LlamaIndex's retrieval capabilities with LangChain.
LlamaIndex provides a data framework for LLM applications with
sophisticated indexing and retrieval mechanisms.
The LlamaIndexRetriever works by:
1. Using LlamaIndex's retrieval engines
2. Supporting various index types (vector, keyword, graph, etc.)
3. Enabling sophisticated query processing
4. Providing LlamaIndex-specific optimizations
This retriever is particularly useful when:
- Integrating LlamaIndex with LangChain workflows
- Need LlamaIndex's advanced indexing capabilities
- Want to leverage LlamaIndex's query engines
- Building complex retrieval pipelines
- Using LlamaIndex's data connectors
The implementation integrates LlamaIndex retrievers with LangChain while
providing a consistent Haive configuration interface.
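
Examples:
    A minimal usage sketch (assumes the langchain-community and llama-index
    packages are installed; the document text is illustrative)::

        from langchain_core.documents import Document

        from haive.core.engine.retriever import LlamaIndexRetrieverConfig

        config = LlamaIndexRetrieverConfig(
            name="llamaindex_retriever",
            documents=[Document(page_content="LlamaIndex indexes data for LLMs")],
        )
        retriever = config.instantiate()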
"""
from typing import Any

from langchain_core.documents import Document
from pydantic import Field

from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType
@BaseRetrieverConfig.register(RetrieverType.LLAMA_INDEX)
class LlamaIndexRetrieverConfig(BaseRetrieverConfig):
"""Configuration for LlamaIndex retriever in the Haive framework.
This retriever integrates LlamaIndex's retrieval capabilities with LangChain,
enabling the use of LlamaIndex's sophisticated indexing and query mechanisms.
Attributes:
retriever_type (RetrieverType): The type of retriever (always LLAMA_INDEX).
index_path (Optional[str]): Path to a persisted LlamaIndex index.
documents (List[Document]): Documents to index (if not loading from path).
k (int): Number of documents to retrieve.
index_type (str): Type of LlamaIndex index to create.
similarity_top_k (int): Top-k for similarity search.
Examples:
>>> from haive.core.engine.retriever import LlamaIndexRetrieverConfig
>>> from langchain_core.documents import Document
>>>
>>> # Create documents
>>> docs = [
... Document(page_content="LlamaIndex provides data framework for LLMs"),
... Document(page_content="Vector stores enable semantic search"),
... Document(page_content="Graph indexes capture relationships")
... ]
>>>
>>> # Create the LlamaIndex retriever config
>>> config = LlamaIndexRetrieverConfig(
... name="llamaindex_retriever",
... documents=docs,
... k=5,
... index_type="vector",
... similarity_top_k=10
... )
>>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> results = retriever.invoke("semantic search with vectors")
>>>
>>> # Example with graph index
>>> graph_config = LlamaIndexRetrieverConfig(
... name="llamaindex_graph_retriever",
... documents=docs,
... index_type="knowledge_graph",
... k=3
... )
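        >>>
        >>> # Load a previously persisted index instead of indexing documents
        >>> # (the path below is illustrative)
        >>> persisted_config = LlamaIndexRetrieverConfig(
        ...     name="llamaindex_persisted_retriever",
        ...     index_path="/path/to/persisted/index",
        ...     k=5
        ... )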
"""
retriever_type: RetrieverType = Field(
default=RetrieverType.LLAMA_INDEX, description="The type of retriever"
)
# Index configuration
index_path: str | None = Field(
default=None,
description="Path to a persisted LlamaIndex index (if loading existing index)",
)
documents: list[Document] = Field(
default_factory=list, description="Documents to index (if creating new index)"
)
# Search parameters
k: int = Field(
default=10, ge=1, le=100, description="Number of documents to retrieve"
)
similarity_top_k: int = Field(
default=10,
ge=1,
le=100,
description="Top-k for similarity search in LlamaIndex",
)
# LlamaIndex specific parameters
index_type: str = Field(
default="vector",
description="Type of LlamaIndex index: 'vector', 'keyword', 'knowledge_graph', 'list'",
)
chunk_size: int = Field(
default=1000, ge=100, le=4000, description="Size of text chunks for indexing"
)
chunk_overlap: int = Field(
default=100, ge=0, le=500, description="Overlap between text chunks"
)
# Query engine parameters
response_mode: str = Field(
default="compact",
description="Response mode: 'compact', 'refine', 'tree_summarize'",
)
# Advanced parameters
embed_model: str | None = Field(
default=None,
description="Embedding model for LlamaIndex (e.g., 'local:BAAI/bge-small-en-v1.5')",
)
llm_model: str | None = Field(
default=None, description="LLM model for LlamaIndex query processing"
)
def get_input_fields(self) -> dict[str, tuple[type, Any]]:
"""Return input field definitions for LlamaIndex retriever."""
return {
"query": (str, Field(description="Query for LlamaIndex retrieval")),
}
def get_output_fields(self) -> dict[str, tuple[type, Any]]:
"""Return output field definitions for LlamaIndex retriever."""
return {
"documents": (
list[Document],
Field(
default_factory=list,
description="Documents from LlamaIndex retrieval",
),
),
}
def instantiate(self) -> Any:
"""Create a LlamaIndex retriever from this configuration.
Returns:
LlamaIndexRetriever: Instantiated retriever ready for LlamaIndex-powered search.
Raises:
ImportError: If required packages are not available.
ValueError: If configuration is invalid.
"""
        try:
            from langchain_community.retrievers import LlamaIndexRetriever
        except ImportError as e:
            raise ImportError(
                "LlamaIndexRetriever requires the langchain-community and llama-index packages. "
                "Install with: pip install langchain-community llama-index"
            ) from e
        # Prepare keyword arguments for the retriever
        config: dict[str, Any] = {"k": self.k}
# Handle index loading or creation
if self.index_path:
# Load from existing index
config["index_path"] = self.index_path
else:
# Create new index from documents
if not self.documents:
raise ValueError(
"LlamaIndexRetriever requires either index_path or documents."
)
config["documents"] = self.documents
config["index_type"] = self.index_type
config["chunk_size"] = self.chunk_size
config["chunk_overlap"] = self.chunk_overlap
# Add search parameters
config["similarity_top_k"] = self.similarity_top_k
config["response_mode"] = self.response_mode
# Add model configurations
if self.embed_model:
config["embed_model"] = self.embed_model
if self.llm_model:
config["llm_model"] = self.llm_model
return LlamaIndexRetriever(**config)
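
if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative, not part of the framework API):
    # builds a vector index over a few in-memory documents and runs one query.
    # Requires langchain-community and llama-index to be installed; the
    # document contents and the query below are placeholder examples.
    sample_docs = [
        Document(page_content="LlamaIndex provides a data framework for LLMs"),
        Document(page_content="Vector stores enable semantic search"),
    ]
    demo_config = LlamaIndexRetrieverConfig(
        name="llamaindex_demo",
        documents=sample_docs,
        k=2,
        index_type="vector",
    )
    demo_retriever = demo_config.instantiate()
    print(demo_retriever.invoke("semantic search with vectors"))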