Source code for haive.core.engine.retriever.providers.NeuralDBRetrieverConfig

"""NeuralDB Retriever implementation for the Haive framework.

from typing import Any
This module provides a configuration class for the NeuralDB retriever,
which uses ThirdAI's NeuralDB for fast neural search without GPUs.
NeuralDB provides efficient neural information retrieval with CPU-only
inference and training capabilities.

The NeuralDBRetriever works by:
1. Using ThirdAI's NeuralDB engine for neural search
2. Performing efficient CPU-based neural retrieval
3. Supporting fast training and inference
4. Enabling neural search without GPU requirements

This retriever is particularly useful when:
- Need neural search without GPU infrastructure
- Want fast CPU-based neural retrieval
- Building cost-effective neural search systems
- Need efficient training on CPU
- Using ThirdAI's NeuralDB platform

The implementation integrates with LangChain's NeuralDBRetriever while
providing a consistent Haive configuration interface with secure API key management.
"""

from typing import Any

from langchain_core.documents import Document
from pydantic import Field, SecretStr

from haive.core.common.mixins.secure_config import SecureConfigMixin
from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType



[docs]
@BaseRetrieverConfig.register(RetrieverType.NEURAL_DB)
class NeuralDBRetrieverConfig(SecureConfigMixin, BaseRetrieverConfig):
    """Configuration for NeuralDB retriever in the Haive framework.

    This retriever uses ThirdAI's NeuralDB to provide fast neural search
    without requiring GPU infrastructure, enabling efficient CPU-based retrieval.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always NEURAL_DB).
        api_key (Optional[SecretStr]): ThirdAI API key (auto-resolved from THIRDAI_API_KEY).
        model_path (Optional[str]): Path to the NeuralDB model file.
        k (int): Number of documents to retrieve.
        documents (List[Document]): Documents to index for retrieval.
        training_steps (int): Number of training steps for the model.

    Examples:
        >>> from haive.core.engine.retriever import NeuralDBRetrieverConfig
        >>> from langchain_core.documents import Document
        >>>
        >>> # Create documents
        >>> docs = [
        ...     Document(page_content="Machine learning enables computers to learn"),
        ...     Document(page_content="Deep learning is a subset of machine learning"),
        ...     Document(page_content="Neural networks are inspired by the brain")
        ... ]
        >>>
        >>> # Create the NeuralDB retriever config
        >>> config = NeuralDBRetrieverConfig(
        ...     name="neuraldb_retriever",
        ...     documents=docs,
        ...     k=5,
        ...     training_steps=100
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.get_relevant_documents("neural network learning")
        >>>
        >>> # Example with pre-trained model
        >>> pretrained_config = NeuralDBRetrieverConfig(
        ...     name="pretrained_neuraldb_retriever",
        ...     model_path="./my_neuraldb_model.pkl",
        ...     documents=docs,
        ...     k=3
        ... )
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.NEURAL_DB, description="The type of retriever"
    )

    # API configuration with SecureConfigMixin
    api_key: SecretStr | None = Field(
        default=None, description="ThirdAI API key (auto-resolved from THIRDAI_API_KEY)"
    )

    # Provider for SecureConfigMixin
    provider: str = Field(
        default="thirdai", description="Provider name for API key resolution"
    )

    # Model configuration
    model_path: str | None = Field(
        default=None,
        description="Path to the NeuralDB model file (if using pre-trained model)",
    )

    # Documents to index
    documents: list[Document] = Field(
        default_factory=list, description="Documents to index for NeuralDB retrieval"
    )

    # Search parameters
    k: int = Field(
        default=10, ge=1, le=100, description="Number of documents to retrieve"
    )

    # Training parameters
    training_steps: int = Field(
        default=100,
        ge=1,
        le=10000,
        description="Number of training steps for the NeuralDB model",
    )

    learning_rate: float = Field(
        default=0.001, ge=0.0001, le=0.1, description="Learning rate for training"
    )

    # NeuralDB specific parameters
    chunk_size: int = Field(
        default=1000, ge=100, le=4000, description="Size of text chunks for processing"
    )

    chunk_overlap: int = Field(
        default=100, ge=0, le=500, description="Overlap between text chunks"
    )

    # Advanced parameters
    batch_size: int = Field(
        default=32, ge=1, le=256, description="Batch size for training and inference"
    )

    max_length: int = Field(
        default=512,
        ge=128,
        le=2048,
        description="Maximum sequence length for processing",
    )


[docs]
    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for NeuralDB retriever."""
        return {
            "query": (str, Field(description="Neural search query for NeuralDB")),
        }



[docs]
    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for NeuralDB retriever."""
        return {
            "documents": (
                list[Document],
                Field(
                    default_factory=list, description="Documents from NeuralDB search"
                ),
            ),
        }



[docs]
    def instantiate(self) -> Any:
        """Create a NeuralDB retriever from this configuration.

        Returns:
            NeuralDBRetriever: Instantiated retriever ready for neural search.

        Raises:
            ImportError: If required packages are not available.
            ValueError: If API key or configuration is invalid.
        """
        try:
            from langchain_community.retrievers import NeuralDBRetriever
        except ImportError:
            raise ImportError(
                "NeuralDBRetriever requires langchain-community and thirdai packages. "
                "Install with: pip install langchain-community thirdai"
            )

        # Get API key using SecureConfigMixin (if needed)
        api_key = self.get_api_key()

        # Prepare configuration
        config = {"k": self.k}

        # Add API key if available
        if api_key:
            config["thirdai_key"] = api_key

        # Handle model path or documents
        if self.model_path:
            # Load from pre-trained model
            config["model_path"] = self.model_path
        else:
            # Train from documents
            if not self.documents:
                raise ValueError(
                    "NeuralDBRetriever requires either a model_path or documents for training."
                )

            config["documents"] = self.documents
            config["training_steps"] = self.training_steps
            config["learning_rate"] = self.learning_rate
            config["chunk_size"] = self.chunk_size
            config["chunk_overlap"] = self.chunk_overlap

        # Add advanced parameters
        config["batch_size"] = self.batch_size
        config["max_length"] = self.max_length

        return NeuralDBRetriever(**config)