Source code for haive.core.engine.retriever.providers.ParentDocumentRetrieverConfig

"""Parent Document Retriever implementation for the Haive framework.

This module provides a configuration class for the Parent Document retriever,
which retrieves small chunks for embedding similarity but returns larger parent
documents containing those chunks, providing better context while maintaining
search precision.

The ParentDocumentRetriever works by:
1. Splitting documents into small chunks for embedding and similarity search
2. Storing these chunks in a vector store with references to parent documents
3. Storing full parent documents in a separate document store
4. When querying, finding similar chunks but returning their parent documents

This retriever is particularly useful when:
- Need precise similarity search on small chunks
- Want to return full context from larger parent documents
- Building systems that balance search precision with context completeness
- Dealing with long documents that need chunk-level search

The implementation integrates with LangChain's ParentDocumentRetriever while
providing a consistent Haive configuration interface with flexible chunking options.
"""

from typing import Any

from pydantic import Field, field_validator

from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType
from haive.core.engine.vectorstore.vectorstore import VectorStoreConfig



[docs]
@BaseRetrieverConfig.register(RetrieverType.PARENT_DOCUMENT)
class ParentDocumentRetrieverConfig(BaseRetrieverConfig):
    """Configuration for Parent Document retriever in the Haive framework.

    This retriever retrieves small chunks for similarity search but returns larger
    parent documents, providing better context while maintaining search precision.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always PARENT_DOCUMENT).
        vectorstore_config (VectorStoreConfig): Vector store for storing child chunks.
        docstore_type (str): Type of document store for parent documents.
        child_chunk_size (int): Size of child chunks for embedding.
        child_chunk_overlap (int): Overlap between child chunks.
        k (int): Number of child chunks to retrieve (returns their parents).

    Examples:
        >>> from haive.core.engine.retriever import ParentDocumentRetrieverConfig
        >>> from haive.core.engine.vectorstore.providers.ChromaVectorStoreConfig import ChromaVectorStoreConfig
        >>>
        >>> # Create vector store config
        >>> vs_config = ChromaVectorStoreConfig(
        ...     name="parent_doc_store",
        ...     collection_name="child_chunks"
        ... )
        >>>
        >>> # Create parent document retriever
        >>> config = ParentDocumentRetrieverConfig(
        ...     name="parent_doc_retriever",
        ...     vectorstore_config=vs_config,
        ...     child_chunk_size=200,
        ...     child_chunk_overlap=20,
        ...     k=4
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.get_relevant_documents("machine learning algorithms")
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.PARENT_DOCUMENT, description="The type of retriever"
    )

    # Core configuration
    vectorstore_config: VectorStoreConfig = Field(
        ..., description="Vector store configuration for storing child chunks"
    )

    # Document storage configuration
    docstore_type: str = Field(
        default="in_memory",
        description="Type of document store for parent documents: 'in_memory', 'file_system'",
    )

    docstore_path: str | None = Field(
        default=None,
        description="Path for file system document store (required if docstore_type='file_system')",
    )

    # Child chunk parameters
    child_chunk_size: int = Field(
        default=200,
        ge=50,
        le=2000,
        description="Size of child chunks for embedding and similarity search",
    )

    child_chunk_overlap: int = Field(
        default=20, ge=0, le=500, description="Overlap between child chunks"
    )

    # Retrieval parameters
    k: int = Field(
        default=4,
        ge=1,
        le=100,
        description="Number of child chunks to retrieve (returns their parent documents)",
    )


[docs]
    @field_validator("docstore_type")
    @classmethod
    def validate_docstore_type(cls, v):
        """Validate document store type."""
        valid_types = ["in_memory", "file_system"]
        if v not in valid_types:
            raise TypeError(f"docstore_type must be one of {valid_types}, got {v}")
        return v



[docs]
    @field_validator("child_chunk_overlap")
    @classmethod
    def validate_child_chunk_overlap(cls, v, info):
        """Validate that child chunk overlap is less than chunk size."""
        # Note: In Pydantic v2, cross-field validation requires model_validator
        # This validator only checks individual field constraints
        if v < 0:
            raise ValueError(f"child_chunk_overlap ({v}) must be non-negative")
        return v



[docs]
    @field_validator("docstore_path")
    @classmethod
    def validate_docstore_path(cls, v, info):
        """Validate docstore path is provided when needed."""
        # Note: In Pydantic v2, cross-field validation requires model_validator
        # This validator only checks if docstore_path is provided
        return v



[docs]
    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for Parent Document retriever."""
        return {
            "query": (str, Field(description="Query for parent document retrieval")),
        }



[docs]
    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for Parent Document retriever."""
        return {
            "documents": (
                list[Any],  # List[Document] but avoiding import
                Field(
                    default_factory=list,
                    description="Parent documents of retrieved child chunks",
                ),
            ),
        }



[docs]
    def instantiate(self):
        """Create a Parent Document retriever from this configuration.

        Returns:
            ParentDocumentRetriever: Instantiated retriever ready for parent document retrieval.

        Raises:
            ImportError: If required packages are not available.
            ValueError: If configuration is invalid.
        """
        try:
            from langchain.retrievers import ParentDocumentRetriever
            from langchain.storage import InMemoryStore
            from langchain.text_splitter import RecursiveCharacterTextSplitter
        except ImportError:
            raise ImportError(
                "ParentDocumentRetriever requires langchain package. "
                "Install with: pip install langchain"
            )

        # Instantiate the vector store
        try:
            vectorstore = self.vectorstore_config.instantiate()
        except Exception as e:
            raise ValueError(f"Failed to instantiate vector store: {e}")

        # Create the document store
        if self.docstore_type == "in_memory":
            docstore = InMemoryStore()
        elif self.docstore_type == "file_system":
            try:
                from langchain.storage import LocalFileStore

                if not self.docstore_path:
                    raise ValueError(
                        "docstore_path is required for file_system docstore"
                    )
                docstore = LocalFileStore(self.docstore_path)
            except ImportError:
                raise ImportError(
                    "File system document store requires additional packages. "
                    "Install with: pip install langchain[storage]"
                )
        else:
            raise TypeError(f"Unsupported docstore_type: {self.docstore_type}")

        # Create child splitter
        child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.child_chunk_size,
            chunk_overlap=self.child_chunk_overlap,
        )

        return ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=docstore,
            child_splitter=child_splitter,
            search_kwargs={"k": self.k},
        )