"""HuggingFace embedding configuration."""

import logging
import os
from typing import Any

from pydantic import Field, ValidationInfo, field_validator

from haive.core.engine.embedding.base import BaseEmbeddingConfig
from haive.core.engine.embedding.types import EmbeddingType

logger = logging.getLogger(__name__)


@BaseEmbeddingConfig.register(EmbeddingType.HUGGINGFACE)
class HuggingFaceEmbeddingConfig(BaseEmbeddingConfig):
    """Configuration for HuggingFace embeddings.

    This configuration provides access to HuggingFace embedding models,
    including sentence-transformers and other transformer-based embedding
    models.

    Examples:
        Basic usage:

        .. code-block:: python

            config = HuggingFaceEmbeddingConfig(
                name="hf_embeddings",
                model="sentence-transformers/all-MiniLM-L6-v2"
            )
            embeddings = config.instantiate()

        With GPU support::

            config = HuggingFaceEmbeddingConfig(
                name="hf_embeddings",
                model="sentence-transformers/all-mpnet-base-v2",
                model_kwargs={"device": "cuda"},
                encode_kwargs={"normalize_embeddings": True}
            )

        With caching::

            config = HuggingFaceEmbeddingConfig(
                name="hf_embeddings",
                model="sentence-transformers/all-MiniLM-L6-v2",
                use_cache=True,
                cache_folder="./embedding_cache"
            )

    Attributes:
        embedding_type: Always EmbeddingType.HUGGINGFACE.
        model: HuggingFace model name or path.
        model_kwargs: Additional arguments for model initialization.
        encode_kwargs: Additional arguments for encoding.
        use_cache: Whether to cache computed embeddings.
        cache_folder: Directory for caching embeddings.
    """

    embedding_type: EmbeddingType = Field(
        default=EmbeddingType.HUGGINGFACE,
        description="The embedding provider type",
    )

    # HuggingFace-specific fields
    model_kwargs: dict[str, Any] = Field(
        default_factory=lambda: {"device": "cpu"},
        description="Additional arguments for model initialization",
    )
    encode_kwargs: dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments for encoding",
    )
    multi_process: bool = Field(
        default=False,
        description="Whether to use multi-processing for encoding",
    )
    show_progress: bool = Field(
        default=False,
        description="Whether to show a progress bar during encoding",
    )
    use_cache: bool = Field(
        default=True,
        description="Whether to cache computed embeddings",
    )
    cache_folder: str | None = Field(
        default=None,
        description="Directory for caching embeddings",
    )

    # Trust remote code (for custom models)
    trust_remote_code: bool = Field(
        default=False,
        description="Whether to trust remote code for custom models",
    )

    # SecureConfigMixin configuration
    provider: str = Field(
        default="huggingface",
        description="Provider name for API key resolution",
    )
[docs] @field_validator("model") @classmethod def validate_model(cls, v) -> Any: """Validate the HuggingFace model name.""" if not v or not v.strip(): raise ValueError("Model name is required and cannot be empty") return v.strip()
[docs] @field_validator("model_kwargs") @classmethod def validate_model_kwargs(cls, v) -> Any: """Validate and set default model kwargs.""" if not v: v = {} # Auto-detect device if not specified if "device" not in v: try: import torch v["device"] = "cuda" if torch.cuda.is_available() else "cpu" except ImportError: v["device"] = "cpu" return v
[docs] @field_validator("cache_folder") @classmethod def validate_cache_folder(cls, v, values) -> Any: """Set default cache folder if not specified.""" if v is None and values.get("use_cache", True): # Use a default cache folder v = os.path.expanduser("~/.cache/haive/embeddings") return v
    def instantiate(self) -> Any:
        """Create a HuggingFace embeddings instance.

        Returns:
            HuggingFaceEmbeddings instance configured with the provided
            parameters.

        Raises:
            ImportError: If required packages are not installed.
            ValueError: If the configuration is invalid.
        """
        try:
            from langchain_huggingface import HuggingFaceEmbeddings
        except ImportError as e:
            raise ImportError(
                "HuggingFace embeddings require the langchain-huggingface package. "
                "Install with: pip install langchain-huggingface"
            ) from e

        # Validate configuration
        self.validate_configuration()

        # trust_remote_code is consumed by the underlying SentenceTransformer,
        # so it must travel inside model_kwargs; HuggingFaceEmbeddings rejects
        # it as a top-level argument.
        model_kwargs = dict(self.model_kwargs)
        if self.trust_remote_code:
            model_kwargs["trust_remote_code"] = True

        # Build kwargs
        kwargs = {
            "model_name": self.model,
            "model_kwargs": model_kwargs,
            "encode_kwargs": self.encode_kwargs,
            "multi_process": self.multi_process,
            "show_progress": self.show_progress,
        }

        # Add cache folder if specified
        if self.cache_folder:
            kwargs["cache_folder"] = self.cache_folder

        # Create base embeddings
        embeddings = HuggingFaceEmbeddings(**kwargs)

        # Add caching if enabled
        if self.use_cache and self.cache_folder:
            try:
                from langchain.embeddings import CacheBackedEmbeddings
                from langchain.storage import LocalFileStore

                # Cache document embeddings on disk; query_embedding_cache=True
                # reuses the same store for query embeddings.
                store = LocalFileStore(self.cache_folder)
                embeddings = CacheBackedEmbeddings.from_bytes_store(
                    embeddings,
                    document_embedding_cache=store,
                    query_embedding_cache=True,
                    namespace=self.model.replace("/", "_"),
                )
            except ImportError:
                # If caching dependencies are not available, continue without
                # caching.
                logger.warning("Caching not available - continuing without cache")

        return embeddings
    def validate_configuration(self) -> None:
        """Validate the configuration before instantiation."""
        super().validate_configuration()

        if not self.model:
            raise ValueError("Model name is required")

        # Validate cache folder if caching is enabled
        if self.use_cache and self.cache_folder:
            try:
                os.makedirs(self.cache_folder, exist_ok=True)
            except OSError as e:
                raise ValueError(
                    f"Cannot create cache folder {self.cache_folder}: {e}"
                ) from e
    def get_default_model(self) -> str:
        """Get the default model for HuggingFace embeddings."""
        return "sentence-transformers/all-MiniLM-L6-v2"
    def get_supported_models(self) -> list[str]:
        """Get a list of popular HuggingFace embedding models."""
        return [
            "sentence-transformers/all-MiniLM-L6-v2",
            "sentence-transformers/all-mpnet-base-v2",
            "sentence-transformers/all-roberta-large-v1",
            "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
            "sentence-transformers/multi-qa-mpnet-base-cos-v1",
            "sentence-transformers/paraphrase-MiniLM-L6-v2",
            "sentence-transformers/paraphrase-mpnet-base-v2",
            "sentence-transformers/msmarco-distilbert-base-v4",
            "BAAI/bge-large-en-v1.5",
            "BAAI/bge-base-en-v1.5",
            "BAAI/bge-small-en-v1.5",
            "intfloat/e5-large-v2",
            "intfloat/e5-base-v2",
            "intfloat/e5-small-v2",
            "thenlper/gte-large",
            "thenlper/gte-base",
        ]
    def get_model_info(self) -> dict:
        """Get information about the configured model."""
        # Common model information
        model_info = {
            "sentence-transformers/all-MiniLM-L6-v2": {
                "dimensions": 384,
                "max_seq_length": 256,
                "description": "Fast, lightweight model with good performance",
            },
            "sentence-transformers/all-mpnet-base-v2": {
                "dimensions": 768,
                "max_seq_length": 384,
                "description": "High-quality all-round model",
            },
            "sentence-transformers/all-roberta-large-v1": {
                "dimensions": 1024,
                "max_seq_length": 512,
                "description": "Large model with high performance",
            },
            "BAAI/bge-large-en-v1.5": {
                "dimensions": 1024,
                "max_seq_length": 512,
                "description": "BAAI's large English embedding model",
            },
            "BAAI/bge-base-en-v1.5": {
                "dimensions": 768,
                "max_seq_length": 512,
                "description": "BAAI's base English embedding model",
            },
            "BAAI/bge-small-en-v1.5": {
                "dimensions": 384,
                "max_seq_length": 512,
                "description": "BAAI's small English embedding model",
            },
            "intfloat/e5-large-v2": {
                "dimensions": 1024,
                "max_seq_length": 512,
                "description": "E5 large model with strong performance",
            },
            "intfloat/e5-base-v2": {
                "dimensions": 768,
                "max_seq_length": 512,
                "description": "E5 base model with good balance",
            },
            "thenlper/gte-large": {
                "dimensions": 1024,
                "max_seq_length": 512,
                "description": "GTE large model with high performance",
            },
        }

        return model_info.get(
            self.model,
            {
                "dimensions": "unknown",
                "max_seq_length": "unknown",
                "description": "HuggingFace embedding model",
            },
        )
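

# A minimal usage sketch (not part of the original module). It assumes the
# optional dependencies langchain-huggingface and sentence-transformers are
# installed and that the model weights can be downloaded. The ``name`` field
# is assumed to come from BaseEmbeddingConfig, as in the docstring examples.
if __name__ == "__main__":
    config = HuggingFaceEmbeddingConfig(
        name="hf_embeddings",
        model="sentence-transformers/all-MiniLM-L6-v2",
        encode_kwargs={"normalize_embeddings": True},
        use_cache=False,  # skip the disk cache for this quick smoke test
    )
    embeddings = config.instantiate()

    # Embed a query and a couple of documents via the standard LangChain
    # Embeddings interface; all-MiniLM-L6-v2 returns 384-dimensional vectors
    # (see get_model_info above).
    query_vector = embeddings.embed_query("What is semantic search?")
    doc_vectors = embeddings.embed_documents(
        [
            "Semantic search ranks documents by meaning.",
            "Embeddings are dense vector representations of text.",
        ]
    )
    print(len(query_vector), len(doc_vectors), config.get_model_info())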