Source code for haive.core.engine.document.path_analysis

"""Path Analysis System for Document Loader Engine.

This module provides a path analysis system for the document loader engine,
which analyzes paths and URLs to determine their nature and properties.
"""

import contextlib
import logging
import mimetypes
import os
import re
from enum import Enum
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

from pydantic import BaseModel, Field, computed_field

logger = logging.getLogger(__name__)



[docs]
class PathType(str, Enum):
    """Primary path type classification.

    Each analyzer in this module stamps exactly one of these values on the
    ``PathAnalysisResult`` it returns. Members are ``str`` subclasses, so they
    compare equal to their plain string values.
    """

    # Assigned by analyze_local_path() based on filesystem checks.
    LOCAL_FILE = "local_file"
    LOCAL_DIRECTORY = "local_directory"
    LOCAL_SYMLINK = "local_symlink"
    LOCAL_NONEXISTENT = "local_nonexistent"
    # Assigned by analyze_url() from the URL scheme.
    URL_HTTP = "url_http"
    URL_HTTPS = "url_https"
    URL_FTP = "url_ftp"  # also used for sftp:// URLs
    URL_FILE = "url_file"
    # Assigned by analyze_database_uri() / analyze_cloud_path().
    DATABASE_URI = "database_uri"
    CLOUD_STORAGE = "cloud_storage"
    # Assigned by analyze_network_path() (UNC paths) / analyze_special_path().
    NETWORK_SHARE = "network_share"
    SPECIAL_PATH = "special_path"
    # Model default; also returned for empty input.
    UNKNOWN = "unknown"




[docs]
class FileCategory(str, Enum):
    """High-level file category.

    Categories are looked up from a lower-cased file extension through
    ``FILE_CATEGORY_MAP``; the analyzers fall back to ``UNKNOWN_FILE`` when an
    extension is present but not listed in that map.
    """

    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    CODE = "code"
    DATA = "data"
    ARCHIVE = "archive"
    EXECUTABLE = "executable"
    TEXT = "text"
    FONT = "font"
    MODEL = "model"
    SYSTEM = "system"
    # Fallback for extensions missing from FILE_CATEGORY_MAP.
    UNKNOWN_FILE = "unknown_file"




[docs]
class DatabaseType(str, Enum):
    """Database type classification.

    Values are selected by matching a URI against ``DATABASE_URI_PATTERNS``.
    Several schemes share one member there (``postgres://`` ->
    ``POSTGRESQL``, ``mariadb://`` -> ``MYSQL``, ``sqlserver://`` -> ``MSSQL``).
    """

    POSTGRESQL = "postgresql"
    MYSQL = "mysql"
    SQLITE = "sqlite"
    MONGODB = "mongodb"
    REDIS = "redis"
    ORACLE = "oracle"
    MSSQL = "mssql"
    CASSANDRA = "cassandra"
    ELASTICSEARCH = "elasticsearch"
    CLICKHOUSE = "clickhouse"
    SNOWFLAKE = "snowflake"
    BIGQUERY = "bigquery"
    DYNAMODB = "dynamodb"
    COUCHDB = "couchdb"
    INFLUXDB = "influxdb"
    # Used by analyze_database_uri() when no known scheme pattern matches.
    UNKNOWN_DB = "unknown_db"




[docs]
class CloudProvider(str, Enum):
    """Cloud storage provider classification.

    Values are selected by matching a path's scheme against
    ``CLOUD_STORAGE_PATTERNS`` (for example ``s3://``, ``s3a://`` and
    ``s3n://`` all map to ``AWS_S3``).
    """

    AWS_S3 = "aws_s3"
    AZURE_BLOB = "azure_blob"
    GOOGLE_CLOUD = "google_cloud"
    DROPBOX = "dropbox"
    BOX = "box"
    ONEDRIVE = "onedrive"
    ICLOUD = "icloud"
    BACKBLAZE = "backblaze"
    # Used by analyze_cloud_path() when no known scheme pattern matches.
    UNKNOWN_CLOUD = "unknown_cloud"




[docs]
class URLComponents(BaseModel):
    """Components of a URL.

    Mirrors the attributes of ``urllib.parse.urlparse`` results as filled in by
    ``extract_url_components``. Every field has a default, so an empty
    ``URLComponents()`` is what callers receive when parsing fails or the
    input is empty.
    """

    scheme: str = ""
    netloc: str = ""
    # extract_url_components() stores "" here when the parser reports no host.
    hostname: str = ""
    port: int | None = None
    path: str = ""
    params: str = ""
    query: str = ""
    fragment: str = ""
    # Credentials embedded in the URL, when present.
    username: str | None = None
    password: str | None = None




[docs]
class DomainInfo(BaseModel):
    """Information about a domain.

    Produced by ``extract_domain_info`` from a hostname using a simple
    dot-split; no public-suffix list is consulted.
    """

    # Last two hostname labels ("example.com"), or the whole hostname for
    # IPs, localhost and single-label hosts.
    domain: str = ""
    # Last hostname label; "" for IPs, localhost and single-label hosts.
    tld: str = ""
    # Labels preceding the last two, joined with "."; "" when there are none.
    subdomain: str = ""
    # True when the hostname matches a dotted-quad (IPv4-style) pattern.
    is_ip: bool = False
    # True for "localhost", "127.0.0.1" and "::1".
    is_localhost: bool = False




[docs]
class PathAnalysisResult(BaseModel):
    """Result of path analysis.

    This model contains comprehensive information about a path, including its type,
    properties, and metadata. Only ``original_path`` is required; every other
    field defaults to "unknown / not applicable" and is filled in by whichever
    ``analyze_*`` function handled the input.
    """

    # Original input
    original_path: str = Field(
        ..., description="Original path string that was analyzed"
    )

    # Primary classification
    path_type: PathType = Field(
        default=PathType.UNKNOWN, description="Primary path type classification"
    )

    # Basic properties
    is_local: bool = Field(
        default=False, description="Whether the path is on the local filesystem"
    )
    is_remote: bool = Field(
        default=False, description="Whether the path is remote (URL, cloud, etc.)"
    )
    is_file: bool = Field(default=False, description="Whether the path is a file")
    is_directory: bool = Field(
        default=False, description="Whether the path is a directory"
    )
    is_symlink: bool = Field(
        default=False, description="Whether the path is a symbolic link"
    )
    exists: bool = Field(default=False, description="Whether the path exists")

    # Path components
    normalized_path: str | None = Field(
        default=None, description="Normalized version of the path"
    )
    parent_path: str | None = Field(
        default=None, description="Parent directory of the path"
    )
    file_name: str | None = Field(default=None, description="File name (if applicable)")
    file_extension: str | None = Field(
        default=None, description="File extension (if applicable)"
    )

    # File properties
    file_size: int | None = Field(
        default=None, description="File size in bytes (if available)"
    )
    mime_type: str | None = Field(default=None, description="MIME type (if available)")
    encoding: str | None = Field(
        default=None, description="File encoding (if available)"
    )

    # Categorization
    file_category: FileCategory | None = Field(
        default=None, description="High-level file category"
    )

    # URL-specific properties
    url_components: URLComponents | None = Field(
        default=None, description="Components of the URL (if applicable)"
    )
    domain_info: DomainInfo | None = Field(
        default=None, description="Domain information (if applicable)"
    )

    # Database-specific properties
    database_type: DatabaseType | None = Field(
        default=None, description="Database type (if applicable)"
    )
    database_name: str | None = Field(
        default=None, description="Database name (if applicable)"
    )

    # Cloud storage properties
    cloud_provider: CloudProvider | None = Field(
        default=None, description="Cloud storage provider (if applicable)"
    )
    bucket_name: str | None = Field(
        default=None, description="Bucket name (if applicable)"
    )
    object_key: str | None = Field(
        default=None, description="Object key (if applicable)"
    )

    # Network properties
    is_secure: bool | None = Field(
        default=None, description="Whether the connection is secure (https, sftp, etc.)"
    )

    @computed_field
    def source_summary(self) -> str:
        """Generate a one-line, human-readable summary of the source.

        Returns:
            A string naming the kind of source followed by the original path.
            Path types without a dedicated branch (symlinks, FTP, network
            shares, ...) are reported as "Unknown source".
        """
        if self.path_type in (PathType.LOCAL_FILE, PathType.LOCAL_DIRECTORY):
            kind = "directory" if self.is_directory else "file"
            return f"Local {kind}: {self.original_path}"
        if self.path_type in (PathType.URL_HTTP, PathType.URL_HTTPS):
            domain = self.domain_info.domain if self.domain_info else "unknown"
            return f"Web URL ({domain}): {self.original_path}"
        # Enum members are rendered through ``.value`` explicitly: formatting a
        # ``str``-mixin Enum in an f-string yields the value on Python <= 3.10
        # but "ClassName.MEMBER" on Python 3.11+, which made the summary text
        # depend on the interpreter version. ``getattr`` keeps None (and plain
        # strings) rendering as they did before.
        if self.path_type == PathType.DATABASE_URI:
            db_label = getattr(self.database_type, "value", self.database_type)
            return f"Database ({db_label}): {self.original_path}"
        if self.path_type == PathType.CLOUD_STORAGE:
            cloud_label = getattr(self.cloud_provider, "value", self.cloud_provider)
            return f"Cloud storage ({cloud_label}): {self.original_path}"
        return f"Unknown source: {self.original_path}"



# File extension to MIME type mappings (in addition to standard ones).
# Keys are lower-case extensions including the leading dot. detect_mime_type()
# registers every entry with the stdlib ``mimetypes`` registry before guessing.
EXTRA_MIME_TYPES: dict[str, str] = {
    ".md": "text/markdown",
    ".ipynb": "application/x-ipynb+json",
    ".yaml": "application/x-yaml",
    ".yml": "application/x-yaml",
    ".toml": "application/toml",
    ".rst": "text/x-rst",
    ".epub": "application/epub+zip",
}

# File extension to category mappings.
# Keys are lower-case extensions including the leading dot; the analyzers
# lower-case the extension before the lookup and fall back to
# FileCategory.UNKNOWN_FILE when it is not listed here.
FILE_CATEGORY_MAP: dict[str, FileCategory] = {
    # Documents
    ".pdf": FileCategory.DOCUMENT,
    ".doc": FileCategory.DOCUMENT,
    ".docx": FileCategory.DOCUMENT,
    ".odt": FileCategory.DOCUMENT,
    ".rtf": FileCategory.DOCUMENT,
    ".epub": FileCategory.DOCUMENT,
    ".md": FileCategory.DOCUMENT,
    ".markdown": FileCategory.DOCUMENT,
    ".rst": FileCategory.DOCUMENT,
    ".txt": FileCategory.TEXT,
    ".html": FileCategory.TEXT,
    ".htm": FileCategory.TEXT,
    # Data formats
    ".csv": FileCategory.DATA,
    ".json": FileCategory.DATA,
    ".xml": FileCategory.DATA,
    ".yaml": FileCategory.DATA,
    ".yml": FileCategory.DATA,
    ".toml": FileCategory.DATA,
    ".ini": FileCategory.DATA,
    ".conf": FileCategory.DATA,
    ".sqlite": FileCategory.DATA,
    ".db": FileCategory.DATA,
    ".xlsx": FileCategory.DATA,
    ".xls": FileCategory.DATA,
    ".ods": FileCategory.DATA,
    # Code
    ".py": FileCategory.CODE,
    ".js": FileCategory.CODE,
    ".ts": FileCategory.CODE,
    ".java": FileCategory.CODE,
    ".c": FileCategory.CODE,
    ".cpp": FileCategory.CODE,
    ".cs": FileCategory.CODE,
    ".go": FileCategory.CODE,
    ".rs": FileCategory.CODE,
    ".rb": FileCategory.CODE,
    ".php": FileCategory.CODE,
    ".ipynb": FileCategory.CODE,
    # Images
    ".jpg": FileCategory.IMAGE,
    ".jpeg": FileCategory.IMAGE,
    ".png": FileCategory.IMAGE,
    ".gif": FileCategory.IMAGE,
    ".bmp": FileCategory.IMAGE,
    ".tiff": FileCategory.IMAGE,
    ".webp": FileCategory.IMAGE,
    ".svg": FileCategory.IMAGE,
    ".ico": FileCategory.IMAGE,
    # Video
    ".mp4": FileCategory.VIDEO,
    ".avi": FileCategory.VIDEO,
    ".mov": FileCategory.VIDEO,
    ".mkv": FileCategory.VIDEO,
    ".webm": FileCategory.VIDEO,
    ".wmv": FileCategory.VIDEO,
    ".flv": FileCategory.VIDEO,
    # Audio
    ".mp3": FileCategory.AUDIO,
    ".wav": FileCategory.AUDIO,
    ".ogg": FileCategory.AUDIO,
    ".flac": FileCategory.AUDIO,
    ".aac": FileCategory.AUDIO,
    ".m4a": FileCategory.AUDIO,
    # Archives
    ".zip": FileCategory.ARCHIVE,
    ".tar": FileCategory.ARCHIVE,
    ".gz": FileCategory.ARCHIVE,
    ".7z": FileCategory.ARCHIVE,
    ".rar": FileCategory.ARCHIVE,
    ".bz2": FileCategory.ARCHIVE,
    ".xz": FileCategory.ARCHIVE,
    # Executables
    ".exe": FileCategory.EXECUTABLE,
    ".app": FileCategory.EXECUTABLE,
    ".bin": FileCategory.EXECUTABLE,
    ".sh": FileCategory.EXECUTABLE,
    ".bat": FileCategory.EXECUTABLE,
    ".dll": FileCategory.EXECUTABLE,
    ".so": FileCategory.EXECUTABLE,
    # Fonts
    ".ttf": FileCategory.FONT,
    ".otf": FileCategory.FONT,
    ".woff": FileCategory.FONT,
    ".woff2": FileCategory.FONT,
    ".eot": FileCategory.FONT,
    # Models
    ".onnx": FileCategory.MODEL,
    ".pkl": FileCategory.MODEL,
    ".h5": FileCategory.MODEL,
    ".pth": FileCategory.MODEL,
    ".pt": FileCategory.MODEL,
    ".pb": FileCategory.MODEL,
    ".tflite": FileCategory.MODEL,
    # System files
    ".sys": FileCategory.SYSTEM,
    ".log": FileCategory.SYSTEM,
    ".tmp": FileCategory.SYSTEM,
    ".bak": FileCategory.SYSTEM,
    ".cache": FileCategory.SYSTEM,
    ".config": FileCategory.SYSTEM,
}

# Database URI patterns.
# Keys are regular expressions anchored at the start of the string; they are
# applied with re.match() and no flags, so scheme matching is case-sensitive.
# Iteration order is the match order: the first pattern that matches wins.
DATABASE_URI_PATTERNS: dict[str, DatabaseType] = {
    r"^postgresql://": DatabaseType.POSTGRESQL,
    r"^postgres://": DatabaseType.POSTGRESQL,
    r"^mysql://": DatabaseType.MYSQL,
    r"^mariadb://": DatabaseType.MYSQL,
    r"^sqlite://": DatabaseType.SQLITE,
    r"^mongodb://": DatabaseType.MONGODB,
    r"^mongodb\+srv://": DatabaseType.MONGODB,
    r"^redis://": DatabaseType.REDIS,
    r"^oracle://": DatabaseType.ORACLE,
    r"^mssql://": DatabaseType.MSSQL,
    r"^sqlserver://": DatabaseType.MSSQL,
    r"^cassandra://": DatabaseType.CASSANDRA,
    r"^elasticsearch://": DatabaseType.ELASTICSEARCH,
    r"^clickhouse://": DatabaseType.CLICKHOUSE,
    r"^snowflake://": DatabaseType.SNOWFLAKE,
    r"^bigquery://": DatabaseType.BIGQUERY,
    r"^dynamodb://": DatabaseType.DYNAMODB,
    r"^couchdb://": DatabaseType.COUCHDB,
    r"^influxdb://": DatabaseType.INFLUXDB,
}

# Cloud storage patterns.
# Keys are regular expressions anchored at the start of the string; they are
# applied with re.match() and no flags, so scheme matching is case-sensitive.
# Several schemes map to the same provider (e.g. s3/s3a/s3n -> AWS_S3).
CLOUD_STORAGE_PATTERNS: dict[str, CloudProvider] = {
    r"^s3://": CloudProvider.AWS_S3,
    r"^s3a://": CloudProvider.AWS_S3,
    r"^s3n://": CloudProvider.AWS_S3,
    r"^azure://": CloudProvider.AZURE_BLOB,
    r"^wasb://": CloudProvider.AZURE_BLOB,
    r"^wasbs://": CloudProvider.AZURE_BLOB,
    r"^adl://": CloudProvider.AZURE_BLOB,
    r"^gs://": CloudProvider.GOOGLE_CLOUD,
    r"^gcs://": CloudProvider.GOOGLE_CLOUD,
    r"^dropbox://": CloudProvider.DROPBOX,
    r"^box://": CloudProvider.BOX,
    r"^onedrive://": CloudProvider.ONEDRIVE,
    r"^icloud://": CloudProvider.ICLOUD,
    r"^b2://": CloudProvider.BACKBLAZE,
}



[docs]
def detect_mime_type(file_path: str) -> str | None:
    """Guess the MIME type of a file.

    The extension-based guess from ``mimetypes`` (extended with
    ``EXTRA_MIME_TYPES``) is tried first. Only when that yields nothing and the
    path exists is content sniffing through the optional ``magic`` module
    attempted; any failure there is ignored.

    Args:
        file_path: Path to the file

    Returns:
        MIME type string, or None if unable to determine
    """
    # Make the project-specific extensions known to the stdlib registry.
    for extension in EXTRA_MIME_TYPES:
        mimetypes.add_type(EXTRA_MIME_TYPES[extension], extension)

    guessed = mimetypes.guess_type(file_path)[0]

    # An extension hit, or nothing on disk to inspect, ends the search here.
    if guessed is not None or not os.path.exists(file_path):
        return guessed

    try:
        import magic

        guessed = magic.from_file(file_path, mime=True)
    except Exception:
        # python-magic is missing or could not read the file; keep None.
        pass

    return guessed




[docs]
def is_binary_file(file_path: str) -> bool:
    """Check if a file is binary.

    The decision is made in three steps, stopping at the first conclusive one:

    1. a short list of well-known binary extensions,
    2. the MIME type reported by ``detect_mime_type``,
    3. a sniff of the first 4 KiB (null bytes or invalid UTF-8 mean binary).

    Text-based ``application/*`` types (YAML, TOML, JavaScript, shell scripts,
    and anything with a ``+json`` / ``+xml`` structured-syntax suffix) are not
    treated as binary by step 2; they fall through to the content sniff, so
    their encoding can still be detected by callers.

    Args:
        file_path: Path to the file

    Returns:
        True if the file is binary, False otherwise. Paths that do not point
        to an existing regular file, and files that cannot be read, yield
        False.
    """
    if not os.path.isfile(file_path):
        return False

    # Check file extension first
    known_binary_extensions = {
        ".pdf",
        ".docx",
        ".xlsx",
        ".pptx",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".zip",
        ".tar",
        ".gz",
        ".mp3",
        ".mp4",
        ".exe",
    }
    _, ext = os.path.splitext(file_path)
    if ext.lower() in known_binary_extensions:
        return True

    # Check MIME type: anything that is not recognisably textual is binary.
    text_application_types = {
        "application/x-yaml",
        "application/yaml",
        "application/toml",
        "application/javascript",
        "application/x-javascript",
        "application/x-sh",
        "application/x-csh",
        "application/sql",
    }
    mime_type = detect_mime_type(file_path)
    if mime_type:
        looks_textual = (
            mime_type.startswith(("text/", "application/json", "application/xml"))
            or mime_type.endswith(("+json", "+xml"))
            or mime_type in text_application_types
        )
        if not looks_textual:
            return True

    # Check content (first few KB)
    try:
        with open(file_path, "rb") as f:
            chunk = f.read(4096)
    except OSError:
        # If we can't read the file, assume it's not binary
        return False

    if b"\x00" in chunk:  # Null bytes indicate binary
        return True

    try:
        chunk.decode("utf-8")
    except UnicodeDecodeError:
        return True
    return False




[docs]
def detect_encoding(file_path: str) -> str | None:
    """Detect the encoding of a text file.

    Args:
        file_path: Path to the file

    Returns:
        Encoding name, or None if unable to determine
    """
    if not os.path.exists(file_path) or not os.path.isfile(file_path):
        return None

    if is_binary_file(file_path):
        return None

    try:
        import chardet

        with open(file_path, "rb") as f:
            result = chardet.detect(f.read(4096))
        return result["encoding"] if result["confidence"] > 0.7 else None
    except (ImportError, Exception):
        # chardet not available or error
        pass

    # Default to UTF-8
    return "utf-8"




[docs]
def extract_url_components(url: str) -> URLComponents:
    """Split a URL into its individual parts.

    Args:
        url: URL string

    Returns:
        URLComponents populated from ``urlparse``; an empty URLComponents when
        ``url`` is empty or anything goes wrong while parsing (for example an
        invalid port).
    """
    if not url:
        return URLComponents()

    try:
        pieces = urlparse(url)
        fields = {
            "scheme": pieces.scheme,
            "netloc": pieces.netloc,
            # The parser reports a missing host as None; the model wants "".
            "hostname": pieces.hostname or "",
            "port": pieces.port,
            "path": pieces.path,
            "params": pieces.params,
            "query": pieces.query,
            "fragment": pieces.fragment,
            "username": pieces.username,
            "password": pieces.password,
        }
        return URLComponents(**fields)
    except Exception:
        return URLComponents()




[docs]
def extract_domain_info(url_components: URLComponents) -> DomainInfo:
    """Derive domain details from the hostname of parsed URL components.

    The hostname is split on dots: the last label is the TLD, the last two
    labels form the domain, and everything before them is the subdomain.
    Dotted-quad addresses, localhost names and single-label hosts are kept
    whole as the domain with empty TLD and subdomain.

    Args:
        url_components: URLComponents object

    Returns:
        DomainInfo object
    """
    host = url_components.hostname or ""

    looks_like_ipv4 = bool(re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", host))
    points_to_localhost = host in ("localhost", "127.0.0.1", "::1")

    labels = host.split(".")
    if looks_like_ipv4 or points_to_localhost or len(labels) == 1:
        # Nothing to split: the host itself is the "domain".
        domain, tld, subdomain = host, "", ""
    else:
        # Covers both "example.com" (no subdomain) and deeper names.
        tld = labels[-1]
        domain = f"{labels[-2]}.{labels[-1]}"
        subdomain = ".".join(labels[:-2])

    return DomainInfo(
        domain=domain,
        tld=tld,
        subdomain=subdomain,
        is_ip=looks_like_ipv4,
        is_localhost=points_to_localhost,
    )



def extract_database_info(uri: str, db_type: DatabaseType) -> dict[str, Any]:
    """Collect connection details from a database URI.

    Args:
        uri: Database URI
        db_type: Database type

    Returns:
        Dictionary that always has ``database_type`` and ``connection_uri``
        and, when present in the URI, ``database_name``, ``host``, ``port``
        and ``username``
    """
    parsed = extract_url_components(uri)
    details = {"database_type": db_type, "connection_uri": uri}

    # Stripping the leading slashes turns "", "/" and "//" alike into "",
    # so one emptiness check covers every "no database name" case.
    name = parsed.path.lstrip("/")
    if name:
        details["database_name"] = name

    optional_entries = (
        ("host", parsed.hostname),
        ("port", parsed.port),
        ("username", parsed.username),
    )
    for key, value in optional_entries:
        if value:
            details[key] = value

    return details


def extract_cloud_storage_info(uri: str, provider: CloudProvider) -> dict[str, Any]:
    """Extract cloud storage information from a URI.

    Args:
        uri: Cloud storage URI
        provider: Cloud provider

    Returns:
        Dictionary with cloud storage information
    """
    info = {"cloud_provider": provider, "uri": uri}

    # Parse URI to extract bucket and key
    parts = uri.split("://", 1)[1].split("/", 1)

    if parts:
        info["bucket_name"] = parts[0]
        if len(parts) > 1:
            info["object_key"] = parts[1]

    return info



[docs]
def analyze_local_path(path: str) -> PathAnalysisResult:
    """Analyze a local filesystem path.

    The path is classified as symlink, regular file, directory or
    non-existent. File metadata (name, extension, category, size, MIME type
    and encoding) is collected for regular files only; symlinks just record
    whether their target is a file or a directory.

    Args:
        path: Path to analyze

    Returns:
        PathAnalysisResult object
    """
    # Basic result with common properties
    result = PathAnalysisResult(
        original_path=path,
        is_local=True,
        normalized_path=os.path.normpath(path),
        parent_path=os.path.dirname(path),
    )

    # os.path.exists() follows symlinks, so a dangling link lands here too.
    if not os.path.exists(path):
        result.path_type = PathType.LOCAL_NONEXISTENT
        return result

    result.exists = True

    # Check if it's a symlink
    if os.path.islink(path):
        result.path_type = PathType.LOCAL_SYMLINK
        result.is_symlink = True

        # Determine if the target is a file or directory
        target_path = os.path.realpath(path)
        if os.path.isfile(target_path):
            result.is_file = True
        elif os.path.isdir(target_path):
            result.is_directory = True

    # Check if it's a file
    elif os.path.isfile(path):
        result.path_type = PathType.LOCAL_FILE
        result.is_file = True
        result.file_name = os.path.basename(path)

        # Get file extension and the category derived from it
        _, ext = os.path.splitext(path)
        if ext:
            result.file_extension = ext.lower()
            result.file_category = FILE_CATEGORY_MAP.get(
                ext.lower(), FileCategory.UNKNOWN_FILE
            )

        # Get file size; the file may vanish or be unreadable in the meantime
        with contextlib.suppress(OSError):
            result.file_size = os.path.getsize(path)

        # Get MIME type
        result.mime_type = detect_mime_type(path)

        # detect_encoding() already returns None for binary files, so calling
        # is_binary_file() here first only repeated the same MIME lookup and
        # file read a second time.
        result.encoding = detect_encoding(path)

    # Check if it's a directory
    elif os.path.isdir(path):
        result.path_type = PathType.LOCAL_DIRECTORY
        result.is_directory = True
        result.file_name = os.path.basename(path)

    return result




[docs]
def analyze_url(url: str) -> PathAnalysisResult:
    """Analyze a URL.

    Handles ``http``, ``https``, ``ftp``/``sftp`` and ``file`` schemes. For
    ``file`` URLs the path is percent-decoded and inspected on the local
    filesystem; for every scheme the last path segment is used to guess a file
    name, extension and category.

    Args:
        url: URL to analyze

    Returns:
        PathAnalysisResult object
    """
    # Local import: the module's top-level import only brings in urlparse.
    from urllib.parse import unquote

    # Basic result with common properties
    result = PathAnalysisResult(original_path=url, is_remote=True)

    # Extract URL components
    components = extract_url_components(url)
    result.url_components = components

    # Extract domain information
    if components.hostname:
        result.domain_info = extract_domain_info(components)

    # Set URL-specific properties
    scheme = components.scheme.lower()

    # Path used below for the name/extension heuristics.
    name_source = components.path

    if scheme == "http":
        result.path_type = PathType.URL_HTTP
        result.is_secure = False
    elif scheme == "https":
        result.path_type = PathType.URL_HTTPS
        result.is_secure = True
    elif scheme in ["ftp", "sftp"]:
        result.path_type = PathType.URL_FTP
        result.is_secure = scheme == "sftp"
    elif scheme == "file":
        result.path_type = PathType.URL_FILE
        result.is_local = True
        # NOTE(review): is_remote stays True for file:// URLs (set in the
        # constructor above) — confirm whether callers rely on that.

        # URL paths are percent-encoded ("my%20file.txt"); the filesystem
        # wants the decoded form, otherwise such files are never found.
        local_path = unquote(components.path)
        name_source = local_path
        local_result = analyze_local_path(local_path)

        # Copy relevant properties
        result.is_file = local_result.is_file
        result.is_directory = local_result.is_directory
        result.exists = local_result.exists
        result.file_name = local_result.file_name
        result.file_extension = local_result.file_extension
        result.file_category = local_result.file_category
        result.file_size = local_result.file_size
        result.mime_type = local_result.mime_type
        result.encoding = local_result.encoding

    # Extract file information from path
    if name_source and name_source != "/":
        file_path = name_source.rstrip("/")
        file_name = os.path.basename(file_path)
        if file_name:
            result.file_name = file_name

            # Get file extension
            _, ext = os.path.splitext(file_name)
            if ext:
                result.file_extension = ext.lower()

                # Get file category
                result.file_category = FILE_CATEGORY_MAP.get(
                    ext.lower(), FileCategory.UNKNOWN_FILE
                )

                # Assume it's a file if it has an extension, unless the
                # filesystem already told us it is a directory (a file://
                # URL pointing at e.g. "project.d/").
                if not result.is_directory:
                    result.is_file = True

    return result




[docs]
def analyze_database_uri(uri: str) -> PathAnalysisResult:
    """Analyze a database connection URI.

    Args:
        uri: Database URI to analyze

    Returns:
        PathAnalysisResult marked as a remote ``DATABASE_URI`` with the parsed
        URL components, the detected database type (``UNKNOWN_DB`` when no
        scheme pattern matches) and, if the URI path names one, the database
        name
    """
    outcome = PathAnalysisResult(
        original_path=uri, path_type=PathType.DATABASE_URI, is_remote=True
    )

    parsed = extract_url_components(uri)
    outcome.url_components = parsed

    # First pattern that matches decides the type; fall back to "unknown".
    outcome.database_type = next(
        (
            kind
            for regex, kind in DATABASE_URI_PATTERNS.items()
            if re.match(regex, uri)
        ),
        DatabaseType.UNKNOWN_DB,
    )

    # "", "/" and "//" all strip down to "", i.e. no database name.
    name = parsed.path.lstrip("/")
    if name:
        outcome.database_name = name

    return outcome




[docs]
def analyze_cloud_path(path: str) -> PathAnalysisResult:
    """Analyze a cloud storage path.

    The provider is taken from the scheme via ``CLOUD_STORAGE_PATTERNS``. The
    remainder is split at the first ``/`` into bucket name and object key. A
    key that is empty (``s3://bucket/``) or ends with ``/`` denotes a
    directory-like prefix; any other key is treated as a file whose name,
    extension and category are derived from the key.

    Args:
        path: Cloud storage path to analyze

    Returns:
        PathAnalysisResult object; bucket and key stay unset when ``path`` has
        no ``://`` separator
    """
    # Basic result with common properties
    result = PathAnalysisResult(
        original_path=path, path_type=PathType.CLOUD_STORAGE, is_remote=True
    )

    # Determine cloud provider
    provider = CloudProvider.UNKNOWN_CLOUD
    for pattern, prov in CLOUD_STORAGE_PATTERNS.items():
        if re.match(pattern, path):
            provider = prov
            break

    result.cloud_provider = provider

    # Parse path to extract bucket and key
    _, separator, rest = path.partition("://")
    if not separator:
        # Unable to parse, leave properties as defaults
        return result

    bucket, slash, key = rest.partition("/")
    result.bucket_name = bucket
    if not slash:
        # Bare bucket reference: neither file nor directory is asserted.
        return result

    result.object_key = key

    # An empty key is the bucket root, not a file with an empty name.
    if not key or key.endswith("/"):
        result.is_directory = True
        return result

    result.is_file = True

    # Extract file name and extension
    result.file_name = os.path.basename(key)
    _, ext = os.path.splitext(result.file_name)
    if ext:
        result.file_extension = ext.lower()

        # Get file category
        result.file_category = FILE_CATEGORY_MAP.get(
            ext.lower(), FileCategory.UNKNOWN_FILE
        )

    return result




[docs]
def analyze_network_path(path: str) -> PathAnalysisResult:
    """Analyze a network share path.

    UNC paths (``\\\\server\\share\\...``) are Windows-style regardless of the
    platform this code runs on, so they are taken apart with ``ntpath``
    rather than the host's ``os.path``. On POSIX, ``os.path`` does not treat
    backslashes as separators, which made ``file_name`` the whole input and
    could turn a dotted server name into a bogus extension.

    Args:
        path: Network path to analyze

    Returns:
        PathAnalysisResult object
    """
    # Local import: Windows path semantics on every platform.
    import ntpath

    # Basic result with common properties
    result = PathAnalysisResult(
        original_path=path, path_type=PathType.NETWORK_SHARE, is_remote=True
    )

    # Parse UNC path (\\server\share\path)
    if path.startswith("\\\\"):
        parts = path.lstrip("\\").split("\\")

        if len(parts) >= 2:
            server, share = parts[0], parts[1]

            # Store in URL components; joining an empty remainder yields "",
            # so a bare share becomes "/share/".
            result.url_components = URLComponents(
                scheme="file",
                netloc=server,
                hostname=server,
                path=f"/{share}/{'/'.join(parts[2:])}",
            )

    # Determine if it's a file or directory
    if path.endswith(("\\", "/")):
        result.is_directory = True
    else:
        # Check file extension
        _, ext = ntpath.splitext(path)
        if ext:
            result.is_file = True
            result.file_extension = ext.lower()
            result.file_name = ntpath.basename(path)

            # Get file category
            result.file_category = FILE_CATEGORY_MAP.get(
                ext.lower(), FileCategory.UNKNOWN_FILE
            )

    return result




[docs]
def analyze_special_path(path: str) -> PathAnalysisResult:
    """Analyze a special path such as a git SSH URL.

    Only the ``git@host:owner/repo.git`` form is understood. Anything else is
    returned as a bare remote ``SPECIAL_PATH`` result without further detail.

    Args:
        path: Special path to analyze

    Returns:
        PathAnalysisResult object
    """
    outcome = PathAnalysisResult(
        original_path=path, path_type=PathType.SPECIAL_PATH, is_remote=True
    )

    git_ssh = re.match(
        r"^git@[a-zA-Z0-9.-]+:[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\.git$", path
    )
    if git_ssh is None:
        return outcome

    # "git@github.com:user/repo.git" -> login "git", server "github.com",
    # repository "user/repo.git".
    account_at_server, _, repository = path.partition(":")
    login, _, server = account_at_server.partition("@")

    outcome.url_components = URLComponents(
        scheme="ssh",
        netloc=server,
        hostname=server,
        username=login,
        path=f"/{repository}",
    )
    outcome.domain_info = extract_domain_info(outcome.url_components)

    # A repository is a tree of files, so it is reported as a directory.
    outcome.is_file = False
    outcome.is_directory = True

    return outcome




[docs]
def analyze_path_comprehensive(path: str | Path) -> PathAnalysisResult:
    """Analyze a path comprehensively.

    This function analyzes a path to determine its type, properties, and metadata.
    It handles various path types including local files, URLs, database URIs, and
    cloud storage paths.

    The input is dispatched to the first analyzer whose pattern matches, in
    this order: web/file URL, database URI, cloud storage path, UNC network
    share, git SSH URL, and finally local filesystem path.

    Args:
        path: Path to analyze (string or Path object)

    Returns:
        PathAnalysisResult object with comprehensive information about the path
    """
    # Convert Path to string
    if isinstance(path, Path):
        path = str(path)

    # Surrounding whitespace is never part of the path
    path = path.strip()

    # Handle empty path
    if not path:
        return PathAnalysisResult(original_path="", path_type=PathType.UNKNOWN)

    # Check for URL. URL schemes are case-insensitive, and analyze_url()
    # lower-cases the scheme itself, so "HTTPS://..." must be routed there
    # instead of falling through to the local-path analyzer.
    if re.match(r"^(https?|ftp|sftp|file)://", path, re.IGNORECASE):
        return analyze_url(path)

    # Check for database URI
    if any(re.match(pattern, path) for pattern in DATABASE_URI_PATTERNS):
        return analyze_database_uri(path)

    # Check for cloud storage path
    if any(re.match(pattern, path) for pattern in CLOUD_STORAGE_PATTERNS):
        return analyze_cloud_path(path)

    # Check for network share (UNC path)
    if path.startswith("\\\\"):
        return analyze_network_path(path)

    # Check for special paths (like git SSH URLs)
    if re.match(r"^git@[a-zA-Z0-9.-]+:", path):
        return analyze_special_path(path)

    # Default to local path
    return analyze_local_path(path)



# Public API of this module, kept in alphabetical order (classes first).
# NOTE(review): extract_database_info and extract_cloud_storage_info are
# defined in this module but not listed here — confirm that is intentional.
__all__ = [
    "CloudProvider",
    "DatabaseType",
    "DomainInfo",
    "FileCategory",
    "PathAnalysisResult",
    "PathType",
    "URLComponents",
    "analyze_cloud_path",
    "analyze_database_uri",
    "analyze_local_path",
    "analyze_network_path",
    "analyze_path_comprehensive",
    "analyze_special_path",
    "analyze_url",
    "detect_encoding",
    "detect_mime_type",
    "extract_domain_info",
    "extract_url_components",
    "is_binary_file",
]