"""
UMAP Model Service for 2D Visualization (Issue #458)

.. deprecated:: ADR-004
    This service has been deprecated since ADR-004 (Dec 2024).
    Dynamic clustering and UMAP visualization have been replaced
    by manually curated groups using `IntercambiableGroup` with
    `is_curated=True`. This module will be kept until the migration
    is complete, but it must not be used for new development.

Manages UMAP model training, versioning, and coordinate retrieval for the
clustering dashboard visualization.

Features:
- Train UMAP 2D projections with warm start support
- Version tracking for model stability
- Coordinate caching in ProductCatalogVentaLibre
- Z-score calculation for outlier detection
"""

import os
import pickle
import warnings
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import structlog
from sqlalchemy import func
from sqlalchemy.orm import Session

from app.models.product_catalog_venta_libre import ProductCatalogVentaLibre

logger = structlog.get_logger(__name__)

# =============================================================================
# OPTIONAL ML DEPENDENCIES
# =============================================================================

# Each optional package is probed the same way: attempt the import and record
# the outcome in a module-level flag that the service checks before use.

try:
    import umap
except ImportError:
    HAS_UMAP = False
else:
    HAS_UMAP = True

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    HAS_SENTENCE_TRANSFORMERS = False
else:
    HAS_SENTENCE_TRANSFORMERS = True

try:
    from sklearn.metrics.pairwise import cosine_distances
except ImportError:
    HAS_SKLEARN = False
else:
    HAS_SKLEARN = True


# =============================================================================
# CONFIGURATION
# =============================================================================

class UMAPModelConfig:
    """Static settings consumed by the UMAP model service."""

    # --- Where trained models live on disk --------------------------------
    # Relative path; model files are named "<MODEL_PREFIX>_<version>.pkl".
    MODEL_DIR: Path = Path("models/umap")
    MODEL_PREFIX: str = "umap_2d"

    # --- Retrain policy (values taken from an Issue #458 comment) ---------
    # Fraction of locked clusters at which a retrain is expected (80%).
    RETRAIN_THRESHOLD_LOCKED_RATIO: float = 0.80
    NOTIFY_BEFORE_RETRAIN: bool = True
    NOTIFICATION_LEAD_TIME_HOURS: int = 24

    # --- Layout stability --------------------------------------------------
    # When enabled, training may be seeded with the previous coordinates.
    USE_WARM_START: bool = True

    # --- UMAP hyper-parameters for the 2D projection ------------------------
    N_COMPONENTS: int = 2
    N_NEIGHBORS: int = 15
    MIN_DIST: float = 0.1
    METRIC: str = "cosine"
    RANDOM_STATE: int = 42

    # --- Sentence embedding model name -------------------------------------
    EMBEDDING_MODEL: str = "paraphrase-multilingual-MiniLM-L12-v2"


# =============================================================================
# SERVICE CLASS
# =============================================================================

class UMAPModelService:
    """
    Service for UMAP 2D visualization model management.

    .. deprecated:: ADR-004
        This service is deprecated. Use IntercambiableGroup with is_curated=True.

    Responsibilities:
    - Train UMAP models with version tracking
    - Load/save models to filesystem
    - Generate 2D coordinates for products
    - Calculate centroid distances (Z-scores)
    - Warm start support for stable layouts
    """

    def __init__(self, config: UMAPModelConfig = None):
        warnings.warn(
            "UMAPModelService está deprecado (ADR-004). "
            "El clustering dinámico y visualización UMAP han sido reemplazados "
            "por grupos curados. Usar IntercambiableGroup con is_curated=True.",
            DeprecationWarning,
            stacklevel=2,
        )

        self.config = config or UMAPModelConfig()
        self._embedding_model = None
        self._umap_model = None
        self._current_version: Optional[str] = None

        # Ensure model directory exists
        self.config.MODEL_DIR.mkdir(parents=True, exist_ok=True)

        self._capabilities = {
            "umap": HAS_UMAP,
            "sentence_transformers": HAS_SENTENCE_TRANSFORMERS,
            "sklearn": HAS_SKLEARN,
        }

        logger.warning(
            "umap_model_service.init.deprecated",
            message="ADR-004: UMAPModelService deprecado, usar IntercambiableGroup",
            capabilities=self._capabilities,
            model_dir=str(self.config.MODEL_DIR),
        )

    # -------------------------------------------------------------------------
    # Version Management
    # -------------------------------------------------------------------------

    def get_current_version(self) -> Optional[str]:
        """Get the current model version from filesystem."""
        if self._current_version:
            return self._current_version

        # Find latest model file
        model_files = list(self.config.MODEL_DIR.glob(f"{self.config.MODEL_PREFIX}_v*.pkl"))
        if not model_files:
            return None

        # Extract version numbers and find latest
        versions = []
        for f in model_files:
            try:
                # Format: umap_2d_v1.0.pkl -> v1.0
                version_str = f.stem.replace(f"{self.config.MODEL_PREFIX}_", "")
                versions.append((version_str, f))
            except (ValueError, IndexError):
                continue

        if not versions:
            return None

        # Sort by version (simple string sort works for vX.Y format)
        versions.sort(key=lambda x: x[0], reverse=True)
        self._current_version = versions[0][0]

        return self._current_version

    def generate_next_version(self) -> str:
        """Generate next version number."""
        current = self.get_current_version()
        if not current:
            return "v1.0"

        try:
            # Parse vX.Y format
            major, minor = current[1:].split(".")
            return f"v{major}.{int(minor) + 1}"
        except (ValueError, IndexError):
            return "v1.0"

    def get_model_path(self, version: str) -> Path:
        """Get filesystem path for a model version."""
        return self.config.MODEL_DIR / f"{self.config.MODEL_PREFIX}_{version}.pkl"

    # -------------------------------------------------------------------------
    # Embeddings
    # -------------------------------------------------------------------------

    def get_embedding_model(self):
        """Lazy load embedding model."""
        if not HAS_SENTENCE_TRANSFORMERS:
            raise ImportError("sentence-transformers not installed")

        if self._embedding_model is None:
            logger.info(
                "umap_model_service.loading_embedding_model",
                model=self.config.EMBEDDING_MODEL,
            )
            self._embedding_model = SentenceTransformer(self.config.EMBEDDING_MODEL)

        return self._embedding_model

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for product texts."""
        model = self.get_embedding_model()
        embeddings = model.encode(
            texts,
            show_progress_bar=True,
            batch_size=64,
        )
        return embeddings

    def prepare_product_text(self, product: ProductCatalogVentaLibre) -> str:
        """Prepare product text for embedding."""
        parts = [product.product_name_display]

        if product.detected_brand:
            parts.append(f"MARCA: {product.detected_brand}")

        if product.ml_category:
            parts.append(f"CATEGORIA: {product.ml_category}")

        return " | ".join(parts)

    # -------------------------------------------------------------------------
    # Training
    # -------------------------------------------------------------------------

    def train_model(
        self,
        embeddings: np.ndarray,
        previous_coords: np.ndarray = None,
        previous_embeddings: np.ndarray = None,
    ) -> Tuple[np.ndarray, str]:
        """
        Train UMAP 2D model with optional warm start.

        Args:
            embeddings: Product embeddings (N x 384)
            previous_coords: Previous UMAP coordinates for warm start
            previous_embeddings: Previous embeddings (for new product placement)

        Returns:
            (coordinates, version): 2D coordinates and model version
        """
        if not HAS_UMAP:
            raise ImportError("umap-learn not installed")

        version = self.generate_next_version()

        # Prepare init for warm start
        init = "spectral"
        if (
            self.config.USE_WARM_START
            and previous_coords is not None
            and previous_embeddings is not None
        ):
            init = self._prepare_warm_start_init(
                embeddings, previous_coords, previous_embeddings
            )
            logger.info("umap_model_service.using_warm_start")

        # Train UMAP
        logger.info(
            "umap_model_service.training",
            n_products=len(embeddings),
            version=version,
            warm_start=isinstance(init, np.ndarray),
        )

        reducer = umap.UMAP(
            n_components=self.config.N_COMPONENTS,
            n_neighbors=self.config.N_NEIGHBORS,
            min_dist=self.config.MIN_DIST,
            metric=self.config.METRIC,
            init=init,
            random_state=self.config.RANDOM_STATE,
        )

        coordinates = reducer.fit_transform(embeddings)

        # Save model
        model_path = self.get_model_path(version)
        with open(model_path, "wb") as f:
            pickle.dump(
                {
                    "reducer": reducer,
                    "version": version,
                    "trained_at": datetime.now(timezone.utc).isoformat(),
                    "n_products": len(embeddings),
                    "config": {
                        "n_components": self.config.N_COMPONENTS,
                        "n_neighbors": self.config.N_NEIGHBORS,
                        "min_dist": self.config.MIN_DIST,
                        "metric": self.config.METRIC,
                    },
                },
                f,
            )

        self._umap_model = reducer
        self._current_version = version

        logger.info(
            "umap_model_service.training_complete",
            version=version,
            model_path=str(model_path),
        )

        return coordinates, version

    def _prepare_warm_start_init(
        self,
        new_embeddings: np.ndarray,
        previous_coords: np.ndarray,
        previous_embeddings: np.ndarray,
    ) -> np.ndarray:
        """
        Prepare init coordinates for warm start with new products.

        New products are placed at the position of their nearest neighbor
        to minimize layout disruption.
        """
        n_new = len(new_embeddings)
        n_prev = len(previous_coords)

        if n_new == n_prev:
            return previous_coords

        if not HAS_SKLEARN:
            logger.warning("sklearn not available, using spectral init")
            return "spectral"

        # Initialize with previous coords where available
        init_coords = np.zeros((n_new, 2))

        # For products with previous coords, use them directly
        # For new products, use nearest neighbor position
        for i in range(n_new):
            if i < n_prev:
                init_coords[i] = previous_coords[i]
            else:
                # New product: find nearest neighbor in previous embeddings
                distances = cosine_distances([new_embeddings[i]], previous_embeddings)[0]
                nearest_idx = np.argmin(distances)
                init_coords[i] = previous_coords[nearest_idx]

        return init_coords

    # -------------------------------------------------------------------------
    # Coordinate Retrieval
    # -------------------------------------------------------------------------

    def load_model(self, version: str = None) -> bool:
        """Load a trained UMAP model."""
        if version is None:
            version = self.get_current_version()

        if version is None:
            logger.warning("umap_model_service.no_model_found")
            return False

        model_path = self.get_model_path(version)
        if not model_path.exists():
            logger.warning(
                "umap_model_service.model_not_found",
                path=str(model_path),
            )
            return False

        with open(model_path, "rb") as f:
            data = pickle.load(f)

        self._umap_model = data["reducer"]
        self._current_version = version

        logger.info(
            "umap_model_service.model_loaded",
            version=version,
            trained_at=data.get("trained_at"),
            n_products=data.get("n_products"),
        )

        return True

    def get_products_with_coordinates(
        self,
        db: Session,
        limit: int = 5000,
        necesidad: str = None,
        verified_only: bool = False,
    ) -> List[Dict]:
        """
        Get products with their UMAP coordinates from database.

        OPTIMIZED: Uses column projection instead of full ORM objects
        to reduce memory usage and serialization time (~10x faster).

        Args:
            db: Database session
            limit: Maximum products to return
            necesidad: Filter by NECESIDAD category
            verified_only: Only return human-verified products

        Returns:
            List of product dicts with coordinates
        """
        # Use column projection for performance - only fetch needed columns
        # This avoids loading full ORM objects which is much faster
        P = ProductCatalogVentaLibre
        query = db.query(
            P.id,
            P.product_name_display,
            P.umap_x,
            P.umap_y,
            P.umap_version,
            P.ml_category,
            P.detected_brand,
            P.ml_confidence,
            P.human_verified,
            P.total_sales_count,
        ).filter(
            P.umap_x.isnot(None),
            P.umap_y.isnot(None),
            P.is_active == True,
        )

        if necesidad:
            query = query.filter(P.ml_category == necesidad)

        if verified_only:
            query = query.filter(P.human_verified == True)

        query = query.order_by(P.total_sales_count.desc()).limit(limit)

        # Execute query and convert to dicts efficiently
        products = [
            {
                "id": str(row.id),
                "product_name": row.product_name_display,
                "umap_x": row.umap_x,
                "umap_y": row.umap_y,
                "umap_version": row.umap_version,
                "necesidad": row.ml_category,
                "detected_brand": row.detected_brand,
                "ml_confidence": row.ml_confidence,
                "human_verified": row.human_verified or False,
                "total_sales_count": row.total_sales_count,
            }
            for row in query.all()
        ]

        logger.info(
            "umap_model_service.get_products",
            count=len(products),
            filter_necesidad=necesidad,
            verified_only=verified_only,
        )

        return products

    def update_coordinates(
        self,
        db: Session,
        product_coords: Dict[str, Tuple[float, float]],
        version: str,
    ) -> int:
        """
        Update UMAP coordinates in database.

        Args:
            db: Database session
            product_coords: Dict of {product_id: (umap_x, umap_y)}
            version: Model version

        Returns:
            Number of products updated
        """
        updated = 0

        for product_id, (x, y) in product_coords.items():
            result = db.query(ProductCatalogVentaLibre).filter(
                ProductCatalogVentaLibre.id == product_id
            ).update({
                "umap_x": x,
                "umap_y": y,
                "umap_version": version,
            })
            updated += result

        db.commit()

        logger.info(
            "umap_model_service.coordinates_updated",
            count=updated,
            version=version,
        )

        return updated

    # -------------------------------------------------------------------------
    # Z-Score Calculation (Outlier Detection)
    # -------------------------------------------------------------------------

    def calculate_centroid_distances(
        self,
        products: List[Dict],
        cluster_centroids: Dict[int, Tuple[float, float]] = None,
    ) -> List[Dict]:
        """
        Calculate Z-score of distance to cluster centroid for each product.

        This helps detect "wolves in sheep's clothing" - products with high
        textual similarity but physically distant in embedding space.

        Args:
            products: List of products with umap_x, umap_y
            cluster_centroids: Optional pre-computed centroids

        Returns:
            Products with added centroid_distance_zscore field
        """
        if not products:
            return products

        # If no cluster info, calculate global centroid
        coords = np.array([[p["umap_x"], p["umap_y"]] for p in products])
        global_centroid = np.mean(coords, axis=0)

        # Calculate distances
        distances = np.linalg.norm(coords - global_centroid, axis=1)

        # Calculate Z-scores
        mean_dist = np.mean(distances)
        std_dist = np.std(distances)

        if std_dist == 0:
            z_scores = np.zeros(len(distances))
        else:
            z_scores = (distances - mean_dist) / std_dist

        # Add to products
        for i, p in enumerate(products):
            p["centroid_distance_zscore"] = round(float(z_scores[i]), 3)

        return products

    # -------------------------------------------------------------------------
    # Retrain Check
    # -------------------------------------------------------------------------

    def needs_retrain(self, db: Session) -> Tuple[bool, Dict]:
        """
        Check if model needs retraining based on locked cluster ratio.

        Returns:
            (needs_retrain, stats)
        """
        # Count products with/without coordinates
        total = db.query(func.count(ProductCatalogVentaLibre.id)).filter(
            ProductCatalogVentaLibre.is_active == True
        ).scalar() or 0

        with_coords = db.query(func.count(ProductCatalogVentaLibre.id)).filter(
            ProductCatalogVentaLibre.is_active == True,
            ProductCatalogVentaLibre.umap_x.isnot(None),
        ).scalar() or 0

        verified = db.query(func.count(ProductCatalogVentaLibre.id)).filter(
            ProductCatalogVentaLibre.is_active == True,
            ProductCatalogVentaLibre.human_verified == True,
        ).scalar() or 0

        stats = {
            "total_products": total,
            "products_with_coords": with_coords,
            "verified_products": verified,
            "coverage_percent": round(100 * with_coords / total, 1) if total else 0,
            "verified_ratio": round(verified / total, 3) if total else 0,
        }

        # Check if we need retrain
        # - No model exists
        # - Many products without coordinates
        needs = False
        reason = None

        if with_coords == 0:
            needs = True
            reason = "No products have UMAP coordinates"
        elif with_coords < total * 0.5:
            needs = True
            reason = f"Only {stats['coverage_percent']}% coverage"

        stats["needs_retrain"] = needs
        stats["reason"] = reason

        return needs, stats

    # -------------------------------------------------------------------------
    # Metadata
    # -------------------------------------------------------------------------

    def get_metadata(self, db: Session) -> Dict:
        """
        Get UMAP model and data metadata.

        OPTIMIZED: Uses single query with conditional counts.
        """
        version = self.get_current_version()

        # Single query with conditional counts (much faster than 2 separate queries)
        P = ProductCatalogVentaLibre
        stats = db.query(
            func.count(P.id).label("total"),
            func.count(P.umap_x).label("with_coords"),  # COUNT ignores NULLs
        ).filter(P.is_active == True).first()

        total = stats.total if stats else 0
        with_coords = stats.with_coords if stats else 0

        # Get model info (file I/O is fast, ~1ms)
        last_trained = None
        if version:
            model_path = self.get_model_path(version)
            if model_path.exists():
                try:
                    with open(model_path, "rb") as f:
                        data = pickle.load(f)
                    last_trained = data.get("trained_at")
                except Exception:
                    pass

        return {
            "version": version or "none",
            "total_products": total,
            "products_with_coords": with_coords,
            "last_trained": last_trained,
            "total_clusters": 0,  # TODO: integrate with cluster service
            "locked_clusters": 0,
        }


# =============================================================================
# SINGLETON
# =============================================================================

# Shared module-level instance. Constructing it here runs
# UMAPModelService.__init__ when this module is imported, which emits the
# deprecation warning, creates the model directory and writes a log entry.
umap_model_service = UMAPModelService()
