"""
Servicio de Clustering para Productos de Venta Libre
Issue #446 - Fase 3: Grupos Intercambiables

.. deprecated:: ADR-004
    Este servicio está deprecado desde ADR-004 (Dic 2024).
    El clustering dinámico ha sido reemplazado por grupos curados manualmente
    usando `IntercambiableGroup` con `is_curated=True`.

    Para clasificación de productos, usar:
    - `interchangeable_group_service.py` para grupos curados
    - `ventalibre_service.py` para análisis por NECESIDAD

    Este código se mantendrá temporalmente para rollback pero será eliminado
    en una versión futura.

Pipeline (DEPRECADO):
1. Obtener productos de manual_review con NECESIDAD asignada
2. Generar embeddings (sentence-transformers o TF-IDF fallback)
3. Reducir dimensiones (UMAP o PCA fallback)
4. Clustering (HDBSCAN o KMeans fallback)
5. Identificar grupos intercambiables (multi-marca, misma necesidad)
6. Persistir en IntercambiableGroup y actualizar sales_enrichment
"""

import re
import uuid
import warnings
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Set, Tuple

import numpy as np
import structlog
from sqlalchemy import func, text
from sqlalchemy.orm import Session

from app.models.intercambiable_group import IntercambiableGroup
from app.models.sales_data import SalesData
from app.models.sales_enrichment import SalesEnrichment

logger = structlog.get_logger(__name__)

# =============================================================================
# OPTIONAL ML DEPENDENCIES
# =============================================================================

# Each optional backend is probed independently. The matching HAS_* flag
# records whether its import succeeded; the service reads these flags to
# choose between the preferred backend and its scikit-learn fallback.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    HAS_SENTENCE_TRANSFORMERS = False
else:
    HAS_SENTENCE_TRANSFORMERS = True

try:
    import umap
except ImportError:
    HAS_UMAP = False
else:
    HAS_UMAP = True

try:
    import hdbscan
except ImportError:
    HAS_HDBSCAN = False
else:
    HAS_HDBSCAN = True


# =============================================================================
# CONFIGURATION
# =============================================================================

# Multilingual sentence-embedding model name passed to SentenceTransformer
EMBEDDING_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2'  # Lighter-weight option

# UMAP parameters
# UMAP_COMPONENTS is the default target dimensionality of reduce_dimensions /
# run_clustering; UMAP_NEIGHBORS is passed as UMAP's n_neighbors.
UMAP_COMPONENTS = 10
UMAP_NEIGHBORS = 15

# HDBSCAN parameters
# MIN_CLUSTER_SIZE is also stored as `min_products` on persisted groups.
MIN_CLUSTER_SIZE = 3
MIN_SAMPLES = 2

# Interchangeable-group parameters
MIN_BRANDS_FOR_GROUP = 2
NECESIDAD_THRESHOLD = 0.6  # 60% of the products must share the same necesidad


# =============================================================================
# BRAND DETECTION (lista completa)
# =============================================================================

# Lower-case brand fragments that ClusteringService.detect_brand searches for
# as plain substrings of the lower-cased product name (longest entry first).
# Entries are stripped before being returned, so 'gh ' yields 'gh'; the
# trailing space is presumably there to avoid matching "gh" inside longer
# words -- TODO confirm with the original author.
TODAS_LAS_MARCAS = [
    # Sun care / dermo-cosmetics
    'isdin', 'heliocare', 'la roche posay', 'laroche', 'avene', 'bioderma',
    'eucerin', 'vichy', 'sesderma', 'martiderm', 'endocare', 'sensilis',
    'svr', 'uriage', 'photoderm', 'anthelios', 'rilastil', 'ladival',
    # Anti-ageing / facial
    'medik8', 'filorga', 'caudalie', 'esthederm', 'perricone', 'neostrata',
    'skinceuticals', 'isdinceutics', 'bella aurora', 'nuxe', 'lierac',
    'germinal', 'skeyndor', 'casmara', 'natura bisse', 'thalgo',
    # Local / specialised brands
    'gh ', 'arturo alba', 'dr arthouros', 'nutracel', 'montibello',
    # Dental
    'gum', 'oral-b', 'oral b', 'sensodyne', 'lacer', 'vitis', 'phb', 'kin',
    'fluocaril', 'elmex', 'meridol', 'bexident', 'interprox', 'tepe',
    'parodontax', 'listerine', 'colgate', 'curaprox', 'kukident', 'corega',
    # Hair care
    'ducray', 'klorane', 'rene furterer', 'pilexil', 'lambdapil', 'iraltone',
    'olistic', 'alpecin', 'sebamed', 'vichy dercos',
    # Baby
    'suavinex', 'mustela', 'chicco', 'nuk', 'avent', 'johnson', 'dodot',
    'blemil', 'nutriben', 'hero baby', 'almiron', 'pedialac', 'nestle',
    'mitosyl', 'nutraisdin',
    # Eye care
    'thealoz', 'hylo', 'optiben', 'systane', 'artelac', 'aquoral', 'blink',
    # Supplements / vitamins
    'solgar', 'aquilea', 'arkopharma', 'nhco', 'pileje', 'meritene', 'ensure',
    'pharmaton', 'supradyn', 'multicentrum', 'ns nutritional', 'ivb',
    'nature bounty', 'solaray', 'lamberts',
    # Digestive
    'dulcosoft', 'megalevure', 'bivos', 'prodefen', 'lactibiane',
    # Pain / muscular
    'fisiocrem', 'arnidol', 'thermacare', 'voltadol', 'radio salil',
    # Respiratory
    'rhinomer', 'sterimar', 'physiomer', 'respibien', 'utabon', 'rhinospray',
    # Intimate hygiene
    'cumlaude', 'lactacyd', 'gynea', 'isdin intim',
    # Other
    'neutrogena', 'cerave', 'nivea', 'weleda', 'apivita', 'be+', 'farline',
    'acofar', 'compeed', 'urgo', 'tiritas', 'hansaplast',
]

# Variant normalisation: maps a matched brand fragment (already stripped) to
# its canonical brand; fragments without an entry are returned unchanged.
# NOTE(review): detect_brand only looks up fragments taken from
# TODAS_LAS_MARCAS, and 'sensibio', 'atoderm', 'sebium' and 'fotoprotector'
# are not in that list, so those four entries are currently never used.
MARCA_NORMALIZACION = {
    'oral b': 'oral-b',
    'laroche': 'la roche posay',
    'photoderm': 'bioderma',
    'anthelios': 'la roche posay',
    'sensibio': 'bioderma',
    'atoderm': 'bioderma',
    'sebium': 'bioderma',
    'isdinceutics': 'isdin',
    'fotoprotector': 'isdin',
    'nutraisdin': 'isdin',
    'vichy dercos': 'vichy',
    'ns nutritional': 'ns',
}


# Mutually exclusive category groups: each key is a broad product area and
# its list holds the necesidad identifiers that belong to that area.
CATEGORIA_GRUPOS = {
    'dental': [
        'encias', 'higiene_dental', 'clorhexidina', 'blanqueamiento_dental',
        'sensibilidad_dental', 'protesis_dental', 'ortodoncia', 'aftas',
        'halitosis', 'boca_seca', 'fluor', 'higiene_interdental',
    ],
    'capilar': [
        'caida_cabello', 'cabello_graso', 'cabello_seco', 'caspa',
        'cuero_cabelludo', 'cabello_tenido', 'volumen_capilar',
    ],
    'solar': [
        'proteccion_solar', 'proteccion_solar_infantil', 'fotoproteccion_oral',
    ],
    'facial': [
        'antiedad', 'antimanchas', 'acne', 'piel_grasa', 'piel_sensible',
        'hidratacion_facial', 'limpieza_facial', 'rosacea', 'poros',
        'contorno_ojos', 'renovacion_celular', 'exfoliacion',
    ],
    'corporal': [
        'hidratacion_corporal', 'piel_atopica', 'cicatrizacion',
        'higiene_corporal', 'cuidado_pies',
    ],
    'bebe': [
        'alimentacion_bebe', 'higiene_bebe', 'dermatitis_panal',
    ],
    'suplementos': [
        'vitaminas_general', 'magnesio', 'probioticos', 'energia',
        'articulaciones', 'defensas', 'digestion',
    ],
}

# Reverse lookup: necesidad -> category group.
# Built with a comprehension so that no loop variables are left behind as
# module-level names (a plain `for` loop here would export them).
NECESIDAD_TO_GRUPO = {
    necesidad: grupo
    for grupo, necesidades in CATEGORIA_GRUPOS.items()
    for necesidad in necesidades
}


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class ProductForClustering:
    """Single product row fed into the clustering pipeline."""
    sales_enrichment_id: str  # SalesEnrichment primary key, stringified
    product_name: str  # taken from SalesData.product_name
    necesidad: Optional[str]  # taken from SalesEnrichment.ml_category
    subcategory: Optional[str]  # taken from SalesEnrichment.ml_subcategory
    detected_brand: Optional[str]  # taken from SalesEnrichment.detected_brand
    total_amount: float  # SUM(SalesData.total_amount) for the grouped row
    sale_count: int  # COUNT(SalesData.id) for the grouped row

@dataclass
class ClusterInfo:
    """Summary information about one cluster."""
    cluster_id: int  # cluster label
    products: List[ProductForClustering]  # members of the cluster
    brands: Set[str] = field(default_factory=set)  # fresh set per instance
    necesidad_principal: Optional[str] = None  # presumably the majority necesidad -- confirm
    necesidad_ratio: float = 0.0  # presumably the share of members with that necesidad -- confirm
    total_amount: float = 0.0


@dataclass
class InterchangeableGroupInfo:
    """Interchangeable group identified by the clustering step."""
    necesidad: str  # majority necesidad of the originating cluster
    subcategory: Optional[str]  # most common subcategory, None if none is set
    brands: List[str]  # sorted brand names found in the group
    products: List[ProductForClustering]  # members of the group
    total_amount: float  # sum of the members' total_amount
    brand_count: int  # number of distinct brands
    product_count: int  # number of member products


@dataclass
class ClusteringResult:
    """Outcome of one clustering pipeline run."""
    total_products: int  # number of products loaded for the run
    n_clusters: int  # distinct cluster labels, excluding the -1 noise label
    n_outliers: int  # products labelled -1
    n_interchangeable_groups: int  # len(groups)
    products_in_groups: int  # total product_count over all groups
    coverage_percent: float  # products_in_groups / total_products * 100
    groups: List[InterchangeableGroupInfo]
    errors: List[str] = field(default_factory=list)  # human-readable error messages


# =============================================================================
# SERVICE CLASS
# =============================================================================

class ClusteringService:
    """
    Servicio para clustering de productos de venta libre.

    .. deprecated:: ADR-004
        Este servicio está deprecado. Usar grupos curados en su lugar.
        Ver `interchangeable_group_service.py` y `ventalibre_service.py`.
    """

    def __init__(self):
        """Create the deprecated service and announce the deprecation.

        Emits a ``DeprecationWarning`` (ADR-004), initialises the lazily
        loaded embedding model slot, records which optional ML backends
        were importable, and logs a structured deprecation warning.
        """
        deprecation_notice = (
            "ClusteringService está deprecado (ADR-004). "
            "El clustering dinámico ha sido reemplazado por grupos curados. "
            "Usar IntercambiableGroup con is_curated=True."
        )
        # stacklevel=2 attributes the warning to the code creating the instance.
        warnings.warn(deprecation_notice, DeprecationWarning, stacklevel=2)

        # The embedding model is only created on the first generate_embeddings call.
        self._model = None
        self._capabilities = dict(
            sentence_transformers=HAS_SENTENCE_TRANSFORMERS,
            umap=HAS_UMAP,
            hdbscan=HAS_HDBSCAN,
        )

        logger.warning(
            "clustering_service.deprecated",
            message="ClusteringService deprecado (ADR-004). Usar grupos curados.",
            capabilities=self._capabilities,
        )

    # -------------------------------------------------------------------------
    # Data Loading
    # -------------------------------------------------------------------------

    def load_products_for_clustering(
        self,
        session: Session,
        limit: int = 5000,
        necesidad_filter: Optional[str] = None,
    ) -> List[ProductForClustering]:
        """
        Load ``manual_review`` enrichments that have a NECESIDAD assigned.

        Rows are ordered by summed sales amount (highest first) and capped at
        ``limit``. The GROUP BY includes ``SalesEnrichment.id``, so every
        enrichment row forms its own group.

        Args:
            session: SQLAlchemy session
            limit: Maximum number of rows to return
            necesidad_filter: When truthy, keep only rows whose
                ``ml_category`` equals this value

        Returns:
            List of ProductForClustering, one per result row
        """
        grouping_columns = (
            SalesEnrichment.id,
            SalesData.product_name,
            SalesEnrichment.ml_category,
            SalesEnrichment.ml_subcategory,
            SalesEnrichment.detected_brand,
        )

        query = (
            session.query(
                SalesEnrichment.id.label('enrichment_id'),
                SalesData.product_name,
                SalesEnrichment.ml_category,
                SalesEnrichment.ml_subcategory,
                SalesEnrichment.detected_brand,
                func.sum(SalesData.total_amount).label('total_amount'),
                func.count(SalesData.id).label('sale_count'),
            )
            .join(SalesData, SalesEnrichment.sales_data_id == SalesData.id)
            .filter(SalesEnrichment.enrichment_status == 'manual_review')
            .filter(SalesEnrichment.ml_category.isnot(None))
            .group_by(*grouping_columns)
            .order_by(func.sum(SalesData.total_amount).desc())
        )

        # The optional filter must be attached before LIMIT is applied.
        if necesidad_filter:
            query = query.filter(SalesEnrichment.ml_category == necesidad_filter)

        rows = query.limit(limit).all()

        products = [
            ProductForClustering(
                sales_enrichment_id=str(row.enrichment_id),
                product_name=row.product_name,
                necesidad=row.ml_category,
                subcategory=row.ml_subcategory,
                detected_brand=row.detected_brand,
                # A NULL or zero sum falls back to 0
                total_amount=float(row.total_amount) if row.total_amount else 0,
                sale_count=row.sale_count,
            )
            for row in rows
        ]

        logger.info(
            "clustering_service.load_products",
            count=len(products),
            filter=necesidad_filter,
        )

        return products

    # -------------------------------------------------------------------------
    # Brand Detection
    # -------------------------------------------------------------------------

    def detect_brand(self, product_name: str) -> Optional[str]:
        """Detecta marca de un producto y la normaliza."""
        name_lower = product_name.lower()
        for marca in sorted(TODAS_LAS_MARCAS, key=len, reverse=True):
            if marca in name_lower:
                marca_clean = marca.strip()
                return MARCA_NORMALIZACION.get(marca_clean, marca_clean)
        return None

    # -------------------------------------------------------------------------
    # Text Preprocessing
    # -------------------------------------------------------------------------

    def preprocess_product_name(self, name: str) -> str:
        """Preprocesa nombre para mejor embedding."""
        # Eliminar cantidades de envase
        cleaned = re.sub(r'\d+\s*envase?s?\b', '', name, flags=re.IGNORECASE)
        cleaned = re.sub(r'\d+\s*tubo?s?\b', '', cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r'\d+\s*frasco?s?\b', '', cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r'\b\d+\s*(ml|g|gr|mg)\b(?!\s*spf)', '', cleaned, flags=re.IGNORECASE)
        cleaned = ' '.join(cleaned.split())
        return cleaned.strip()

    def enrich_text_for_embedding(self, product: ProductForClustering) -> str:
        """Enriquece texto de producto para mejor embedding."""
        parts = [self.preprocess_product_name(product.product_name)]

        if product.detected_brand:
            parts.append(f"MARCA: {product.detected_brand}")

        if product.necesidad:
            parts.append(f"NECESIDAD: {product.necesidad}")

        if product.subcategory:
            parts.append(f"ZONA: {product.subcategory}")

        return " | ".join(parts)

    # -------------------------------------------------------------------------
    # Embeddings
    # -------------------------------------------------------------------------

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Turn texts into vectors.

        Uses the sentence-transformers model named by ``EMBEDDING_MODEL``
        when that package is importable (the model is created on first use
        and cached on the instance). Otherwise falls back to a TF-IDF
        representation (unigrams and bigrams, at most 500 features)
        converted to a dense array.
        """
        if not HAS_SENTENCE_TRANSFORMERS:
            # Fallback path: TF-IDF fitted on the given texts only
            logger.warning("clustering_service.using_tfidf_fallback")
            from sklearn.feature_extraction.text import TfidfVectorizer
            tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
            return tfidf.fit_transform(texts).toarray()

        if self._model is None:
            logger.info("clustering_service.loading_model", model=EMBEDDING_MODEL)
            self._model = SentenceTransformer(EMBEDDING_MODEL)

        return self._model.encode(
            texts,
            show_progress_bar=True,
            batch_size=64
        )

    # -------------------------------------------------------------------------
    # Dimensionality Reduction
    # -------------------------------------------------------------------------

    def reduce_dimensions(
        self,
        embeddings: np.ndarray,
        n_components: int = UMAP_COMPONENTS,
    ) -> np.ndarray:
        """Reduce embedding dimensionality with UMAP, or PCA as a fallback.

        Args:
            embeddings: 2-D array; rows are treated as samples and columns
                as features (both code paths index ``embeddings.shape[1]``
                or unpack ``embeddings.shape``).
            n_components: Requested number of output dimensions.

        Returns:
            The transformed array returned by the reducer's
            ``fit_transform``.

        In the PCA fallback the number of components is capped at
        ``min(n_samples, n_features)``, because PCA rejects larger values;
        previously only the feature count was considered, so small inputs
        could fail.
        """
        if HAS_UMAP:
            logger.info(
                "clustering_service.umap_reduction",
                from_dim=embeddings.shape[1],
                to_dim=n_components,
            )
            reducer = umap.UMAP(
                n_components=n_components,
                n_neighbors=UMAP_NEIGHBORS,
                min_dist=0.1,
                metric='cosine',
                random_state=42  # fixed seed for reproducible layouts
            )
            return reducer.fit_transform(embeddings)

        # Fallback: PCA
        logger.warning("clustering_service.using_pca_fallback")
        from sklearn.decomposition import PCA
        n_samples, n_features = embeddings.shape
        pca = PCA(n_components=min(n_components, n_samples, n_features))
        return pca.fit_transform(embeddings)

    # -------------------------------------------------------------------------
    # Clustering
    # -------------------------------------------------------------------------

    def cluster_products(
        self,
        embeddings: np.ndarray,
        min_cluster_size: int = MIN_CLUSTER_SIZE,
    ) -> np.ndarray:
        """Assign a cluster label to every row of ``embeddings``.

        Uses HDBSCAN when available (label -1 marks noise points) and
        KMeans otherwise.

        Args:
            embeddings: Array with one row per product.
            min_cluster_size: HDBSCAN ``min_cluster_size``; ignored by the
                KMeans fallback.

        Returns:
            Array of labels as returned by the clusterer's ``fit_predict``.

        The KMeans fallback asks for ``max(10, n // 20)`` clusters, capped
        at the number of rows: KMeans cannot fit more clusters than samples,
        so the uncapped value failed for inputs with fewer than 10 rows.
        """
        if HAS_HDBSCAN:
            logger.info(
                "clustering_service.hdbscan",
                min_cluster_size=min_cluster_size,
            )
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=MIN_SAMPLES,
                metric='euclidean',
                cluster_selection_method='eom'
            )
            labels = clusterer.fit_predict(embeddings)

            # -1 is the noise label and does not count as a cluster
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_outliers = int((labels == -1).sum())
            logger.info(
                "clustering_service.hdbscan_result",
                n_clusters=n_clusters,
                n_outliers=n_outliers,
            )
            return labels

        # Fallback: KMeans
        logger.warning("clustering_service.using_kmeans_fallback")
        from sklearn.cluster import KMeans
        n_samples = len(embeddings)
        n_clusters = min(max(10, n_samples // 20), n_samples)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        return kmeans.fit_predict(embeddings)

    # -------------------------------------------------------------------------
    # Interchangeable Groups
    # -------------------------------------------------------------------------

    def get_category_group(self, necesidad: str) -> str:
        """Obtiene grupo de categoría de una necesidad."""
        if not necesidad:
            return 'otros'
        return NECESIDAD_TO_GRUPO.get(necesidad.lower(), 'otros')

    def are_categories_compatible(self, necesidades: List[str]) -> bool:
        """Verifica si necesidades son del mismo grupo."""
        grupos = set(self.get_category_group(n) for n in necesidades if n)
        return len(grupos) <= 1

    def find_interchangeable_groups(
        self,
        products: List[ProductForClustering],
        labels: np.ndarray,
        min_brands: int = MIN_BRANDS_FOR_GROUP,
        necesidad_threshold: float = NECESIDAD_THRESHOLD,
    ) -> List[InterchangeableGroupInfo]:
        """
        Identify interchangeable groups among the clustered products.

        A cluster becomes a group when:
        1. it has at least ``min_brands`` distinct brands,
        2. the share of products carrying the majority necesidad is at least
           ``necesidad_threshold``,
        3. its products belong to a single category group. Clusters mixing
           category groups are first reduced to the products of the
           majority necesidad's category group.

        The necesidad share is computed on the products that actually end
        up in the group. Earlier, clusters that needed category filtering
        had their share forced to 1.0 and therefore skipped criterion 2,
        while clusters within one category group had to meet it.

        Args:
            products: Products in the same order as ``labels``.
            labels: Cluster label per product; -1 (noise) is ignored.
            min_brands: Minimum number of distinct brands per group.
            necesidad_threshold: Minimum majority-necesidad share (0..1).

        Returns:
            Groups sorted by total amount, highest first.
        """
        # Bucket products by cluster label, leaving out noise points.
        clusters: Dict[int, List[ProductForClustering]] = defaultdict(list)
        for product, label in zip(products, labels):
            if label != -1:
                clusters[label].append(product)

        interchangeable_groups = []

        for cluster_products in clusters.values():
            if len(cluster_products) < 2:
                continue

            todas_necesidades = [p.necesidad for p in cluster_products if p.necesidad]
            if not todas_necesidades:
                continue

            # Majority necesidad (ties resolve to the first one encountered)
            necesidad_principal, count_principal = (
                Counter(todas_necesidades).most_common(1)[0]
            )

            if not self.are_categories_compatible(todas_necesidades):
                # Keep only products of the majority necesidad's category
                # group. Every product with the majority necesidad survives
                # this filter, so count_principal stays valid.
                grupo_principal = self.get_category_group(necesidad_principal)
                cluster_products = [
                    p for p in cluster_products
                    if self.get_category_group(p.necesidad) == grupo_principal
                ]
                if len(cluster_products) < 2:
                    continue

            marcas = self._collect_brands(cluster_products)
            if len(marcas) < min_brands:
                continue

            ratio = count_principal / len(cluster_products)
            if ratio < necesidad_threshold:
                continue

            # Most common subcategory among products that have one
            subcategory_counts = Counter(
                p.subcategory for p in cluster_products if p.subcategory
            )
            subcategory = (
                subcategory_counts.most_common(1)[0][0]
                if subcategory_counts else None
            )

            interchangeable_groups.append(InterchangeableGroupInfo(
                necesidad=necesidad_principal,
                subcategory=subcategory,
                brands=sorted(marcas),
                products=cluster_products,
                total_amount=sum(p.total_amount for p in cluster_products),
                brand_count=len(marcas),
                product_count=len(cluster_products),
            ))

        # Highest revenue first
        interchangeable_groups.sort(key=lambda g: g.total_amount, reverse=True)

        logger.info(
            "clustering_service.interchangeable_found",
            n_groups=len(interchangeable_groups),
            total_products=sum(g.product_count for g in interchangeable_groups),
        )

        return interchangeable_groups

    def _collect_brands(self, products: List[ProductForClustering]) -> Set[str]:
        """Return the distinct brands of ``products``.

        The stored ``detected_brand`` is preferred; when it is missing the
        brand is detected from the product name.
        """
        # NOTE(review): stored brands are used verbatim while detected ones
        # are lower-case and normalised, so spelling variants of one brand
        # could be counted twice -- verify how detected_brand is populated.
        marcas: Set[str] = set()
        for p in products:
            marca = p.detected_brand or self.detect_brand(p.product_name)
            if marca:
                marcas.add(marca)
        return marcas

    # -------------------------------------------------------------------------
    # Main Pipeline
    # -------------------------------------------------------------------------

    def run_clustering(
        self,
        session: Session,
        limit: int = 5000,
        necesidad_filter: Optional[str] = None,
        min_cluster_size: int = MIN_CLUSTER_SIZE,
        umap_components: int = UMAP_COMPONENTS,
    ) -> ClusteringResult:
        """
        Run the full clustering pipeline.

        Steps: load products, build enriched texts, embed, reduce
        dimensions, cluster, and derive interchangeable groups.

        Args:
            session: SQLAlchemy session
            limit: Maximum number of products to load
            necesidad_filter: Restrict the run to one necesidad
            min_cluster_size: Minimum cluster size passed to clustering
            umap_components: Target dimensionality of the reduction step

        Returns:
            ClusteringResult with the identified groups. With fewer than 10
            products, or when embedding generation fails, an empty result
            carrying an error message is returned instead of raising. A
            failure in dimensionality reduction is logged and the raw
            embeddings are clustered instead. Errors raised by the
            clustering step itself propagate to the caller.
        """
        errors: List[str] = []

        # 1. Load products
        logger.info("clustering_service.step1_loading")
        products = self.load_products_for_clustering(
            session,
            limit=limit,
            necesidad_filter=necesidad_filter,
        )
        total_products = len(products)

        if total_products < 10:
            return self._empty_result(
                total_products,
                ["Insufficient products for clustering (< 10)"],
            )

        # 2. Build enriched texts
        logger.info("clustering_service.step2_enriching")
        texts = [self.enrich_text_for_embedding(p) for p in products]

        # 3. Generate embeddings
        logger.info("clustering_service.step3_embeddings")
        try:
            embeddings = self.generate_embeddings(texts)
        except Exception as e:
            logger.error("clustering_service.embeddings_error", error=str(e))
            errors.append(f"Embeddings error: {str(e)}")
            return self._empty_result(total_products, errors)

        # 4. Reduce dimensions (best effort)
        logger.info("clustering_service.step4_umap")
        try:
            reduced = self.reduce_dimensions(embeddings, n_components=umap_components)
        except Exception as e:
            logger.error("clustering_service.umap_error", error=str(e))
            reduced = embeddings  # fall back to the original embeddings

        # 5. Clustering
        logger.info("clustering_service.step5_clustering")
        labels = self.cluster_products(reduced, min_cluster_size=min_cluster_size)

        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        # Convert the numpy scalar to a built-in int so the result matches
        # its declared field type and serialises cleanly.
        n_outliers = int((labels == -1).sum())

        # 6. Identify interchangeable groups
        logger.info("clustering_service.step6_interchangeable")
        groups = self.find_interchangeable_groups(products, labels)

        products_in_groups = sum(g.product_count for g in groups)
        # total_products is at least 10 here, so the division is safe
        coverage = products_in_groups / total_products * 100

        return ClusteringResult(
            total_products=total_products,
            n_clusters=n_clusters,
            n_outliers=n_outliers,
            n_interchangeable_groups=len(groups),
            products_in_groups=products_in_groups,
            coverage_percent=coverage,
            groups=groups,
            errors=errors,
        )

    @staticmethod
    def _empty_result(total_products: int, errors: List[str]) -> ClusteringResult:
        """Build a result with no clusters or groups, carrying ``errors``."""
        return ClusteringResult(
            total_products=total_products,
            n_clusters=0,
            n_outliers=0,
            n_interchangeable_groups=0,
            products_in_groups=0,
            coverage_percent=0.0,
            groups=[],
            errors=errors,
        )

    # -------------------------------------------------------------------------
    # Persistence
    # -------------------------------------------------------------------------

    def persist_groups(
        self,
        session: Session,
        result: ClusteringResult,
        dry_run: bool = False,
    ) -> Dict[str, Any]:
        """
        Persist groups as IntercambiableGroup rows and link sales_enrichment.

        For every group a slug is generated. An existing row with that slug
        is reused as-is (its stored counters are not refreshed); otherwise a
        new row is created. Each product's SalesEnrichment row is then
        pointed at the group. Everything is committed once at the end.

        If anything fails while writing, the session is rolled back before
        the exception is re-raised, so the caller does not inherit a
        half-applied transaction.

        Args:
            session: SQLAlchemy session
            result: Clustering result to persist
            dry_run: Only log what would be created; write nothing

        Returns:
            Dict with ``groups_created``, ``enrichments_updated`` and
            ``dry_run``
        """
        stats = {
            'groups_created': 0,
            'enrichments_updated': 0,
            'dry_run': dry_run,
        }

        if not result.groups:
            return stats

        try:
            for group_info in result.groups:
                slug = self._generate_slug(group_info)

                # Reuse a previously stored group with the same slug
                existing = session.query(IntercambiableGroup).filter_by(slug=slug).first()

                if existing:
                    group = existing
                    logger.info(
                        "clustering_service.group_exists",
                        slug=slug,
                        group_id=str(group.id),
                    )
                elif dry_run:
                    logger.info(
                        "clustering_service.would_create_group",
                        slug=slug,
                        necesidad=group_info.necesidad,
                        brands=group_info.brands,
                    )
                    continue
                else:
                    group = IntercambiableGroup(
                        id=uuid.uuid4(),
                        name=self._generate_group_name(group_info),
                        slug=slug,
                        description=self._generate_description(group_info),
                        necesidad=group_info.necesidad,
                        subcategory=group_info.subcategory,
                        min_brands=MIN_BRANDS_FOR_GROUP,
                        min_products=MIN_CLUSTER_SIZE,
                        clustering_method='semantic',
                        validated=False,
                        product_count=group_info.product_count,
                        brand_count=group_info.brand_count,
                        total_sales_amount=group_info.total_amount,
                        total_sales_count=sum(p.sale_count for p in group_info.products),
                    )
                    session.add(group)
                    stats['groups_created'] += 1

                    logger.info(
                        "clustering_service.group_created",
                        slug=slug,
                        group_id=str(group.id),
                        brands=group_info.brands,
                    )

                if dry_run:
                    continue

                # Link each product's enrichment row to the group
                for product in group_info.products:
                    enrichment = session.query(SalesEnrichment).get(
                        uuid.UUID(product.sales_enrichment_id)
                    )
                    if enrichment:
                        enrichment.intercambiable_group_id = group.id
                        stats['enrichments_updated'] += 1

            if not dry_run:
                session.commit()
        except Exception as e:
            # This method owns the commit, so it also owns the rollback.
            if not dry_run:
                session.rollback()
            logger.error("clustering_service.persist_error", error=str(e))
            raise

        logger.info("clustering_service.persist_complete", stats=stats)
        return stats

    def _generate_slug(self, group: InterchangeableGroupInfo) -> str:
        """Genera slug único para grupo."""
        base = f"{group.necesidad}"
        if group.subcategory:
            base += f"-{group.subcategory}"
        # Añadir hash de marcas para unicidad
        brands_hash = hash(tuple(sorted(group.brands))) % 10000
        return f"{base}-{brands_hash}".lower().replace(' ', '_')

    def _generate_group_name(self, group: InterchangeableGroupInfo) -> str:
        """Genera nombre descriptivo."""
        necesidad_formatted = group.necesidad.replace('_', ' ').title()
        brands_str = ', '.join(group.brands[:3])
        if len(group.brands) > 3:
            brands_str += f' +{len(group.brands) - 3}'
        return f"{necesidad_formatted} ({brands_str})"

    def _generate_description(self, group: InterchangeableGroupInfo) -> str:
        """Genera descripción del grupo."""
        return (
            f"Grupo de {group.product_count} productos intercambiables "
            f"de {group.brand_count} marcas ({', '.join(group.brands)}) "
            f"para {group.necesidad.replace('_', ' ')}. "
            f"Importe total: {group.total_amount:,.2f}€"
        )


# =============================================================================
# SINGLETON
# =============================================================================

# Shared module-level instance.
# NOTE: the instance is created when this module is imported, so
# ClusteringService.__init__ runs at import time and its DeprecationWarning
# and "clustering_service.deprecated" log entry are emitted on every import.
clustering_service = ClusteringService()
