# backend/app/services/keyword_override_service.py
"""
Service for keyword override management.

Issue #449: Dynamic keyword management for NECESIDAD classifier.

Este servicio gestiona los overrides de keywords que se almacenan en BD
y tienen prioridad sobre los keywords hardcodeados en el código Python.

Arquitectura de prioridad:
    1. DB Blacklist (keyword_type='blacklist') - Máxima prioridad
    2. DB Brands (keyword_type='brand') - Alta prioridad
    3. DB Keywords (keyword_type='keyword')
    4. Código MARCA_NECESIDAD - Fallback brands
    5. Código NECESIDAD_KEYWORDS - Fallback keywords
"""

import logging
from datetime import datetime, timezone
from typing import Optional
from uuid import UUID

from sqlalchemy import func, or_
from sqlalchemy.orm import Session

from app.models.keyword_override import KeywordOverride, KeywordType
from app.models.product_catalog_venta_libre import ProductCatalogVentaLibre
from app.schemas.keyword_override import (
    KeywordBulkImportItem,
    KeywordBulkImportRequest,
    KeywordOverrideCreate,
    KeywordOverrideUpdate,
    KeywordPreviewRequest,
)

logger = logging.getLogger(__name__)


class KeywordOverrideService:
    """
    Service para CRUD de keyword overrides.

    Integra con SandwichClassifierService para que los cambios
    tomen efecto inmediatamente (cache invalidation).
    """

    def __init__(self, db: Session) -> None:
        """Store the SQLAlchemy session that every method of this service queries through."""
        self.db = db

    # =========================================================================
    # CRUD Operations
    # =========================================================================

    def list_keywords(
        self,
        keyword_type: Optional[str] = None,
        category: Optional[str] = None,
        is_active: Optional[bool] = None,
        search: Optional[str] = None,
        limit: int = 50,
        offset: int = 0,
    ) -> dict:
        """
        Return one page of keyword overrides together with summary statistics.

        Args:
            keyword_type: Restrict to one type (keyword, brand, blacklist);
                the value is lowercased before comparing.
            category: Restrict to one NECESIDAD category (exact match).
            is_active: Restrict by active flag; None disables this filter.
            search: Case-insensitive substring to look for in the keyword text.
            limit: Maximum number of rows in the page.
            offset: Number of rows skipped before the page starts.

        Returns:
            dict with items, total, offset, limit, stats. ``total`` counts the
            rows matching the filters; ``stats`` comes from ``_get_stats`` and
            does not take the filters into account.
        """
        criteria = []
        if keyword_type:
            criteria.append(KeywordOverride.keyword_type == keyword_type.lower())
        if category:
            criteria.append(KeywordOverride.category == category)
        if is_active is not None:
            criteria.append(KeywordOverride.is_active == is_active)
        if search:
            # Neutralise LIKE metacharacters so user input is matched literally.
            # The backslash must be escaped first, it is the escape character.
            needle = search
            for special in ("\\", "%", "_"):
                needle = needle.replace(special, "\\" + special)
            criteria.append(KeywordOverride.keyword.ilike(f"%{needle}%", escape="\\"))

        filtered = self.db.query(KeywordOverride)
        for criterion in criteria:
            filtered = filtered.filter(criterion)

        total = filtered.count()

        # Page ordering: type, then highest priority first, then keyword text.
        ordering = (
            KeywordOverride.keyword_type,
            KeywordOverride.priority.desc(),
            KeywordOverride.keyword,
        )
        rows = filtered.order_by(*ordering).offset(offset).limit(limit).all()

        stats = self._get_stats()

        plain_fields = (
            "id",
            "keyword",
            "category",
            "keyword_type",
            "priority",
            "is_active",
            "created_by",
            "created_at",
            "updated_at",
            "source",
            "notes",
        )

        def _serialize(row) -> dict:
            """Flatten one override row into the response dict."""
            payload = {name: getattr(row, name) for name in plain_fields}
            creator = row.creator
            payload["creator_email"] = creator.email if creator else None
            return payload

        return {
            "items": [_serialize(row) for row in rows],
            "total": total,
            "offset": offset,
            "limit": limit,
            "stats": stats,
        }

    def create_keyword(
        self, data: KeywordOverrideCreate, created_by: UUID
    ) -> KeywordOverride:
        """
        Create a new keyword override.

        The keyword is stored lowercased and stripped. After a successful
        commit the classifier cache is invalidated.

        Args:
            data: Keyword payload (keyword, category, type, priority, ...).
            created_by: UUID of the user creating the override.

        Returns:
            The persisted KeywordOverride, refreshed from the database.

        Raises:
            ValueError: If the keyword is empty after normalization, or if the
                same keyword already exists for that keyword type.
        """
        keyword = data.keyword.lower().strip()

        # An empty keyword is a substring of every product name, so applying
        # it would reclassify the whole catalog. Reject it up front.
        if not keyword:
            raise ValueError("El keyword no puede estar vacío")

        existing = (
            self.db.query(KeywordOverride)
            .filter(
                KeywordOverride.keyword == keyword,
                KeywordOverride.keyword_type == data.keyword_type,
            )
            .first()
        )
        if existing:
            raise ValueError(
                f"Keyword '{keyword}' ya existe para tipo '{data.keyword_type}'"
            )

        override = KeywordOverride(
            keyword=keyword,
            category=data.category,
            keyword_type=data.keyword_type,
            priority=data.priority,
            is_active=data.is_active,
            notes=data.notes,
            created_by=created_by,
            source="manual",
        )
        self.db.add(override)

        try:
            self.db.commit()
        except Exception:
            # The duplicate check above is not atomic with the insert; if the
            # commit fails, reset the session so it stays usable, then
            # propagate the original error.
            self.db.rollback()
            raise
        self.db.refresh(override)

        self._invalidate_classifier_cache()

        logger.info("Created keyword override: %s -> %s", keyword, data.category)
        return override

    def update_keyword(
        self, keyword_id: int, data: KeywordOverrideUpdate
    ) -> KeywordOverride:
        """
        Update an existing keyword override.

        Only the fields explicitly set on ``data`` are written. A new keyword
        text is stored lowercased and stripped. After a successful commit the
        classifier cache is invalidated.

        Args:
            keyword_id: ID of the override to update.
            data: Partial payload with the fields to change.

        Returns:
            The updated KeywordOverride, refreshed from the database.

        Raises:
            ValueError: If the override does not exist, if the new keyword is
                empty after normalization, or if the resulting
                (keyword, keyword_type) pair already belongs to another row.
        """
        override = (
            self.db.query(KeywordOverride)
            .filter(KeywordOverride.id == keyword_id)
            .first()
        )
        if not override:
            raise ValueError(f"Keyword override {keyword_id} no encontrado")

        update_data = data.model_dump(exclude_unset=True)

        if update_data.get("keyword") is not None:
            normalized = update_data["keyword"].lower().strip()
            # An empty keyword would match every product name when applied.
            if not normalized:
                raise ValueError("El keyword no puede estar vacío")
            update_data["keyword"] = normalized

        # Same uniqueness rule as create_keyword: one row per
        # (keyword, keyword_type). Only checked when one of the two changes.
        if "keyword" in update_data or "keyword_type" in update_data:
            new_keyword = update_data.get("keyword", override.keyword)
            new_type = update_data.get("keyword_type", override.keyword_type)
            duplicate = (
                self.db.query(KeywordOverride)
                .filter(
                    KeywordOverride.keyword == new_keyword,
                    KeywordOverride.keyword_type == new_type,
                    KeywordOverride.id != keyword_id,
                )
                .first()
            )
            if duplicate:
                raise ValueError(
                    f"Keyword '{new_keyword}' ya existe para tipo '{new_type}'"
                )

        for field, value in update_data.items():
            setattr(override, field, value)

        try:
            self.db.commit()
        except Exception:
            # Leave the session usable for the caller, then propagate.
            self.db.rollback()
            raise
        self.db.refresh(override)

        self._invalidate_classifier_cache()

        logger.info("Updated keyword override %s", keyword_id)
        return override

    def delete_keyword(self, keyword_id: int, hard_delete: bool = False) -> dict:
        """
        Deactivate (default) or permanently remove a keyword override.

        Args:
            keyword_id: ID of the override.
            hard_delete: When True the row is deleted; otherwise it is only
                flagged ``is_active = False``.

        Returns:
            dict with ``message`` and ``id``.

        Raises:
            ValueError: If no override has that ID.
        """
        lookup = self.db.query(KeywordOverride).filter(
            KeywordOverride.id == keyword_id
        )
        target = lookup.first()

        if not target:
            raise ValueError(f"Keyword override {keyword_id} no encontrado")

        if not hard_delete:
            # Soft delete: keep the row, just switch it off.
            target.is_active = False
            message = f"Desactivado keyword override {keyword_id}"
        else:
            self.db.delete(target)
            message = f"Eliminado permanentemente keyword override {keyword_id}"
        self.db.commit()

        # The classifier must reload its overrides after either kind of removal.
        self._invalidate_classifier_cache()

        logger.info(message)
        return {"message": message, "id": keyword_id}

    # =========================================================================
    # Preview & Analysis
    # =========================================================================

    def preview_impact(self, request: KeywordPreviewRequest) -> dict:
        """
        Preview which products would be reclassified by a keyword.

        Looks for rows in ProductCatalogVentaLibre whose normalized name
        contains the keyword (lowercased, stripped) and whose current
        ``ml_category`` is missing or different from the requested one.
        Nothing is modified.

        Args:
            request: Preview payload (keyword, category, limit).

        Returns:
            dict with ``keyword``, ``new_category``, ``affected_count`` (all
            matching products) and ``sample_products`` (at most
            ``request.limit`` entries). ``confidence_change`` is 0.95 minus the
            product's current ``ml_confidence`` (a missing value counts as 0).
        """
        keyword = request.keyword.lower().strip()

        name_matches = func.lower(
            ProductCatalogVentaLibre.product_name_normalized
        ).contains(
            keyword,
            # Escape % and _ so the keyword is matched literally, the same
            # guarantee list_keywords gives for its search text.
            autoescape=True,
        )
        category_differs = or_(
            ProductCatalogVentaLibre.ml_category != request.category,
            ProductCatalogVentaLibre.ml_category.is_(None),
        )
        query = self.db.query(ProductCatalogVentaLibre).filter(
            name_matches, category_differs
        )

        total_affected = query.count()
        products = query.limit(request.limit).all()

        sample_products = [
            {
                "product_name": (
                    p.product_name_display[:100]
                    if p.product_name_display
                    else "Sin nombre"
                ),
                "current_category": p.ml_category,
                "new_category": request.category,
                "confidence_change": 0.95 - (p.ml_confidence or 0),
            }
            for p in products
        ]

        return {
            "keyword": keyword,
            "new_category": request.category,
            "affected_count": total_affected,
            "sample_products": sample_products,
        }

    def apply_keywords_to_products(
        self,
        dry_run: bool = True,
        keyword_ids: Optional[list[int]] = None,
    ) -> dict:
        """
        Apply keyword overrides to existing products (Issue #449 Phase 3).

        Scans the products in ProductCatalogVentaLibre that are not
        ``human_verified`` and, for each one, takes the first override whose
        keyword is a substring of the lowercased normalized name. Overrides
        are tried in this order: blacklist, brand, keyword; within a type,
        highest priority first. If the matched override's category differs
        from the product's ``ml_category`` the product counts as reclassified
        and, unless ``dry_run``, is updated.

        Args:
            dry_run: If True, only preview changes without writing anything.
            keyword_ids: Specific override IDs to apply. If None (or empty),
                all active overrides are applied.

        Returns:
            dict with ``dry_run``, ``total_products_scanned``,
            ``products_reclassified``, ``sample_changes`` (first 50 changes)
            and ``message``.
        """
        # Products are processed in chunks to keep memory bounded
        # (Render has a 512MB limit and the catalog has 67k+ products).
        BATCH_SIZE = 1000
        # Only this many change records are returned, so only this many are kept.
        SAMPLE_LIMIT = 50
        type_order = {"blacklist": 0, "brand": 1, "keyword": 2}

        query = self.db.query(KeywordOverride).filter(KeywordOverride.is_active == True)
        if keyword_ids:
            query = query.filter(KeywordOverride.id.in_(keyword_ids))

        # Empty keywords are dropped: "" is a substring of every name and
        # would reclassify the whole catalog.
        ordered = sorted(
            (kw for kw in query.all() if kw.keyword),
            key=lambda k: (
                type_order.get(k.keyword_type, 3),
                -(k.priority or 0),
                k.keyword,  # deterministic tie-break
            ),
        )
        # Snapshot into plain tuples: expire_all() below would otherwise expire
        # the ORM keyword rows and force a reload of each one on every batch.
        rules = [(kw.keyword, kw.category, kw.keyword_type) for kw in ordered]

        if not rules:
            return {
                "dry_run": dry_run,
                "total_products_scanned": 0,
                "products_reclassified": 0,
                "sample_changes": [],
                "message": "No hay keywords activos para aplicar",
            }

        base_query = (
            self.db.query(ProductCatalogVentaLibre)
            .filter(ProductCatalogVentaLibre.human_verified == False)
            .order_by(ProductCatalogVentaLibre.id)  # Consistent ordering for pagination
        )

        offset = 0
        total_scanned = 0
        reclassified = 0
        sample_changes = []

        try:
            while True:
                batch = base_query.offset(offset).limit(BATCH_SIZE).all()
                if not batch:
                    break

                total_scanned += len(batch)
                batch_modified = False

                for product in batch:
                    if not product.product_name_normalized:
                        continue
                    name_lower = product.product_name_normalized.lower()

                    # First match wins, in the precedence order built above.
                    match = next(
                        (rule for rule in rules if rule[0] in name_lower), None
                    )
                    if match is None:
                        continue

                    keyword, category, keyword_type = match
                    if product.ml_category == category:
                        continue

                    reclassified += 1
                    if len(sample_changes) < SAMPLE_LIMIT:
                        sample_changes.append({
                            "product_id": product.id,
                            "product_name": (
                                product.product_name_display[:80]
                                if product.product_name_display
                                else "Sin nombre"
                            ),
                            "old_category": product.ml_category,
                            "new_category": category,
                            "matched_keyword": keyword,
                            "keyword_type": keyword_type,
                        })

                    if not dry_run:
                        product.ml_category = category
                        product.ml_confidence = 0.95  # High confidence for keyword match
                        product.prediction_source = "keyword_override"
                        batch_modified = True

                # Push this batch's updates to the database before the loaded
                # state is expired, so nothing pending is discarded.
                if batch_modified:
                    self.db.flush()
                self.db.expire_all()

                offset += BATCH_SIZE

                # Log progress for large catalogs
                if total_scanned % 10000 == 0:
                    logger.info(
                        "Keyword apply progress: %s products scanned, %s changes",
                        total_scanned,
                        reclassified,
                    )

            if not dry_run and reclassified:
                self.db.commit()
        except Exception:
            # Do not leave a half-applied run pending in the session.
            self.db.rollback()
            raise

        if not dry_run and reclassified:
            self._invalidate_classifier_cache()
            logger.info("Applied keywords: %s products reclassified", reclassified)

        action = "serían reclasificados" if dry_run else "fueron reclasificados"
        message = f"{reclassified} productos {action} de {total_scanned} escaneados"

        return {
            "dry_run": dry_run,
            "total_products_scanned": total_scanned,
            "products_reclassified": reclassified,
            "sample_changes": sample_changes,
            "message": message,
        }

    # =========================================================================
    # Bulk Operations
    # =========================================================================

    def import_keywords(
        self, request: KeywordBulkImportRequest, created_by: UUID
    ) -> dict:
        """
        Import keyword overrides in bulk.

        Each item is normalized (lowercased, stripped) and looked up by
        (keyword, keyword_type). An existing row is updated (category,
        priority, updated_at) when ``request.overwrite_existing`` is set and
        skipped otherwise; a missing row is created with source "imported".

        Every item runs inside its own SAVEPOINT, so an item the database
        rejects is rolled back on its own and reported in ``errors`` without
        affecting the other items.

        Args:
            request: Items to import plus the ``overwrite_existing`` flag.
            created_by: UUID of the importing user (stored on new rows).

        Returns:
            dict with created, updated, skipped (counts) and errors
            (list of ``{"keyword", "error"}``).
        """
        counts = {"created": 0, "updated": 0, "skipped": 0}
        errors = []

        for item in request.items:
            try:
                keyword = item.keyword.lower().strip()
                # An empty keyword would match every product name when applied.
                if not keyword:
                    raise ValueError("El keyword no puede estar vacío")

                # Leaving the block releases the savepoint, which flushes this
                # item; a database error therefore surfaces here, for the item
                # that caused it, and only this item is rolled back.
                with self.db.begin_nested():
                    existing = (
                        self.db.query(KeywordOverride)
                        .filter(
                            KeywordOverride.keyword == keyword,
                            KeywordOverride.keyword_type == item.keyword_type,
                        )
                        .first()
                    )

                    if existing is None:
                        self.db.add(
                            KeywordOverride(
                                keyword=keyword,
                                category=item.category,
                                keyword_type=item.keyword_type,
                                priority=item.priority,
                                is_active=True,
                                created_by=created_by,
                                source="imported",
                            )
                        )
                        outcome = "created"
                    elif request.overwrite_existing:
                        existing.category = item.category
                        existing.priority = item.priority
                        existing.updated_at = datetime.now(timezone.utc)
                        outcome = "updated"
                    else:
                        outcome = "skipped"
            except Exception as e:
                errors.append({"keyword": item.keyword, "error": str(e)})
                continue

            # Counted only once the item has actually reached the database.
            counts[outcome] += 1

        try:
            self.db.commit()
        except Exception:
            self.db.rollback()
            raise

        # Nothing to reload when no row was written.
        if counts["created"] or counts["updated"]:
            self._invalidate_classifier_cache()

        logger.info(
            "Bulk import: created=%s, updated=%s, skipped=%s, errors=%s",
            counts["created"],
            counts["updated"],
            counts["skipped"],
            len(errors),
        )

        return {
            "created": counts["created"],
            "updated": counts["updated"],
            "skipped": counts["skipped"],
            "errors": errors,
        }

    def export_keywords(
        self, format: str = "json", include_inactive: bool = False
    ) -> list:
        """
        Export keyword overrides as a list of plain dicts.

        Args:
            format: Requested output format (json or csv). This method does
                not read it; the same list is returned either way.
            include_inactive: When False, only active overrides are exported.

        Returns:
            One dict per override (keyword, category, keyword_type, priority,
            is_active, notes), ordered by keyword type and then keyword.
        """
        exported_fields = (
            "keyword",
            "category",
            "keyword_type",
            "priority",
            "is_active",
            "notes",
        )

        selection = self.db.query(KeywordOverride)
        if not include_inactive:
            selection = selection.filter(KeywordOverride.is_active == True)
        selection = selection.order_by(
            KeywordOverride.keyword_type, KeywordOverride.keyword
        )

        return [
            {name: getattr(row, name) for name in exported_fields}
            for row in selection.all()
        ]

    # =========================================================================
    # Classifier Integration
    # =========================================================================

    def get_active_overrides(self) -> dict[str, dict[str, str]]:
        """
        Load every active override, grouped the way the classifier consumes them.

        Rows are read highest priority first and placed in the bucket that
        corresponds to their keyword type; rows of any other type are ignored.

        Returns:
            {
                "brands": {"olistic": "caida_cabello", ...},
                "keywords": {"anticaida": "caida_cabello", ...},
                "blacklist": {"regalo": "interno_no_venta", ...}
            }
        """
        active_rows = (
            self.db.query(KeywordOverride)
            .filter(KeywordOverride.is_active == True)
            .order_by(KeywordOverride.priority.desc())
            .all()
        )

        grouped = {"brands": {}, "keywords": {}, "blacklist": {}}

        # (stored type value, bucket name), checked in this order with ==.
        routing = (
            (KeywordType.BRAND.value, "brands"),
            (KeywordType.KEYWORD.value, "keywords"),
            (KeywordType.BLACKLIST.value, "blacklist"),
        )

        for row in active_rows:
            for type_value, bucket in routing:
                if row.keyword_type == type_value:
                    grouped[bucket][row.keyword] = row.category
                    break

        return grouped

    # =========================================================================
    # Private Methods
    # =========================================================================

    def _get_stats(self) -> dict:
        """
        Count active keyword overrides per type and per category.

        Returns:
            dict with ``by_type`` (type -> count) and ``by_category``
            (category -> count, limited to the 10 largest categories).
        """

        def _active_counts(column):
            """Query of (column value, number of active overrides) grouped by column."""
            return (
                self.db.query(column, func.count(KeywordOverride.id))
                .filter(KeywordOverride.is_active == True)
                .group_by(column)
            )

        type_rows = _active_counts(KeywordOverride.keyword_type).all()

        # Top 10 categories by number of overrides.
        category_rows = (
            _active_counts(KeywordOverride.category)
            .order_by(func.count(KeywordOverride.id).desc())
            .limit(10)
            .all()
        )

        type_counts = {str(kind): amount for kind, amount in type_rows}
        category_counts = {name: amount for name, amount in category_rows}

        return {"by_type": type_counts, "by_category": category_counts}

    def _invalidate_classifier_cache(self):
        """
        Best-effort reset of the classifier's cached override state.

        Sets ``_db_overrides`` to None and clears ``_cache`` on the object
        returned by ``get_sandwich_classifier``, when those attributes exist.
        Any failure (including the import itself) is logged as a warning and
        never raised to the caller.
        """
        try:
            from app.services.sandwich_classifier_service import get_sandwich_classifier

            sandwich = get_sandwich_classifier()

            # Overrides loaded from the database: drop them.
            if hasattr(sandwich, "_db_overrides"):
                sandwich._db_overrides = None

            # Cached results held by the classifier: empty them.
            if hasattr(sandwich, "_cache"):
                sandwich._cache.clear()

            logger.info("Invalidated classifier cache after keyword change")
        except Exception as exc:
            # A keyword change must not fail because the classifier is unavailable.
            logger.warning(f"Could not invalidate classifier cache: {exc}")
