"""Add LLM enrichment pipeline and ProductCluster (Issue #456)

Revision ID: 20251218_01_llm_pipeline
Revises: 20251216_02_brand_catalog
Create Date: 2025-12-18

Issue #456: Pipeline de Clasificación Venta Libre (M1-M2)

Cambios:
1. Nueva tabla: product_cluster (clusters jerárquicos con sistema de anclas)
2. Nuevo enum: cluster_state_enum (provisional, locked, archived)
3. Nuevos campos en sales_enrichment:
   - product_cluster_id (FK a cluster para clasificación)
   - llm_indicaciones, llm_composicion, llm_modo_empleo (datos LLM)
   - llm_confidence, llm_enriched_at, llm_model_version (metadata LLM)

ADR: Modelo ProductCluster con state machine para validación farmacéutica
"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql


# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the migration it
# revises (see the "Revises" line in the module docstring).
revision = "20251218_01_llm_pipeline"
down_revision = "20251216_02_brand_catalog"
# No branch labels and no cross-branch dependencies are declared.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Create the cluster enum, the ``product_cluster`` table and the LLM fields.

    Idempotent migration (REGLA #14 in CLAUDE.md): every object is looked up
    in the PostgreSQL catalogs first and is only created when the lookup
    reports it as missing.

    Steps, in order:
        1. ``cluster_state_enum`` type (provisional, locked, archived).
        2. ``product_cluster`` table together with its indexes.
        3. ``product_cluster_id`` (indexed FK) and six ``llm_*`` columns on
           ``sales_enrichment``.
        4. Composite index ``ix_sales_enrichment_llm_status``.
    """
    conn = op.get_bind()

    def column_exists(column_name: str) -> bool:
        """Return True if ``public.sales_enrichment`` already has the column."""
        return bool(conn.execute(
            sa.text("""
                SELECT EXISTS (
                    SELECT FROM information_schema.columns
                    WHERE table_schema = 'public'
                    AND table_name = 'sales_enrichment'
                    AND column_name = :column_name
                )
            """),
            {"column_name": column_name},
        ).scalar())

    # === 1. CREATE ENUM cluster_state_enum ===
    enum_exists = conn.execute(
        sa.text("""
            SELECT EXISTS (
                SELECT FROM pg_type
                WHERE typname = 'cluster_state_enum'
            )
        """)
    ).scalar()

    if not enum_exists:
        conn.execute(sa.text(
            "CREATE TYPE cluster_state_enum AS ENUM ('provisional', 'locked', 'archived')"
        ))

    # === 2. CREATE TABLE product_cluster ===
    table_exists = conn.execute(
        sa.text("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_schema = 'public'
                AND table_name = 'product_cluster'
            )
        """)
    ).scalar()

    if not table_exists:
        # The type was created (or found) in step 1, so create_type=False keeps
        # create_table from emitting a second CREATE TYPE.
        cluster_state = postgresql.ENUM(
            'provisional', 'locked', 'archived',
            name='cluster_state_enum',
            create_type=False
        )

        op.create_table(
            'product_cluster',
            # Primary key
            sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),

            # Identification
            sa.Column('name', sa.String(200), nullable=False,
                      comment="Nombre descriptivo del cluster"),
            sa.Column('slug', sa.String(100), nullable=False,
                      comment="Identificador URL-friendly único"),
            sa.Column('description', sa.Text(), nullable=True,
                      comment="Descripción del cluster"),

            # Hierarchy
            sa.Column('hierarchy_level', sa.Integer(), nullable=False,
                      comment="Nivel: 0 (10), 1 (30), 2 (100), 3 (300) clusters"),
            sa.Column('parent_cluster_id', postgresql.UUID(as_uuid=True), nullable=True,
                      comment="Cluster padre en nivel superior"),

            # Centroid (anchor embedding)
            sa.Column('centroid_embedding', postgresql.ARRAY(sa.Float()), nullable=True,
                      comment="Embedding 384-dim del centroide"),
            sa.Column('centroid_product_id', postgresql.UUID(as_uuid=True), nullable=True,
                      comment="ID del producto representante"),
            sa.Column('centroid_product_name', sa.String(300), nullable=True,
                      comment="Nombre del producto representante"),

            # Classification
            sa.Column('primary_necesidad', sa.String(100), nullable=True,
                      comment="NECESIDAD principal del cluster"),
            sa.Column('primary_subcategory', sa.String(100), nullable=True,
                      comment="Subcategoría principal"),
            sa.Column('top_brands', postgresql.ARRAY(sa.String(100)), nullable=True,
                      comment="Top 5 marcas más frecuentes"),

            # LLM-generated name
            sa.Column('llm_generated_name', sa.String(200), nullable=True,
                      comment="Nombre generado por LLM"),
            sa.Column('llm_name_confidence', sa.Numeric(3, 2), nullable=True,
                      comment="Confianza del LLM en el nombre (0.00-1.00)"),

            # State machine
            sa.Column('state', cluster_state, nullable=False,
                      server_default='provisional',
                      comment="Estado: provisional, locked, archived"),

            # Assignment threshold
            sa.Column('assignment_threshold', sa.Numeric(3, 2), nullable=False,
                      server_default='0.85',
                      comment="Umbral mínimo de similitud (0.00-1.00)"),

            # Validation
            sa.Column('validated_by', sa.String(100), nullable=True,
                      comment="Email del farmacéutico que validó"),
            sa.Column('validated_at', sa.DateTime(timezone=True), nullable=True,
                      comment="Fecha de validación"),
            sa.Column('validation_notes', sa.Text(), nullable=True,
                      comment="Notas de validación"),

            # Quality metrics
            sa.Column('silhouette_score', sa.Numeric(5, 4), nullable=True,
                      comment="Silhouette score (-1 a 1)"),
            sa.Column('purity_score', sa.Numeric(5, 4), nullable=True,
                      comment="Purity score (0-1)"),
            sa.Column('cohesion_score', sa.Numeric(5, 4), nullable=True,
                      comment="Cohesión interna"),

            # Denormalized statistics
            sa.Column('product_count', sa.Integer(), nullable=False, server_default='0',
                      comment="Número de productos"),
            sa.Column('brand_count', sa.Integer(), nullable=False, server_default='0',
                      comment="Número de marcas"),
            sa.Column('total_sales_amount', sa.Numeric(12, 2), nullable=False,
                      server_default='0', comment="Suma total de ventas"),

            # Timestamps
            sa.Column('created_at', sa.DateTime(timezone=True),
                      server_default=sa.text('now()'), nullable=False),
            sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True),

            # Constraints
            sa.PrimaryKeyConstraint('id'),
            sa.UniqueConstraint('slug', name='uq_product_cluster_slug'),
            sa.ForeignKeyConstraint(
                ['parent_cluster_id'], ['product_cluster.id'],
                name='fk_product_cluster_parent',
                ondelete='SET NULL'
            ),
        )

        # Indexes (only created together with the table)
        op.create_index('ix_product_cluster_slug', 'product_cluster', ['slug'])
        op.create_index('ix_product_cluster_hierarchy_level', 'product_cluster', ['hierarchy_level'])
        op.create_index('ix_product_cluster_state', 'product_cluster', ['state'])
        op.create_index('ix_product_cluster_necesidad', 'product_cluster', ['primary_necesidad'])
        op.create_index('ix_product_cluster_state_level', 'product_cluster', ['state', 'hierarchy_level'])
        op.create_index('ix_product_cluster_parent', 'product_cluster', ['parent_cluster_id'])

    # === 3. ADD LLM FIELDS TO sales_enrichment ===

    # 3.1 product_cluster_id: the column, its index and its FK are created
    # together, guarded by the column's existence alone.
    if not column_exists('product_cluster_id'):
        op.add_column('sales_enrichment', sa.Column(
            'product_cluster_id',
            postgresql.UUID(as_uuid=True),
            nullable=True,
            comment="FK a cluster jerárquico para clasificación"
        ))
        op.create_index(
            'ix_sales_enrichment_product_cluster_id',
            'sales_enrichment',
            ['product_cluster_id']
        )
        op.create_foreign_key(
            'fk_sales_enrichment_product_cluster',
            'sales_enrichment',
            'product_cluster',
            ['product_cluster_id'],
            ['id'],
            ondelete='SET NULL'
        )

    # 3.2 - 3.7 plain nullable LLM columns, each added only when missing.
    llm_columns = (
        sa.Column('llm_indicaciones', sa.Text(), nullable=True,
                  comment="Indicaciones extraídas por LLM"),
        sa.Column('llm_composicion', sa.Text(), nullable=True,
                  comment="Composición principal extraída por LLM"),
        sa.Column('llm_modo_empleo', sa.Text(), nullable=True,
                  comment="Modo de empleo extraído por LLM"),
        sa.Column('llm_confidence', sa.Numeric(3, 2), nullable=True,
                  comment="Confianza LLM (0.00-1.00)"),
        sa.Column('llm_enriched_at', sa.DateTime(timezone=True), nullable=True,
                  comment="Timestamp de enriquecimiento LLM"),
        sa.Column('llm_model_version', sa.String(50), nullable=True,
                  comment="Versión del modelo LLM usado"),
    )
    for llm_column in llm_columns:
        if not column_exists(llm_column.name):
            op.add_column('sales_enrichment', llm_column)

    # === 4. COMPOSITE INDEX FOR LLM STATUS ===
    index_exists = conn.execute(
        sa.text("""
            SELECT EXISTS (
                SELECT FROM pg_indexes
                WHERE schemaname = 'public'
                AND tablename = 'sales_enrichment'
                AND indexname = 'ix_sales_enrichment_llm_status'
            )
        """)
    ).scalar()

    if not index_exists:
        op.create_index(
            'ix_sales_enrichment_llm_status',
            'sales_enrichment',
            ['llm_enriched_at', 'llm_confidence']
        )


def downgrade() -> None:
    """Roll back the migration by dropping its objects in reverse order.

    Order: composite LLM index, ``llm_*`` columns, ``product_cluster_id``
    (FK, index, column), ``product_cluster`` table (indexes first), and
    finally the ``cluster_state_enum`` type. Each drop is guarded by an
    existence probe so missing objects are skipped.

    The probes are restricted to the ``public`` schema, matching the checks
    in ``upgrade()``; otherwise a same-named object in another schema would
    make a probe succeed and the following drop fail. Names are passed as
    bound parameters rather than interpolated into the SQL text.
    """
    conn = op.get_bind()

    def column_exists(column_name: str) -> bool:
        """Return True if ``public.sales_enrichment`` has the given column."""
        return bool(conn.execute(
            sa.text("""
                SELECT EXISTS (
                    SELECT FROM information_schema.columns
                    WHERE table_schema = 'public'
                    AND table_name = 'sales_enrichment'
                    AND column_name = :column_name
                )
            """),
            {"column_name": column_name},
        ).scalar())

    def index_exists(index_name: str, table_name: str) -> bool:
        """Return True if the named index exists on ``public.<table_name>``."""
        return bool(conn.execute(
            sa.text("""
                SELECT EXISTS (
                    SELECT FROM pg_indexes
                    WHERE schemaname = 'public'
                    AND tablename = :table_name
                    AND indexname = :index_name
                )
            """),
            {"table_name": table_name, "index_name": index_name},
        ).scalar())

    # === 1. DROP COMPOSITE LLM INDEX ===
    if index_exists('ix_sales_enrichment_llm_status', 'sales_enrichment'):
        op.drop_index('ix_sales_enrichment_llm_status', table_name='sales_enrichment')

    # === 2. DROP LLM FIELDS FROM sales_enrichment ===
    # Listed in reverse of the order in which upgrade() adds them.
    llm_columns = (
        'llm_model_version',
        'llm_enriched_at',
        'llm_confidence',
        'llm_modo_empleo',
        'llm_composicion',
        'llm_indicaciones',
    )
    for col_name in llm_columns:
        if column_exists(col_name):
            op.drop_column('sales_enrichment', col_name)

    # === 3. DROP FK, INDEX AND COLUMN product_cluster_id ===
    fk_exists = conn.execute(
        sa.text("""
            SELECT EXISTS (
                SELECT FROM information_schema.table_constraints
                WHERE table_schema = 'public'
                AND table_name = 'sales_enrichment'
                AND constraint_name = 'fk_sales_enrichment_product_cluster'
                AND constraint_type = 'FOREIGN KEY'
            )
        """)
    ).scalar()

    if fk_exists:
        op.drop_constraint('fk_sales_enrichment_product_cluster', 'sales_enrichment', type_='foreignkey')

    if index_exists('ix_sales_enrichment_product_cluster_id', 'sales_enrichment'):
        op.drop_index('ix_sales_enrichment_product_cluster_id', table_name='sales_enrichment')

    if column_exists('product_cluster_id'):
        op.drop_column('sales_enrichment', 'product_cluster_id')

    # === 4. DROP TABLE product_cluster ===
    table_exists = conn.execute(
        sa.text("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_schema = 'public'
                AND table_name = 'product_cluster'
            )
        """)
    ).scalar()

    if table_exists:
        # Drop the explicitly created indexes first, in reverse creation order.
        cluster_indexes = (
            'ix_product_cluster_parent',
            'ix_product_cluster_state_level',
            'ix_product_cluster_necesidad',
            'ix_product_cluster_state',
            'ix_product_cluster_hierarchy_level',
            'ix_product_cluster_slug',
        )
        for idx in cluster_indexes:
            if index_exists(idx, 'product_cluster'):
                op.drop_index(idx, table_name='product_cluster')

        op.drop_table('product_cluster')

    # === 5. DROP ENUM cluster_state_enum ===
    # Must come after the table drop: the 'state' column uses this type.
    enum_exists = conn.execute(
        sa.text("""
            SELECT EXISTS (
                SELECT FROM pg_type
                WHERE typname = 'cluster_state_enum'
            )
        """)
    ).scalar()

    if enum_exists:
        conn.execute(sa.text("DROP TYPE cluster_state_enum"))
