﻿# backend/app/services/cima_sync_monitor.py
"""
Servicio consolidado de monitoreo automático para sincronización CIMA

Este servicio reemplaza y consolida el antiguo CimaWatchdogService.
Proporciona monitoreo continuo de sincronización CIMA con auto-recovery
ante estancamientos detectados.

Migración completada en Issue #109 - Fase 1 (2025-10-03)

Funcionalidades principales:
- Heartbeat tracking: Monitoreo del progreso de sincronización
- Stall detection: Detección automática de estancamientos por timeout
- Auto-recovery: Reinicio automático con límites de seguridad
- Render optimization: Timeouts adaptados para producción

Ver documentación completa en: docs/SYSTEM_MONITORING.md
"""

import asyncio
import json
import os
import threading
import time
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, Optional, Protocol, Tuple

import structlog
from sqlalchemy.orm import Session

from app.core.cache_config import get_redis, is_cache_available

# Issue #114: Importaciones de componentes de resiliencia y observabilidad
from app.services.cima_circuit_breaker import CimaSyncCircuitBreaker
from app.services.cima_retry_strategy import (
    MAX_RECOVERY_ATTEMPTS,
    calculate_backoff,
    log_retry_attempt,
    should_retry,
)
from app.services.heartbeat_tracker import HeartbeatTracker
from app.utils.datetime_utils import utc_now

logger = structlog.get_logger()

# Issue #114 - Phase 3: import the Prometheus metric recording functions.
# METRICS_AVAILABLE tells the rest of the module whether those functions were imported.
try:
    from app.api.metrics import record_recovery_attempt, record_stall_detected, record_sync_failure, record_sync_success

    METRICS_AVAILABLE = True
except ImportError:
    # Graceful degradation when prometheus-client is not available:
    # metrics are switched off and a warning is logged, the module still loads.
    METRICS_AVAILABLE = False
    logger.warning("metrics.prometheus.unavailable", message="Prometheus metrics disabled")


class SyncRestarter(Protocol):
    """
    Structural interface for the component able to restart a CIMA catalog sync.

    Issue #116: the recovery code depends on this protocol (Dependency Inversion
    Principle) so that circular imports are avoided.
    """

    def trigger_catalog_sync(
        self,
        force_refresh: bool = True,
        enable_chunked_processing: bool = True,
        is_recovery_mode: bool = True,
    ) -> bool:
        """
        Trigger a CIMA catalog synchronization.

        Args:
            force_refresh: Force a complete refresh of the catalog.
            enable_chunked_processing: Enable chunk-by-chunk processing.
            is_recovery_mode: Flag the run as an automatic recovery.

        Returns:
            bool: True if the synchronization was started correctly.
        """
        ...


class CimaSyncStatus(Enum):
    """States of the CIMA synchronization monitor."""

    # NOTE(review): no code visible in this file reads these members;
    # the string values are presumably consumed elsewhere - confirm before renaming.
    IDLE = "idle"
    SYNCING = "syncing"
    STALL = "stall"
    FAILED = "failed"
    RECOVERING = "recovering"
    DISABLED = "disabled"


class CimaSyncMonitorConfig:
    """
    Configuration of the CIMA synchronization monitor.

    Timeouts and limits tuned to balance early problem detection against
    false positives:

    - STALL_DETECTION_TIMEOUT (20 min): based on the observation that successful
      syncs take ~15 min; 20 min leaves margin without being excessive.
    - RECOVERY_COOLDOWN (4 hours): prevents recovery loops, leaves room for
      diagnosis, and lines up with the typical sync cycle (twice a day).
    - MAX_RECOVERY_ATTEMPTS (3): balance between persistence and protection
      against endless loops of failed recoveries.
    - RENDER timeouts (15 min, 2 attempts): adjusted to the 512MB RAM constraint
      and the request timeouts of Render.com.
    """

    # Redis keys for heartbeat and state
    REDIS_HEARTBEAT_KEY = "cima:sync:heartbeat"
    REDIS_STATUS_KEY = "cima:sync:status"
    REDIS_METRICS_KEY = "cima:sync:metrics"
    REDIS_RECOVERY_KEY = "cima:sync:recovery"

    # Timeouts and intervals (in seconds)
    HEARTBEAT_INTERVAL = 60  # Update the heartbeat every 60 seconds
    STALL_DETECTION_TIMEOUT = 20 * 60  # 20 minutes without progress = stalled
    MONITOR_CHECK_INTERVAL = 5 * 60  # Check every 5 minutes

    # Auto-recovery limits
    MAX_RECOVERY_ATTEMPTS = 3  # At most 3 recovery attempts per day
    RECOVERY_COOLDOWN = 4 * 60 * 60  # 4 hours between attempts
    DAILY_RESET_HOUR = 2  # Reset counters at 2 AM
    # NOTE(review): DAILY_RESET_HOUR is not read by the code visible in this file;
    # the visible daily reset is keyed on the UTC date changing - confirm intent.

    # Render-specific timeouts
    RENDER_STALL_TIMEOUT = 15 * 60  # 15 minutes on Render (more aggressive)
    RENDER_MAX_ATTEMPTS = 2  # Fewer attempts on Render because of limited resources

    # Progress metrics
    MIN_PROGRESS_ITEMS = 50  # At least 50 processed items to count as progress
    PROGRESS_WINDOW = 10 * 60  # 10-minute window used to compute progress


class CimaSyncHeartbeat:
    """Reads and writes the CIMA synchronization heartbeat stored in Redis."""

    def __init__(self, redis_client=None):
        # Fall back to the shared client when none (or a falsy one) is injected.
        self.redis = redis_client or get_redis()
        self.config = CimaSyncMonitorConfig()

    def update_heartbeat(
        self,
        processed_items: int,
        total_items: int,
        current_phase: str,
        chunk_number: int = 0,
        chunk_total: int = 1,
        additional_data: Optional[Dict[str, Any]] = None,
    ) -> bool:
        """
        Store a heartbeat describing the current sync progress.

        Args:
            processed_items: Items processed so far.
            total_items: Total number of items to process.
            current_phase: Current phase (e.g. "downloading_chunk_1").
            chunk_number: Number of the chunk being processed.
            chunk_total: Total number of chunks.
            additional_data: Extra fields merged into the heartbeat (errors,
                speed, ...). Keys that collide with the standard fields
                overwrite them.

        Returns:
            bool: True if the heartbeat was written, False if Redis is
            unavailable or the write failed (failures are logged, not raised).
        """
        if not (self.redis and is_cache_available()):
            logger.warning("monitor.heartbeat.redis_unavailable")
            return False

        try:
            # max(..., 1) keeps the division safe when total_items is 0.
            percent_done = round((processed_items / max(total_items, 1)) * 100, 2)
            payload: Dict[str, Any] = {
                "timestamp": int(time.time()),
                "processed_items": processed_items,
                "total_items": total_items,
                "current_phase": current_phase,
                "chunk_number": chunk_number,
                "chunk_total": chunk_total,
                "progress_percentage": percent_done,
                "last_update": utc_now().isoformat(),
            }
            payload.update(additional_data or {})

            # The key expires after one hour (3600 s).
            serialized = json.dumps(payload)
            self.redis.setex(self.config.REDIS_HEARTBEAT_KEY, 3600, serialized)

            logger.debug(
                "monitor.heartbeat.updated",
                processed=processed_items,
                total=total_items,
                phase=current_phase,
                percentage=payload["progress_percentage"],
            )
        except Exception as exc:
            logger.error(
                "monitor.heartbeat.update_failed",
                error=str(exc)[:200],
                processed=processed_items,
                phase=current_phase,
            )
            return False

        return True

    def get_heartbeat(self) -> Optional[Dict[str, Any]]:
        """Return the last stored heartbeat, or None when absent, unavailable or unreadable."""
        if not (self.redis and is_cache_available()):
            return None

        try:
            raw = self.redis.get(self.config.REDIS_HEARTBEAT_KEY)
            return json.loads(raw) if raw else None
        except Exception as exc:
            logger.error("monitor.heartbeat.get_failed", error=str(exc)[:200])
            return None

    def clear_heartbeat(self) -> bool:
        """Delete the heartbeat (when a sync finishes); return True if the delete was issued."""
        if not (self.redis and is_cache_available()):
            return False

        try:
            self.redis.delete(self.config.REDIS_HEARTBEAT_KEY)
            logger.info("monitor.heartbeat.cleared")
        except Exception as exc:
            logger.error("monitor.heartbeat.clear_failed", error=str(exc)[:200])
            return False

        return True


class CimaSyncStallDetector:
    """Detects stalls in the CIMA synchronization from the age of its heartbeat."""

    def __init__(self, redis_client=None):
        self.redis = redis_client or get_redis()
        self.config = CimaSyncMonitorConfig()
        self.heartbeat = CimaSyncHeartbeat(redis_client)

    @staticmethod
    def _coerce_timestamp(raw: Any) -> int:
        """Return ``raw`` as an integer epoch, or 0 when it is missing or not numeric."""
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            # 0 is the same value used for an absent timestamp: the heartbeat
            # then looks infinitely old and is reported as stalled.
            return 0

    def is_sync_stalled(self) -> Tuple[bool, Dict[str, Any]]:
        """
        Decide whether the synchronization is stalled.

        A sync counts as stalled when the last heartbeat is older than the
        stall timeout (the Render-specific timeout when running on Render).
        No heartbeat at all means no sync is in progress, which is not a stall.

        Side effects: when a stall is found a warning is logged and, if
        metrics are available, the stall metric is recorded.

        Returns:
            Tuple[bool, Dict]: (is_stalled, diagnostic information)
        """
        heartbeat_data = self.heartbeat.get_heartbeat()

        # No heartbeat = no sync in progress
        if not heartbeat_data:
            return False, {"reason": "no_active_sync", "status": "idle"}

        current_time = int(time.time())

        # A null or non-numeric timestamp must not raise out of the detector:
        # it would break status reporting and keep the monitor loop erroring.
        last_heartbeat_time = self._coerce_timestamp(heartbeat_data.get("timestamp", 0))
        time_since_heartbeat = current_time - last_heartbeat_time

        # Pick the timeout that matches the environment
        is_render = self._is_render_environment()
        stall_timeout = self.config.RENDER_STALL_TIMEOUT if is_render else self.config.STALL_DETECTION_TIMEOUT

        diagnosis = {
            # Rendered explicitly in UTC so the value does not depend on the
            # server's local timezone.
            "last_heartbeat": datetime.fromtimestamp(last_heartbeat_time, tz=timezone.utc).isoformat(),
            "time_since_heartbeat": time_since_heartbeat,
            "stall_timeout": stall_timeout,
            "is_render_environment": is_render,
            "current_phase": heartbeat_data.get("current_phase", "unknown"),
            "processed_items": heartbeat_data.get("processed_items", 0),
            "total_items": heartbeat_data.get("total_items", 0),
            "progress_percentage": heartbeat_data.get("progress_percentage", 0),
        }

        is_stalled = time_since_heartbeat > stall_timeout

        if is_stalled:
            diagnosis["reason"] = "heartbeat_timeout"
            diagnosis["status"] = "stalled"

            # Issue #114 - Phase 3: structured logging with full context
            logger.warning(
                "monitor.stall_detected",
                event_type="stall_detection",
                time_since_heartbeat_seconds=time_since_heartbeat,
                stall_timeout_seconds=stall_timeout,
                phase=diagnosis["current_phase"],
                processed_items=diagnosis["processed_items"],
                total_items=diagnosis["total_items"],
                progress_percentage=diagnosis["progress_percentage"],
                is_render_environment=is_render,
            )

            # Issue #114 - Phase 3: record the stall metric; a metrics failure
            # must never affect detection.
            if METRICS_AVAILABLE:
                try:
                    record_stall_detected()
                except Exception as e:
                    logger.error("metrics.record_stall.failed", error=str(e)[:200])
        else:
            diagnosis["reason"] = "healthy"
            diagnosis["status"] = "active"

        return is_stalled, diagnosis

    def _is_render_environment(self) -> bool:
        """Return True when the RENDER / RENDER_EXTERNAL_URL variables indicate Render."""
        return os.getenv("RENDER") == "true" or "onrender.com" in os.getenv("RENDER_EXTERNAL_URL", "")


class CimaSyncAutoRecovery:
    """
    Manages automatic recovery of a stalled CIMA synchronization.

    A recovery attempt is gated by an in-process circuit breaker and by two
    limits persisted in Redis under ``REDIS_RECOVERY_KEY``: a per-day attempt
    budget and a cooldown between attempts.

    Issue #116: refactored to use dependency inversion and remove circular imports.
    Issue #114: added circuit breaker and improved retry strategy.
    """

    def __init__(
        self,
        redis_client=None,
        sync_restarter: Optional["SyncRestarter"] = None,
        db: Optional["Session"] = None,
    ):
        self.redis = redis_client or get_redis()
        self.config = CimaSyncMonitorConfig()
        self.detector = CimaSyncStallDetector(redis_client)
        self.sync_restarter = sync_restarter

        # Issue #114: circuit breaker protecting against repeated failures
        # (timeout_seconds=300 is 5 minutes).
        self.circuit_breaker = CimaSyncCircuitBreaker(
            failure_threshold=5, timeout_seconds=300, half_open_max_attempts=3
        )

        # Issue #114: HeartbeatTracker with DB fallback, only when a session is given
        self.heartbeat_tracker = HeartbeatTracker(db) if db else None

    def can_attempt_recovery(self) -> Tuple[bool, Dict[str, Any]]:
        """
        Check whether a recovery attempt is currently allowed.

        Resets the daily counter when the UTC date has changed, then applies
        the daily attempt limit (lower on Render) and the cooldown.

        Returns:
            Tuple[bool, Dict]: (allowed, details). ``details["reason"]`` is one
            of "redis_unavailable", "daily_limit_reached", "cooldown_active",
            "can_attempt" or "error_checking_limits".
        """
        if not self.redis or not is_cache_available():
            return False, {"reason": "redis_unavailable"}

        try:
            recovery_data = self._get_recovery_data()
            current_time = int(time.time())
            current_date = utc_now().date()

            # Daily counter reset. The previous record is handed over so the
            # cooldown timestamp survives the reset.
            if recovery_data.get("last_reset_date") != current_date.isoformat():
                recovery_data = self._reset_daily_counters(recovery_data)

            # Daily attempt limit
            is_render = self._is_render_environment()
            max_attempts = self.config.RENDER_MAX_ATTEMPTS if is_render else self.config.MAX_RECOVERY_ATTEMPTS

            daily_attempts = recovery_data.get("daily_attempts", 0)
            if daily_attempts >= max_attempts:
                return False, {
                    "reason": "daily_limit_reached",
                    "daily_attempts": daily_attempts,
                    "max_attempts": max_attempts,
                }

            # Cooldown between attempts
            last_attempt_time = recovery_data.get("last_attempt_time", 0)
            time_since_last_attempt = current_time - last_attempt_time

            if time_since_last_attempt < self.config.RECOVERY_COOLDOWN:
                remaining_cooldown = self.config.RECOVERY_COOLDOWN - time_since_last_attempt
                return False, {
                    "reason": "cooldown_active",
                    "remaining_cooldown_seconds": remaining_cooldown,
                    "remaining_cooldown_minutes": round(remaining_cooldown / 60, 1),
                }

            return True, {
                "reason": "can_attempt",
                "daily_attempts": daily_attempts,
                "max_attempts": max_attempts,
                "time_since_last_attempt": time_since_last_attempt,
            }

        except Exception as e:
            logger.error("monitor.recovery.check_failed", error=str(e)[:200])
            return False, {"reason": "error_checking_limits"}

    def attempt_recovery(self) -> Tuple[bool, Dict[str, Any]]:
        """
        Try to recover the stalled synchronization.

        Order of operations: circuit breaker check, daily-limit/cooldown check,
        record the attempt, clear the stalled state, trigger the restart, then
        update the circuit breaker and metrics with the outcome.
        Issue #114: improved with the circuit breaker pattern and retry strategy.

        Returns:
            Tuple[bool, Dict]: (recovery_succeeded, result information)
        """
        # Issue #114: the circuit breaker is consulted first
        if not self.circuit_breaker.can_attempt():
            circuit_status = self.circuit_breaker.get_status()
            logger.warning(
                "monitor.recovery.circuit_breaker_blocked",
                circuit_state=circuit_status["state"],
                time_until_half_open=circuit_status.get("time_until_half_open_seconds"),
            )
            return False, {"reason": "circuit_breaker_open", "circuit_breaker": circuit_status}

        # Daily limit and cooldown
        can_recover, limits_info = self.can_attempt_recovery()
        if not can_recover:
            return False, {"reason": "recovery_not_allowed", "limits": limits_info}

        try:
            # Number of the attempt about to be made
            recovery_data = self._get_recovery_data()
            attempt_number = recovery_data.get("daily_attempts", 0) + 1

            # Issue #114 - Phase 3: structured logging with full retry context
            log_retry_attempt(
                attempt=attempt_number,
                reason="cima_sync_stall_detected",
                additional_context={
                    "circuit_breaker_state": self.circuit_breaker.state.value,
                    "circuit_breaker_failures": self.circuit_breaker.failure_count,
                    "limits": limits_info,
                    "recovery_attempts_today": recovery_data.get("daily_attempts", 0),
                },
            )

            # The attempt is counted before the restart is triggered, so a
            # failing restart still consumes budget and starts the cooldown.
            self._record_recovery_attempt()

            # Drop the state left by the stalled run
            self._clear_stalled_state()

            # Ask the injected restarter to start a new sync
            recovery_success = self._trigger_sync_restart()

            # Issue #114: update the circuit breaker with the outcome
            if recovery_success:
                self.circuit_breaker.record_success()
                logger.info(
                    "monitor.recovery.successful",
                    attempt=attempt_number,
                    circuit_state=self.circuit_breaker.state.value,
                )
            else:
                self.circuit_breaker.record_failure()
                logger.error(
                    "monitor.recovery.failed", attempt=attempt_number, circuit_state=self.circuit_breaker.state.value
                )

            # Issue #114 - Phase 3: recovery metric (success or failure)
            self._record_recovery_metric(recovery_success)

            # Issue #114: suggested delay before the next attempt, if any
            next_retry_delay = None
            if not recovery_success and should_retry(attempt_number):
                next_retry_delay = calculate_backoff(attempt_number + 1)

            result = {
                "recovery_attempted": True,
                "recovery_successful": recovery_success,
                "attempt_number": attempt_number,
                "attempt_time": utc_now().isoformat(),
                "limits": limits_info,
                "circuit_breaker": self.circuit_breaker.get_status(),
                "next_retry_delay_seconds": next_retry_delay,
            }

            return recovery_success, result

        except Exception as e:
            # Issue #114: an unexpected error counts as a circuit breaker failure
            self.circuit_breaker.record_failure()
            logger.error(
                "monitor.recovery.attempt_failed", error=str(e)[:200], circuit_state=self.circuit_breaker.state.value
            )
            return False, {
                "reason": "recovery_exception",
                "error": str(e)[:200],
                "circuit_breaker": self.circuit_breaker.get_status(),
            }

    def _record_recovery_metric(self, success: bool) -> None:
        """Record the recovery outcome metric; metric errors are logged, never raised."""
        if not METRICS_AVAILABLE:
            return
        try:
            record_recovery_attempt(success=success)
        except Exception as e:
            logger.error("metrics.record_recovery.failed", error=str(e)[:200])

    def _get_recovery_data(self) -> Dict[str, Any]:
        """Return the recovery record stored in Redis, or {} when absent or unreadable."""
        try:
            data = self.redis.get(self.config.REDIS_RECOVERY_KEY)
            if data:
                return json.loads(data)
            return {}

        except Exception as e:
            logger.error("monitor.recovery.get_data_failed", error=str(e)[:200])
            return {}

    @staticmethod
    def _build_reset_record(previous_data: Optional[Dict[str, Any]], reset_date: str) -> Dict[str, Any]:
        """Build the post-reset record: zero attempts, new date, cooldown timestamp kept."""
        last_attempt_time = (previous_data or {}).get("last_attempt_time", 0)
        return {"daily_attempts": 0, "last_reset_date": reset_date, "last_attempt_time": last_attempt_time}

    def _reset_daily_counters(self, previous_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Reset the daily attempt counter and persist the new record.

        Only the counter is reset. ``last_attempt_time`` is carried over from
        the previous record, because zeroing it would let an attempt made
        shortly before the date change be followed by another one right after
        it, ignoring RECOVERY_COOLDOWN.

        Args:
            previous_data: Record read before the reset; when omitted it is
                fetched from Redis.

        Returns:
            Dict: the new record (returned even if persisting it failed).
        """
        if previous_data is None:
            previous_data = self._get_recovery_data()

        current_date = utc_now().date()
        recovery_data = self._build_reset_record(previous_data, current_date.isoformat())

        try:
            self.redis.setex(self.config.REDIS_RECOVERY_KEY, 86400 * 2, json.dumps(recovery_data))  # TTL 2 days
            logger.info("monitor.recovery.daily_reset", date=current_date.isoformat())

        except Exception as e:
            logger.error("monitor.recovery.reset_failed", error=str(e)[:200])

        return recovery_data

    def _record_recovery_attempt(self) -> None:
        """Increment the daily attempt counter and stamp the attempt time in Redis."""
        try:
            recovery_data = self._get_recovery_data()
            recovery_data["daily_attempts"] = recovery_data.get("daily_attempts", 0) + 1
            recovery_data["last_attempt_time"] = int(time.time())

            self.redis.setex(self.config.REDIS_RECOVERY_KEY, 86400 * 2, json.dumps(recovery_data))  # TTL 2 days

        except Exception as e:
            logger.error("monitor.recovery.record_failed", error=str(e)[:200])

    def _clear_stalled_state(self) -> None:
        """Delete the heartbeat and metrics keys left by the stalled sync."""
        try:
            # Old heartbeat
            self.redis.delete(self.config.REDIS_HEARTBEAT_KEY)

            # Metrics of the previous sync
            self.redis.delete(self.config.REDIS_METRICS_KEY)

            logger.info("monitor.recovery.state_cleared")

        except Exception as e:
            logger.error("monitor.recovery.clear_state_failed", error=str(e)[:200])

    def _trigger_sync_restart(self) -> bool:
        """
        Restart the CIMA synchronization through the injected SyncRestarter.
        Issue #116: refactored to remove circular imports.

        Returns:
            bool: the restarter's result; False when no restarter is
            configured or the restarter raised.
        """
        if not self.sync_restarter:
            logger.warning(
                "monitor.recovery.no_restarter", message="SyncRestarter no configurado, recovery no disponible"
            )
            return False

        try:
            logger.info("monitor.recovery.triggering_restart")

            # Use the injected sync_restarter (Dependency Inversion)
            return self.sync_restarter.trigger_catalog_sync(
                force_refresh=True, enable_chunked_processing=True, is_recovery_mode=True
            )

        except Exception as e:
            logger.error("monitor.recovery.restart_failed", error=str(e)[:200])
            return False

    def _is_render_environment(self) -> bool:
        """Return True when the RENDER / RENDER_EXTERNAL_URL variables indicate Render."""
        return os.getenv("RENDER") == "true" or "onrender.com" in os.getenv("RENDER_EXTERNAL_URL", "")


class CimaSyncMonitor:
    """
    Main monitor for the CIMA synchronization, with auto-recovery.

    Runs a daemon thread that periodically asks the stall detector whether the
    sync is stalled and, if so, delegates to the auto-recovery component.

    Issue #109: replaces and consolidates the old CimaWatchdogService.
    Issue #116: refactored with dependency inversion to remove circular imports.
    """

    def __init__(self, redis_client=None, sync_restarter: Optional["SyncRestarter"] = None):
        self.redis = redis_client or get_redis()
        self.config = CimaSyncMonitorConfig()
        self.heartbeat = CimaSyncHeartbeat(self.redis)
        self.detector = CimaSyncStallDetector(self.redis)
        self.recovery = CimaSyncAutoRecovery(self.redis, sync_restarter)
        self._monitoring_active = False

        # Thread running the current loop and the event used to stop it.
        # Every start creates a fresh event, so a loop that was told to stop
        # can never be revived by a later start.
        self._monitor_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

        # Redis availability caching (migrated from CimaWatchdogService)
        self._last_redis_check = None
        self._redis_available_cache = False
        self._cache_ttl_seconds = 30  # Cache availability for 30 seconds

    def start_monitoring(self) -> bool:
        """
        Start the background monitoring loop.

        Returns:
            bool: True if monitoring is running (already active or just
            started), False if Redis is unavailable or the start failed.
        """
        if self._monitoring_active:
            logger.warning("monitor.already_active")
            return True

        if not self.redis or not is_cache_available():
            logger.error("monitor.start_failed.redis_unavailable")
            return False

        try:
            # The loop gets its own stop event: a previous loop still winding
            # down keeps its (already set) event and exits, so two loops never
            # run concurrently.
            stop_event = threading.Event()
            monitor_thread = threading.Thread(target=self._monitoring_loop, args=(stop_event,), daemon=True)

            self._stop_event = stop_event
            self._monitor_thread = monitor_thread
            self._monitoring_active = True
            logger.info("monitor.started", check_interval=self.config.MONITOR_CHECK_INTERVAL)

            monitor_thread.start()

            return True

        except Exception as e:
            logger.error("monitor.start_failed", error=str(e)[:200])
            self._monitoring_active = False
            self._stop_event.set()
            return False

    def stop_monitoring(self) -> None:
        """Stop the monitoring loop, waking it if it is waiting between checks."""
        self._monitoring_active = False
        self._stop_event.set()
        logger.info("monitor.stopped")

    def _monitoring_loop(self, stop_event: Optional[threading.Event] = None) -> None:
        """
        Main monitoring loop (runs in the background thread).

        Args:
            stop_event: Event that ends this loop when set; defaults to the
                monitor's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        logger.info("monitor.loop.started")

        while self._monitoring_active and not stop_event.is_set():
            try:
                # Check for a stall
                is_stalled, diagnosis = self.detector.is_sync_stalled()

                if is_stalled:
                    logger.warning("monitor.stall_detected.attempting_recovery", diagnosis=diagnosis)

                    # Try auto-recovery
                    recovery_success, recovery_info = self.recovery.attempt_recovery()

                    if recovery_success:
                        logger.info("monitor.recovery.successful", info=recovery_info)
                    else:
                        logger.error("monitor.recovery.failed", info=recovery_info)

                delay = self.config.MONITOR_CHECK_INTERVAL

            except Exception as e:
                logger.error("monitor.loop.error", error=str(e)[:200])
                delay = 60  # Wait 1 minute before continuing after an error

            # Interruptible wait: returns as soon as stop_monitoring() sets the event.
            stop_event.wait(delay)

        logger.info("monitor.loop.stopped")

    async def _is_redis_available(self) -> bool:
        """
        Check that Redis is configured and answers a ping.
        Migrated from CimaWatchdogService.
        """
        try:
            if not self.redis:
                return False
            # The Redis client API is synchronous: run the ping in the default
            # executor so the event loop is not blocked.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self.redis.ping)
            return True
        except Exception as e:
            logger.warning("monitor.redis.ping_failed", error=str(e)[:200])
            return False

    async def _is_redis_available_cached(self) -> bool:
        """
        Check Redis availability, reusing the last result while it is fresh.

        Avoids repeated pings within ``_cache_ttl_seconds`` of the previous check.
        Migrated from CimaWatchdogService.
        """
        now = utc_now()

        # Check if cache is still valid
        if self._last_redis_check and (now - self._last_redis_check).total_seconds() < self._cache_ttl_seconds:
            return self._redis_available_cache

        # Update cache with fresh check
        self._redis_available_cache = await self._is_redis_available()
        self._last_redis_check = now

        return self._redis_available_cache

    def get_monitor_status(self) -> Dict[str, Any]:
        """
        Return the complete state of the monitor.

        Consolidated with the get_watchdog_stats() functionality of the old watchdog.
        Issue #114: includes circuit breaker and heartbeat tracker information.

        Note: this calls the stall detector, which logs a warning and records
        the stall metric whenever a stall is found.
        """
        heartbeat_data = self.heartbeat.get_heartbeat()
        is_stalled, diagnosis = self.detector.is_sync_stalled()
        can_recover, recovery_limits = self.recovery.can_attempt_recovery()

        # Recovery record, for the detailed statistics
        recovery_data = self.recovery._get_recovery_data()

        # Issue #114: circuit breaker state
        circuit_breaker_status = self.recovery.circuit_breaker.get_status()

        # Issue #114: heartbeat tracker state (only when a tracker is configured)
        heartbeat_tracker_status = None
        if self.recovery.heartbeat_tracker:
            heartbeat_tracker_status = self.recovery.heartbeat_tracker.get_status()

        return {
            "monitoring_active": self._monitoring_active,
            "sync_heartbeat": heartbeat_data,
            "stall_detection": {"is_stalled": is_stalled, "diagnosis": diagnosis},
            "recovery_status": {
                "can_attempt_recovery": can_recover,
                "limits": recovery_limits,
                "restart_count_today": recovery_data.get("daily_attempts", 0),
                "last_restart": recovery_data.get("last_attempt_time", None),
                "circuit_breaker": circuit_breaker_status,  # Issue #114
            },
            "heartbeat_tracker": heartbeat_tracker_status,  # Issue #114
            "config": {
                "stall_timeout": self.config.STALL_DETECTION_TIMEOUT,
                "monitor_interval": self.config.MONITOR_CHECK_INTERVAL,
                "max_daily_attempts": self.config.MAX_RECOVERY_ATTEMPTS,
                "max_recovery_attempts_total": MAX_RECOVERY_ATTEMPTS,  # Issue #114
                "recovery_cooldown": self.config.RECOVERY_COOLDOWN,
            },
            "timestamp": utc_now().isoformat(),
        }


# Module-level monitor instance used by the convenience functions below.
# It is created at import time; start_cima_monitoring() rebinds this name to a new instance.
cima_sync_monitor = CimaSyncMonitor()


def start_cima_monitoring() -> bool:
    """
    Start automatic CIMA monitoring (convenience function).

    Reads REDIS_ENABLED / REDIS_URL, verifies the connection with a ping,
    replaces the module-level monitor with one bound to that client and
    starts it.

    Returns:
        bool: True if monitoring is running, False if Redis is disabled,
        unreachable, or the monitor could not be started.
    """
    global cima_sync_monitor

    # Check if Redis is disabled via environment variable
    redis_enabled = os.getenv("REDIS_ENABLED", "true").lower() in ("true", "1", "yes")
    redis_url = os.getenv("REDIS_URL") or ""

    if not redis_enabled or not redis_url:
        logger.warning("cima.monitoring.skipped.redis_disabled")
        return False

    try:
        import redis

        # A single direct client is created, verified and then handed to the
        # monitor (avoids problems with the global cache manager and does not
        # leave a throwaway verification client behind).
        redis_client = redis.Redis.from_url(redis_url)
        redis_client.ping()
        logger.info("cima.monitoring.redis_verified")

        previous_monitor = cima_sync_monitor

        # Keep any SyncRestarter injected into the previous instance;
        # otherwise the replacement could never perform a recovery restart.
        new_monitor = CimaSyncMonitor(redis_client, previous_monitor.recovery.sync_restarter)

        # Stop the instance being replaced so its background loop does not
        # keep running next to the new one.
        if previous_monitor._monitoring_active:
            previous_monitor.stop_monitoring()

        cima_sync_monitor = new_monitor

    except Exception as e:
        logger.error("cima.monitoring.start_failed.redis_connection", error=str(e)[:200])
        return False

    return cima_sync_monitor.start_monitoring()


def stop_cima_monitoring() -> None:
    """Stop automatic CIMA monitoring on the module-level monitor (convenience function)."""
    cima_sync_monitor.stop_monitoring()


def get_cima_monitor_status() -> Dict[str, Any]:
    """Return the status dictionary of the module-level CIMA monitor (convenience function)."""
    return cima_sync_monitor.get_monitor_status()


def update_cima_heartbeat(processed_items: int, total_items: int, current_phase: str, **kwargs) -> bool:
    """
    Update the CIMA sync heartbeat on the module-level monitor (convenience function).

    Extra keyword arguments are forwarded unchanged to
    ``CimaSyncHeartbeat.update_heartbeat`` (chunk_number, chunk_total, additional_data).

    Returns:
        bool: the result of ``update_heartbeat`` (True if the heartbeat was written).
    """
    return cima_sync_monitor.heartbeat.update_heartbeat(
        processed_items=processed_items, total_items=total_items, current_phase=current_phase, **kwargs
    )
