# backend/app/services/dropzone_watcher.py
"""
Dropzone File Watcher Service (Pivot 2026).

Monitors configured folders for new ERP export files and automatically
triggers the processing pipeline.

FEATURES:
- Watches multiple dropzone folders
- Auto-detects file type (CSV, Excel) and ERP format
- Creates FileUpload records for tracking
- Routes to existing enrichment pipeline
- Debouncing to handle file copy completion

CONFIGURATION (environment variables):
- DROPZONE_ENABLED: Enable/disable watching (default: false)
- DROPZONE_PATHS: Comma-separated list of folder paths to watch
- DROPZONE_POLL_INTERVAL: Polling interval in seconds (default: 5)
- DROPZONE_DEBOUNCE_SECONDS: Wait time after file change (default: 2)

USAGE:
    from app.services.dropzone_watcher import dropzone_watcher

    # Start watching (typically in main.py startup)
    dropzone_watcher.start()

    # Stop watching (shutdown)
    dropzone_watcher.stop()
"""
import os
import threading
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import Optional

import structlog
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import NullPool

# Module-level structured logger; every message emitted by this file is
# prefixed with "[DROPZONE]" by convention.
logger = structlog.get_logger(__name__)

# Try to import watchdog (optional dependency for local installations).
# When the import fails, Observer and FileSystemEventHandler stay undefined,
# so the code below consults WATCHDOG_AVAILABLE before touching either name.
try:
    from watchdog.observers import Observer
    from watchdog.events import FileSystemEventHandler, FileCreatedEvent, FileModifiedEvent
    WATCHDOG_AVAILABLE = True
except ImportError:
    WATCHDOG_AVAILABLE = False
    logger.warning("[DROPZONE] watchdog library not installed - dropzone watching disabled")


# Configuration from environment (read once, when this module is imported)
# Enabled only when the variable is the word "true" in any letter case.
DROPZONE_ENABLED = os.getenv("DROPZONE_ENABLED", "false").lower() == "true"
# Comma-separated list; entries are stripped and empty entries are dropped.
DROPZONE_PATHS = [p.strip() for p in os.getenv("DROPZONE_PATHS", "").split(",") if p.strip()]
# int()/float() raise ValueError on a non-numeric value, which surfaces as an
# error while importing this module.
DROPZONE_POLL_INTERVAL = int(os.getenv("DROPZONE_POLL_INTERVAL", "5"))
DROPZONE_DEBOUNCE_SECONDS = float(os.getenv("DROPZONE_DEBOUNCE_SECONDS", "2"))

# Supported file extensions (matched against the lower-cased file suffix)
# - ERP exports: .csv, .xlsx, .xls, .txt
# - Invoices/documents: .pdf
# - Bank statements: .xlsx, .xls
# Future: OCR processing for PDFs, specialized parsers for bank formats
SUPPORTED_EXTENSIONS = {".csv", ".xlsx", ".xls", ".txt", ".pdf"}


def _validate_path_in_dropzone(filepath: str, dropzone_path: str) -> bool:
    """
    Check that ``filepath`` resolves to a location inside ``dropzone_path``.

    Both paths are fully resolved before the comparison, so symlinks and
    relative segments that lead outside the dropzone are rejected. Every
    rejection is logged at error level.

    Args:
        filepath: Candidate path to check
        dropzone_path: Directory the candidate has to live under

    Returns:
        True when the resolved file sits under the resolved dropzone,
        False otherwise
    """
    try:
        candidate = Path(filepath).resolve()
        root = Path(dropzone_path).resolve()
        inside = candidate.is_relative_to(root)
    except ValueError:
        # A ValueError while resolving/comparing is treated as "not inside"
        inside = False

    if inside:
        return True

    logger.error(
        "[DROPZONE] Path traversal attempt detected",
        filepath=filepath,
        dropzone=dropzone_path,
    )
    return False


def _sanitize_filename(filename: str) -> str:
    """
    Produce a filename that is safe to use as an archive target.

    Path separators (both ``/`` and ``\\``) and ``..`` sequences are replaced
    by underscores, and leading dots are stripped so the result is never a
    hidden file on Unix. If nothing is left, a random ``unnamed_<hex>`` name
    is generated.

    Args:
        filename: Original filename

    Returns:
        Non-empty filename without path separators or traversal patterns
    """
    # Flatten both separator styles in a single pass
    flattened = filename.translate(str.maketrans({"\\": "_", "/": "_"}))

    # Neutralise traversal patterns, then drop any leading dots
    cleaned = flattened.replace("..", "_").lstrip(".")

    # Fall back to a generated name when the input collapses to nothing
    return cleaned or f"unnamed_{uuid.uuid4().hex[:8]}"


class DropzoneFileHandler(FileSystemEventHandler if WATCHDOG_AVAILABLE else object):
    """
    Event handler for file system events in a single dropzone folder.

    Implements debouncing to wait for file copy completion before processing:
    every create/modify event for a supported file restarts that file's quiet
    timer, and get_ready_files() only releases files whose timer has reached
    DROPZONE_DEBOUNCE_SECONDS.

    Timers are measured with time.monotonic(), so wall-clock adjustments
    (NTP sync, manual clock changes) can neither release a half-copied file
    early nor hold a finished file back.
    """

    def __init__(self, watcher: "DropzoneWatcher", dropzone_path: str):
        """
        Create a handler for one dropzone folder.

        Args:
            watcher: The owning DropzoneWatcher (kept as a back-reference)
            dropzone_path: Folder whose events this handler receives
        """
        # Without watchdog the base class is plain ``object``; there is no
        # handler initialisation to chain to in that case.
        if WATCHDOG_AVAILABLE:
            super().__init__()
        self.watcher = watcher
        self.dropzone_path = dropzone_path
        # filepath -> time.monotonic() reading taken at the most recent event
        self._pending_files: dict[str, float] = {}
        # Guards _pending_files; event callbacks and get_ready_files() are
        # expected to run on different threads.
        self._lock = threading.Lock()

    def on_created(self, event) -> None:
        """Handle file creation events (directories are ignored)."""
        if event.is_directory:
            return
        self._schedule_processing(event.src_path)

    def on_modified(self, event) -> None:
        """Handle file modification events (for copy completion detection)."""
        if event.is_directory:
            return
        self._schedule_processing(event.src_path)

    def _schedule_processing(self, filepath: str) -> None:
        """
        Schedule file for processing with debouncing.

        The path is resolved, checked against the dropzone boundary and the
        supported extensions, and then (re)registered with the current
        monotonic time so the debounce period starts over.

        Args:
            filepath: Path reported by the file system event
        """
        filepath = str(Path(filepath).resolve())

        # Security: Validate path is within dropzone
        if not _validate_path_in_dropzone(filepath, self.dropzone_path):
            logger.warning(
                "[DROPZONE] Rejected file outside dropzone",
                filepath=filepath,
                dropzone=self.dropzone_path,
            )
            return

        ext = Path(filepath).suffix.lower()

        if ext not in SUPPORTED_EXTENSIONS:
            logger.debug(
                "[DROPZONE] Ignoring file with unsupported extension",
                filepath=filepath,
                extension=ext,
            )
            return

        with self._lock:
            # Monotonic clock: only elapsed time matters for debouncing
            self._pending_files[filepath] = time.monotonic()
            logger.info(
                "[DROPZONE] File detected, scheduling processing",
                filepath=filepath,
                debounce_seconds=DROPZONE_DEBOUNCE_SECONDS,
            )

    def get_ready_files(self) -> list[str]:
        """
        Get files that are ready for processing (debounce period elapsed).

        Every path whose debounce period has elapsed is removed from the
        pending map, whether or not it is returned; paths that no longer
        exist on disk are dropped silently.

        Returns:
            List of file paths ready to process
        """
        ready: list[str] = []
        now = time.monotonic()

        with self._lock:
            # Collect first: the dict must not change size while iterating
            expired = [
                path
                for path, last_event in self._pending_files.items()
                if now - last_event >= DROPZONE_DEBOUNCE_SECONDS
            ]

            for path in expired:
                del self._pending_files[path]
                # No events for the whole debounce period - ready to process,
                # provided the file was not removed in the meantime
                if os.path.exists(path):
                    ready.append(path)

        return ready


class DropzoneWatcher:
    """
    Main dropzone watcher service.

    Monitors configured folders for new ERP files and triggers processing.
    Thread-safe singleton pattern for use across the application.
    """

    def __init__(self):
        self._observer: Optional["Observer"] = None
        self._handlers: list[DropzoneFileHandler] = []
        self._processing_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        self._running = False
        self._lock = threading.Lock()

        # Statistics
        self._stats = {
            "files_detected": 0,
            "files_processed": 0,
            "files_failed": 0,
            "last_file_at": None,
            "started_at": None,
        }

        # Database session factory (independent from main app)
        self._engine = None
        self._SessionLocal = None

    def _init_db(self):
        """
        Create the engine and session factory used for dropzone processing.

        Does nothing when an engine already exists. The connection URL comes
        from DATABASE_URL and falls back to a local SQLite file.
        """
        if self._engine is not None:
            return

        url = os.getenv("DATABASE_URL", "sqlite:///data/kaifarma.db")

        # SQLite needs check_same_thread=False for multi-threaded access
        is_sqlite = url.startswith("sqlite")

        engine = create_engine(
            url,
            poolclass=NullPool,
            connect_args={"check_same_thread": False} if is_sqlite else {},
        )
        self._engine = engine
        self._SessionLocal = sessionmaker(
            autocommit=False,
            autoflush=False,
            bind=engine,
        )

    def start(self) -> bool:
        """
        Start watching dropzone folders.

        Checked up front: watchdog must be importable, DROPZONE_ENABLED must
        be true and DROPZONE_PATHS must not be empty. Missing folders are
        created; configured paths that are not directories are skipped.
        Calling start() on a running watcher is a no-op.

        If anything fails part-way through, the observer and processing
        thread that were already launched are shut down again, so a failed
        start does not leave background threads behind.

        Returns:
            True if the watcher is running when the call returns (including
            the already-running case), False otherwise
        """
        if not WATCHDOG_AVAILABLE:
            logger.warning("[DROPZONE] Cannot start - watchdog library not installed")
            return False

        if not DROPZONE_ENABLED:
            logger.info("[DROPZONE] Dropzone watching disabled (DROPZONE_ENABLED=false)")
            return False

        if not DROPZONE_PATHS:
            logger.warning("[DROPZONE] No dropzone paths configured (DROPZONE_PATHS empty)")
            return False

        with self._lock:
            if self._running:
                logger.warning("[DROPZONE] Already running")
                return True

            try:
                self._init_db()
                self._observer = Observer()
                self._handlers = []

                # Set up watches for each configured path
                for configured in DROPZONE_PATHS:
                    folder = Path(configured).resolve()

                    if not folder.exists():
                        logger.warning(
                            "[DROPZONE] Creating dropzone folder",
                            path=str(folder),
                        )
                        folder.mkdir(parents=True, exist_ok=True)

                    if not folder.is_dir():
                        logger.error(
                            "[DROPZONE] Path is not a directory",
                            path=str(folder),
                        )
                        continue

                    handler = DropzoneFileHandler(self, str(folder))
                    self._handlers.append(handler)
                    self._observer.schedule(handler, str(folder), recursive=False)

                    logger.info(
                        "[DROPZONE] Watching folder",
                        path=str(folder),
                    )

                if not self._handlers:
                    logger.error("[DROPZONE] No valid dropzone paths to watch")
                    # The observer was never started; just drop the reference
                    self._cleanup()
                    return False

                # Start observer
                self._observer.start()

                # Start processing thread
                self._stop_event.clear()
                self._processing_thread = threading.Thread(
                    target=self._processing_loop,
                    daemon=True,
                    name="dropzone-processor",
                )
                self._processing_thread.start()

                self._running = True
                self._stats["started_at"] = datetime.utcnow().isoformat()

                logger.info(
                    "[DROPZONE] Watcher started",
                    paths=DROPZONE_PATHS,
                    poll_interval=DROPZONE_POLL_INTERVAL,
                    debounce_seconds=DROPZONE_DEBOUNCE_SECONDS,
                )

                return True

            except Exception as e:
                logger.error(
                    "[DROPZONE] Failed to start watcher",
                    error=str(e),
                    error_type=type(e).__name__,
                )

                # Undo a partial start. _cleanup() only drops references, so
                # threads that are already running must be stopped explicitly
                # or they would keep running with no way to reach them.
                self._stop_event.set()

                observer = self._observer
                if observer is not None and observer.is_alive():
                    try:
                        observer.stop()
                        observer.join(timeout=5)
                    except Exception as stop_error:
                        logger.warning(
                            "[DROPZONE] Failed to stop observer after start failure",
                            error=str(stop_error),
                        )

                worker = self._processing_thread
                if worker is not None and worker.is_alive():
                    worker.join(timeout=5)

                self._cleanup()
                self._running = False
                return False

    def stop(self):
        """Stop watching dropzone folders."""
        with self._lock:
            if not self._running:
                return

            logger.info("[DROPZONE] Stopping watcher...")

            # Signal processing thread to stop
            self._stop_event.set()

            # Stop observer
            if self._observer:
                self._observer.stop()
                self._observer.join(timeout=5)

            # Wait for processing thread
            if self._processing_thread and self._processing_thread.is_alive():
                self._processing_thread.join(timeout=5)

            self._cleanup()
            self._running = False

            logger.info("[DROPZONE] Watcher stopped")

    def _cleanup(self):
        """Release the observer, the processing thread and all handlers.

        Only references are dropped here; nothing is stopped or joined.
        """
        self._handlers = []
        self._observer = self._processing_thread = None

    def _processing_loop(self):
        """
        Main processing loop that checks for ready files.

        Runs in a separate thread, checking handlers for files
        that have passed the debounce period. Between passes it waits
        DROPZONE_POLL_INTERVAL seconds, or until the stop event is set.

        Failures are isolated per file: get_ready_files() has already removed
        the returned paths from the handler's pending map, so letting one
        failing file abort the pass would silently discard every other file
        in the same batch.
        """
        logger.info("[DROPZONE] Processing loop started")

        while not self._stop_event.is_set():
            try:
                # Check each handler for ready files
                for handler in self._handlers:
                    for filepath in handler.get_ready_files():
                        try:
                            self._process_file(filepath, handler.dropzone_path)
                        except Exception as e:
                            # Keep going: the rest of the batch would
                            # otherwise never be picked up again
                            logger.error(
                                "[DROPZONE] Error in processing loop",
                                filepath=filepath,
                                error=str(e),
                                error_type=type(e).__name__,
                            )

            except Exception as e:
                logger.error(
                    "[DROPZONE] Error in processing loop",
                    error=str(e),
                    error_type=type(e).__name__,
                )

            # Wait before next check (returns early when stop is requested)
            self._stop_event.wait(timeout=DROPZONE_POLL_INTERVAL)

        logger.info("[DROPZONE] Processing loop stopped")

    def _process_file(self, filepath: str, dropzone_path: str):
        """
        Process a single file from the dropzone.

        Steps, in order: look up the (single) pharmacy, read the file into
        memory, create and commit a QUEUED FileUpload record, hand the bytes
        to the processing pipeline on a background thread, then archive the
        file. Counters in the statistics dict are updated along the way.

        Args:
            filepath: Full path to the file
            dropzone_path: The dropzone folder it came from
        """
        from app.models import FileUpload, UploadStatus, Pharmacy
        from app.parsers.file_type_detector import detect_file_content_type_from_path
        from app.services.file_processing_service import file_processing_service

        self._stats["files_detected"] += 1
        self._stats["last_file_at"] = datetime.utcnow().isoformat()

        logger.info(
            "[DROPZONE] Processing file",
            filepath=filepath,
            dropzone_path=dropzone_path,
        )

        db = self._SessionLocal()
        try:
            # Local installs are single-tenant: use the first pharmacy row
            pharmacy = db.query(Pharmacy).first()
            if not pharmacy:
                logger.error(
                    "[DROPZONE] No pharmacy found - cannot process file",
                    filepath=filepath,
                )
                self._stats["files_failed"] += 1
                return

            # CRITICAL: copy the needed attributes into plain values BEFORE
            # spawning the thread, so the thread never touches the ORM
            # instance after this session is closed (DetachedInstanceError).
            pharmacy_id = str(pharmacy.id)
            erp_type = pharmacy.erp_type

            contents = self._read_with_retry(filepath)
            if contents is None:
                self._stats["files_failed"] += 1
                return

            # Detect file type; the result is only logged below.
            # Note: PDFs and other documents will return UNKNOWN and need
            # separate handling in future (OCR, specialized parsers)
            content_type = detect_file_content_type_from_path(filepath)

            # Create FileUpload record
            upload_id = str(uuid.uuid4())
            original_name = Path(filepath).name
            filename = _sanitize_filename(original_name)

            # NOTE(review): file_path stores the pre-archive location, and
            # _archive_file() below may move the file - confirm nothing
            # downstream reopens this path.
            db.add(
                FileUpload(
                    id=upload_id,
                    pharmacy_id=pharmacy_id,
                    filename=filename,  # Required field
                    original_filename=original_name,  # Keep original for reference
                    file_path=filepath,
                    file_size=len(contents),
                    status=UploadStatus.QUEUED,
                    processing_notes=f"[DROPZONE] Auto-detected from: {dropzone_path}",
                )
            )
            db.commit()

            logger.info(
                "[DROPZONE] Created FileUpload record",
                upload_id=upload_id,
                filename=filename,
                content_type=content_type,
                pharmacy_id=pharmacy_id,
            )

            # Route to processing pipeline (in background thread)
            # This matches the existing upload API behavior
            pipeline_args = (upload_id, filepath, contents, pharmacy_id, erp_type)
            worker = threading.Thread(
                target=file_processing_service.process_file_in_thread,
                args=pipeline_args,
                daemon=True,
                name=f"dropzone-process-{upload_id[:8]}",
            )
            worker.start()

            # Counts hand-off to the pipeline, not pipeline completion
            self._stats["files_processed"] += 1

            # Move file to processed folder (best effort)
            self._archive_file(filepath, dropzone_path)

        except Exception as e:
            logger.error(
                "[DROPZONE] Error processing file",
                filepath=filepath,
                error=str(e),
                error_type=type(e).__name__,
            )
            self._stats["files_failed"] += 1
            db.rollback()

        finally:
            db.close()

    def _read_with_retry(self, filepath: str) -> Optional[bytes]:
        """
        Read a dropzone file fully into memory, retrying on OS-level errors.

        Up to three attempts are made, one second apart, to cope with a file
        that is still locked or being written. Every failure is logged here;
        updating the failure counter is left to the caller.

        Args:
            filepath: Full path to the file

        Returns:
            The file contents, or None when the file could not be read
        """
        max_read_attempts = 3
        final_attempt = max_read_attempts - 1

        for attempt in range(max_read_attempts):
            try:
                with open(filepath, "rb") as f:
                    return f.read()
            except OSError as e:
                # PermissionError is an OSError, so locked files land here too
                if attempt == final_attempt:
                    logger.error(
                        "[DROPZONE] Failed to read file after retries",
                        filepath=filepath,
                        error=str(e),
                    )
                    return None
                logger.warning(
                    "[DROPZONE] File locked, retrying",
                    filepath=filepath,
                    attempt=attempt + 1,
                    error=str(e),
                )
                time.sleep(1)  # Wait before retry
            except Exception as e:
                logger.error(
                    "[DROPZONE] Failed to read file",
                    filepath=filepath,
                    error=str(e),
                )
                return None

        return None

    def _archive_file(self, filepath: str, dropzone_path: str):
        """
        Move a processed file into the dropzone's archive folder.

        The file is renamed into ``<dropzone>/processed/`` (created on
        demand) under a ``<UTC timestamp>_<sanitized name>`` filename. Any
        failure is logged as a warning and the file stays where it is.

        Security: the filename is sanitized and the final target is verified
        to resolve inside the archive folder before anything is moved.

        Args:
            filepath: Full path to the processed file
            dropzone_path: The dropzone folder
        """
        try:
            archive_dir = Path(dropzone_path) / "processed"
            archive_dir.mkdir(exist_ok=True)

            source = Path(filepath)

            # Security: never trust the raw name when building the target
            safe_name = _sanitize_filename(source.name)
            stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
            target = archive_dir / f"{stamp}_{safe_name}"

            # Security: last line of defence - target must stay in archive_dir
            if target.resolve().is_relative_to(archive_dir.resolve()):
                source.rename(target)
                logger.info(
                    "[DROPZONE] Archived processed file",
                    original=filepath,
                    archived=str(target),
                )
            else:
                logger.error(
                    "[DROPZONE] Archive path traversal blocked",
                    original=filepath,
                    attempted=str(target),
                )

        except Exception as e:
            logger.warning(
                "[DROPZONE] Failed to archive file (will remain in dropzone)",
                filepath=filepath,
                error=str(e),
            )

    def get_status(self) -> dict:
        """
        Report the watcher's configuration, state and counters.

        Returns:
            Dictionary with the module configuration, the running flag,
            watchdog availability and a copy of the statistics
        """
        # Copy so callers cannot modify the live counters
        stats_snapshot = dict(self._stats)

        status = {
            "enabled": DROPZONE_ENABLED,
            "running": self._running,
            "watchdog_available": WATCHDOG_AVAILABLE,
            "paths": DROPZONE_PATHS,
            "poll_interval_seconds": DROPZONE_POLL_INTERVAL,
            "debounce_seconds": DROPZONE_DEBOUNCE_SECONDS,
            "stats": stats_snapshot,
        }
        return status

    def is_running(self) -> bool:
        """
        Report whether the watcher is currently active.

        Returns:
            The running flag, which start() sets and stop() clears
        """
        running = self._running
        return running


# Singleton instance, created when this module is imported. Construction only
# initialises in-memory state; watching begins when start() is called.
dropzone_watcher = DropzoneWatcher()
