# app/services/file_extractor_service.py
"""Service for extracting text from different file types."""
import logging
from typing import Optional
import mimetypes

from fastapi import HTTPException
from youtube_transcript_api import YouTubeTranscriptApi

logger = logging.getLogger(__name__)


class FileExtractorService:
    """Turn uploaded file bytes into plain text.

    The file extension decides which extractor runs; the accepted extensions
    are the keys of ``SUPPORTED_EXTENSIONS``.
    """

    # Accepted extension -> MIME type label.
    SUPPORTED_EXTENSIONS = {
        ".txt": "text/plain",
        ".md": "text/markdown",
        ".pdf": "application/pdf",
        ".youtube": "video/youtube",
        ".youtube_url": "video/youtube",
    }

    @staticmethod
    def is_supported(filename: str) -> bool:
        """Return True when the extension of *filename* is an accepted one.

        The comparison is case-insensitive.
        """
        extension = _get_name_and_ext(filename)[1].lower()
        return extension in FileExtractorService.SUPPORTED_EXTENSIONS

    @staticmethod
    def extract_text(filename: str, content: bytes) -> str:
        """Extract text from *content*, choosing the extractor by extension.

        Args:
            filename: Name of the file; only its extension is inspected.
            content: Binary content of the file. For ``.youtube`` and
                ``.youtube_url`` files this is the UTF-8 encoded video URL.

        Returns:
            Extracted text as string.

        Raises:
            HTTPException: 415 for an unsupported extension; 400 when a
                YouTube file is empty or when an extractor fails with an
                unexpected error. HTTPExceptions raised by the extractors
                themselves pass through unchanged.
        """
        suffix = _get_name_and_ext(filename)[1].lower()
        supported = FileExtractorService.SUPPORTED_EXTENSIONS

        if suffix not in supported:
            raise HTTPException(
                status_code=415,
                detail=f"Unsupported file type: {suffix}. Supported: {list(supported)}"
            )

        try:
            # Extractors that work directly on the raw bytes.
            byte_handlers = {
                ".txt": _extract_text_file,
                ".md": _extract_markdown_file,
                ".pdf": _extract_pdf_file,
            }
            handler = byte_handlers.get(suffix)
            if handler is not None:
                return handler(content)

            # Defensive: an accepted extension without a handler.
            if suffix not in (".youtube", ".youtube_url"):
                raise HTTPException(status_code=415, detail=f"Unsupported file type: {suffix}")

            # For youtube files the payload is the video URL itself.
            video_url = content.decode("utf-8").strip()
            if video_url:
                return _extract_youtube_transcript(video_url)
            raise HTTPException(
                status_code=400,
                detail="YouTube file is empty. Please provide a valid YouTube URL."
            )
        except HTTPException:
            # Already carries the intended status code and message.
            raise
        except Exception as exc:
            logger.error("Failed to extract text from %s: %s", filename, exc)
            raise HTTPException(status_code=400, detail=f"Failed to extract text: {str(exc)}")


def _get_name_and_ext(filename: str) -> tuple[str, str]:
    """Split filename into name and extension."""
    if "." in filename:
        parts = filename.rsplit(".", 1)
        return parts[0], "." + parts[1]
    return filename, ""


def _extract_text_file(content: bytes) -> str:
    """Extract text from .txt file."""
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        raise HTTPException(
            status_code=415,
            detail="Text file must be UTF-8 encoded"
        )


def _extract_markdown_file(content: bytes) -> str:
    """Extract text from .md file."""
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        raise HTTPException(
            status_code=415,
            detail="Markdown file must be UTF-8 encoded"
        )


def _extract_pdf_file(content: bytes) -> str:
    """Extract the text of every page of a PDF document.

    PyPDF2 is imported inside the function so a missing optional dependency
    is reported as an HTTP error instead of breaking module import.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages in order, each page followed by a newline.

    Raises:
        HTTPException: 503 if PyPDF2 cannot be imported; 400 if no text could
            be extracted or PyPDF2 fails while reading the document.
    """
    try:
        import PyPDF2
    except ImportError as exc:
        raise HTTPException(
            status_code=503,
            detail="PDF support not installed. Install PyPDF2: pip install PyPDF2"
        ) from exc

    try:
        from io import BytesIO

        pdf_reader = PyPDF2.PdfReader(BytesIO(content))
        # Join once instead of growing a string page by page. "or ''" keeps a
        # page that yields no text (None) from aborting the whole extraction.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

        if not text.strip():
            raise HTTPException(
                status_code=400,
                detail="PDF contains no extractable text"
            )
        return text
    except HTTPException:
        # Keep the specific "no extractable text" error intact.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to extract PDF text: {str(e)}"
        ) from e


def _extract_youtube_transcript(youtube_url: str) -> str:
    """Fetch the transcript of a YouTube video and return it as plain text.

    An English transcript is requested first. If that request fails, the
    video's transcript listing is consulted instead: an English track is
    preferred, otherwise the first listed track is used and, when the track
    reports itself as translatable, an English translation is attempted.

    Args:
        youtube_url: URL of the video.

    Returns:
        The transcript entries' text joined with newlines.

    Raises:
        HTTPException: 503 if youtube-transcript-api cannot be imported;
            400 if no video ID can be extracted from the URL, if no
            transcript could be fetched, or if the transcript is empty.
    """
    try:
        # The module also imports this name at top level; this guard only
        # takes effect if that import is ever made optional.
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError as exc:
        raise HTTPException(
            status_code=503,
            detail="YouTube support not installed. Install youtube-transcript-api: pip install youtube-transcript-api"
        ) from exc

    try:
        video_id = _extract_youtube_video_id(youtube_url)
        if not video_id:
            raise HTTPException(
                status_code=400,
                detail="Invalid YouTube URL. Use: https://www.youtube.com/watch?v=VIDEO_ID or https://youtu.be/VIDEO_ID"
            )

        # fetch()/list() are instance methods, so an API object is needed.
        api = YouTubeTranscriptApi()

        try:
            transcript = api.fetch(video_id, languages=['en'])
        except Exception:
            # English could not be fetched directly; fall back to whatever
            # the transcript listing offers.
            try:
                transcript = _fetch_fallback_transcript(api, video_id)
            except Exception as fallback_error:
                logger.error("Failed to fetch YouTube transcript for %s: %s", video_id, fallback_error)
                raise HTTPException(
                    status_code=400,
                    detail=f"Failed to extract YouTube transcript: {str(fallback_error)}"
                ) from fallback_error

        full_text = _transcript_to_text(transcript)

        if not full_text.strip():
            raise HTTPException(
                status_code=400,
                detail="YouTube video has no transcript available"
            )

        return full_text
    except HTTPException:
        # Already carries the intended status code and message.
        raise
    except Exception as e:
        logger.error("Failed to extract YouTube transcript: %s", e)
        raise HTTPException(
            status_code=400,
            detail=f"Failed to extract YouTube transcript: {str(e)}"
        ) from e


def _select_transcript(transcript_list):
    """Pick a transcript from *transcript_list*, preferring English."""
    # NOTE(review): find_transcript may already cover both manually created
    # and generated tracks, which would make the next two lookups redundant.
    # They are kept to preserve the original lookup order; confirm against
    # the library docs before pruning.
    finders = (
        transcript_list.find_transcript,
        transcript_list.find_manually_created_transcript,
        transcript_list.find_generated_transcript,
    )
    for find in finders:
        try:
            return find(['en'])
        except Exception:
            # "except Exception" rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            continue

    # No English track: take the first listed one, whatever its language.
    all_transcripts = list(transcript_list)
    if not all_transcripts:
        raise LookupError("No transcripts available for this video")
    return all_transcripts[0]


def _fetch_fallback_transcript(api, video_id: str):
    """Fetch a listed transcript for *video_id*, translated to English if possible."""
    transcript_obj = _select_transcript(api.list(video_id))
    transcript = transcript_obj.fetch()

    if transcript_obj.language_code != 'en':
        try:
            if transcript_obj.is_translatable:
                logger.info("Translating %s transcript to English", transcript_obj.language_code)
                translated_transcript = transcript_obj.translate('en')
                transcript = translated_transcript.fetch()
        except Exception as trans_error:
            # Best effort: a failed translation keeps the original-language
            # transcript instead of failing the whole extraction.
            logger.warning("Could not translate %s to English, using original: %s", 
                          transcript_obj.language_code, trans_error)
    return transcript


def _transcript_to_text(transcript) -> str:
    """Join the "text" field of every transcript entry with newlines."""
    # Accept several shapes so that differing library versions keep working:
    # an object exposing to_raw_data(), a plain list of dicts, or any other
    # iterable whose items are stringified.
    if hasattr(transcript, 'to_raw_data'):
        raw_data = transcript.to_raw_data()
    elif isinstance(transcript, list):
        raw_data = transcript
    else:
        raw_data = [{"text": str(item)} for item in transcript]

    return "\n".join(entry["text"] for entry in raw_data)


def _extract_youtube_video_id(url: str) -> Optional[str]:
    """Extract video ID from YouTube URL."""
    import re
    
    # Pattern for youtube.com/watch?v=VIDEO_ID
    match = re.search(r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})", url)
    if match:
        return match.group(1)
    
    # Pattern for short URLs like youtu.be/VIDEO_ID
    if "youtu.be/" in url:
        return url.split("youtu.be/")[-1].split("?")[0]
    
    return None