Adds EPUB as a first-class input source for skill generation. - EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern - Dublin Core metadata, spine items, code blocks, tables, images extraction - DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast - EPUB 3 NCX TOC bug workaround (ignore_ncx=True) - ebooklib as optional dep: pip install skill-seekers[epub] - Wired into create command with .epub auto-detection - 104 tests, all passing Review fixes: removed 3 empty test stubs, fixed SVG double-counting in _extract_images(), added logger.debug to bare except pass. Based on PR #310 by @christianbaumann. Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""Source type detection for unified create command.
|
|
|
|
Auto-detects whether a source is a web URL, GitHub repository,
|
|
local directory, PDF file, or config file based on patterns.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class SourceInfo:
    """Information about a detected source.

    Attributes:
        type: Source type ('web', 'github', 'local', 'pdf', 'word',
            'epub', 'video', 'config')
        parsed: Parsed source information (e.g., {'url': '...'}, {'repo': '...'},
            {'file_path': '...'}, {'directory': '...'})
        suggested_name: Auto-suggested name for the skill
        raw_input: Original user input
    """

    # One of the detector's known source types; see SourceDetector.detect.
    type: str
    # Type-specific payload consumed downstream (keys vary per type).
    parsed: dict[str, Any]
    # Default skill name derived from the source (file stem, repo name, domain).
    suggested_name: str
    # Exact string the user supplied, kept for error messages/round-tripping.
    raw_input: str
|
|
|
|
|
|
class SourceDetector:
    """Detects source type from user input and extracts relevant information."""

    # GitHub "owner/repo" shorthand; must match the entire input.
    GITHUB_REPO_PATTERN = re.compile(r"^([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)$")
    # GitHub URL with optional scheme, "www." prefix, and ".git" suffix.
    # NOTE: "." is in the repo character class, so a trailing ".git" ends up
    # inside the captured repo name; _detect_github strips it afterwards.
    GITHUB_URL_PATTERN = re.compile(
        r"(?:https?://)?(?:www\.)?github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)(?:\.git)?"
    )

    # Local video container extensions, matched case-insensitively.
    # Class-level constant: previously rebuilt inside detect() on every call.
    VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")

    @classmethod
    def detect(cls, source: str) -> SourceInfo:
        """Detect source type and extract information.

        Args:
            source: User input (URL, path, repo, etc.)

        Returns:
            SourceInfo object with detected type and parsed data

        Raises:
            ValueError: If source type cannot be determined
        """
        lower = source.lower()

        # 1. File extension detection. Compared case-insensitively so that
        # e.g. "Manual.PDF" is recognized, consistent with the video check.
        if lower.endswith(".json"):
            return cls._detect_config(source)
        if lower.endswith(".pdf"):
            return cls._detect_pdf(source)
        if lower.endswith(".docx"):
            return cls._detect_word(source)
        if lower.endswith(".epub"):
            return cls._detect_epub(source)
        if lower.endswith(cls.VIDEO_EXTENSIONS):
            return cls._detect_video_file(source)

        # 2. Video URL detection (before directory check)
        video_url_info = cls._detect_video_url(source)
        if video_url_info:
            return video_url_info

        # 3. Directory detection
        if os.path.isdir(source):
            return cls._detect_local(source)

        # 4. GitHub patterns
        github_info = cls._detect_github(source)
        if github_info:
            return github_info

        # 5. URL detection
        if source.startswith(("http://", "https://")):
            return cls._detect_web(source)

        # 6. Domain inference (add https://)
        if "." in source and not source.startswith("/"):
            return cls._detect_web(f"https://{source}")

        # 7. Error - cannot determine
        raise ValueError(
            f"Cannot determine source type for: {source}\n\n"
            "Examples:\n"
            "  Web: skill-seekers create https://docs.react.dev/\n"
            "  GitHub: skill-seekers create facebook/react\n"
            "  Local: skill-seekers create ./my-project\n"
            "  PDF: skill-seekers create tutorial.pdf\n"
            "  DOCX: skill-seekers create document.docx\n"
            "  EPUB: skill-seekers create ebook.epub\n"
            "  Video: skill-seekers create https://youtube.com/watch?v=...\n"
            "  Video: skill-seekers create recording.mp4\n"
            "  Config: skill-seekers create configs/react.json"
        )

    @classmethod
    def _file_source(
        cls,
        source_type: str,
        source: str,
        *,
        path_key: str = "file_path",
        **extra: Any,
    ) -> SourceInfo:
        """Build a SourceInfo for a file-backed source.

        The suggested name is the file stem (basename without extension).
        ``path_key`` lets the config type keep its historical 'config_path'
        key; ``extra`` merges additional parsed fields (e.g. source_kind).
        """
        name = os.path.splitext(os.path.basename(source))[0]
        return SourceInfo(
            type=source_type,
            parsed={path_key: source, **extra},
            suggested_name=name,
            raw_input=source,
        )

    @classmethod
    def _detect_config(cls, source: str) -> SourceInfo:
        """Detect config file source."""
        return cls._file_source("config", source, path_key="config_path")

    @classmethod
    def _detect_pdf(cls, source: str) -> SourceInfo:
        """Detect PDF file source."""
        return cls._file_source("pdf", source)

    @classmethod
    def _detect_word(cls, source: str) -> SourceInfo:
        """Detect Word document (.docx) source."""
        return cls._file_source("word", source)

    @classmethod
    def _detect_epub(cls, source: str) -> SourceInfo:
        """Detect EPUB file source."""
        return cls._file_source("epub", source)

    @classmethod
    def _detect_video_file(cls, source: str) -> SourceInfo:
        """Detect local video file source."""
        return cls._file_source("video", source, source_kind="file")

    @classmethod
    def _detect_video_url(cls, source: str) -> SourceInfo | None:
        """Detect video platform URL (YouTube, Vimeo).

        Returns SourceInfo if the source is a video URL, None otherwise.
        """
        lower = source.lower()

        # YouTube patterns (videos, shorts, embeds, playlists, channels).
        youtube_keywords = [
            "youtube.com/watch",
            "youtu.be/",
            "youtube.com/playlist",
            "youtube.com/@",
            "youtube.com/channel/",
            "youtube.com/c/",
            "youtube.com/shorts/",
            "youtube.com/embed/",
        ]
        if any(kw in lower for kw in youtube_keywords):
            # Suggested name reflects whether this is a single video,
            # a playlist, or a whole channel.
            if "playlist" in lower:
                name = "youtube_playlist"
            elif "/@" in lower or "/channel/" in lower or "/c/" in lower:
                name = "youtube_channel"
            else:
                name = "youtube_video"
            return SourceInfo(
                type="video",
                parsed={"url": source, "source_kind": "url"},
                suggested_name=name,
                raw_input=source,
            )

        # Vimeo patterns
        if "vimeo.com/" in lower:
            return SourceInfo(
                type="video",
                parsed={"url": source, "source_kind": "url"},
                suggested_name="vimeo_video",
                raw_input=source,
            )

        return None

    @classmethod
    def _detect_local(cls, source: str) -> SourceInfo:
        """Detect local directory source."""
        # Normalize so downstream code always sees an absolute path.
        directory = os.path.abspath(source)
        name = os.path.basename(directory)

        return SourceInfo(
            type="local", parsed={"directory": directory}, suggested_name=name, raw_input=source
        )

    @classmethod
    def _detect_github(cls, source: str) -> SourceInfo | None:
        """Detect GitHub repository source.

        Supports patterns:
        - owner/repo
        - github.com/owner/repo
        - https://github.com/owner/repo
        """
        # Try simple owner/repo pattern first
        match = cls.GITHUB_REPO_PATTERN.match(source)
        if match:
            owner, repo = match.groups()
            return SourceInfo(
                type="github",
                parsed={"repo": f"{owner}/{repo}"},
                suggested_name=repo,
                raw_input=source,
            )

        # Try GitHub URL pattern
        match = cls.GITHUB_URL_PATTERN.search(source)
        if match:
            owner, repo = match.groups()
            # The regex captures ".git" inside the repo group; strip it here.
            repo = repo.removesuffix(".git")
            return SourceInfo(
                type="github",
                parsed={"repo": f"{owner}/{repo}"},
                suggested_name=repo,
                raw_input=source,
            )

        return None

    @classmethod
    def _detect_web(cls, source: str) -> SourceInfo:
        """Detect web documentation source."""
        # Parse URL to extract domain for suggested name
        parsed_url = urlparse(source)
        domain = parsed_url.netloc or parsed_url.path

        # Clean up domain for name suggestion:
        #   docs.react.dev -> react, reactjs.org -> reactjs
        # removeprefix (not replace) so mid-string occurrences survive,
        # e.g. "mydocs.example.com" -> "mydocs", not "myexample".
        name = domain.removeprefix("www.").removeprefix("docs.")
        name = name.split(".")[0]  # Take first part before TLD

        return SourceInfo(type="web", parsed={"url": source}, suggested_name=name, raw_input=source)

    @staticmethod
    def _require_file(path: str, label: str) -> None:
        """Raise ValueError unless *path* is an existing regular file."""
        if not os.path.exists(path):
            raise ValueError(f"{label} does not exist: {path}")
        if not os.path.isfile(path):
            raise ValueError(f"Path is not a file: {path}")

    @classmethod
    def validate_source(cls, source_info: SourceInfo) -> None:
        """Validate that source is accessible.

        Args:
            source_info: Detected source information

        Raises:
            ValueError: If source is not accessible
        """
        parsed = source_info.parsed

        if source_info.type == "local":
            directory = parsed["directory"]
            if not os.path.exists(directory):
                raise ValueError(f"Directory does not exist: {directory}")
            if not os.path.isdir(directory):
                raise ValueError(f"Path is not a directory: {directory}")

        elif source_info.type == "pdf":
            cls._require_file(parsed["file_path"], "PDF file")

        elif source_info.type == "word":
            cls._require_file(parsed["file_path"], "Word document")

        elif source_info.type == "epub":
            cls._require_file(parsed["file_path"], "EPUB file")

        elif source_info.type == "video":
            # URL-based video sources are validated during processing.
            if parsed.get("source_kind") == "file":
                cls._require_file(parsed["file_path"], "Video file")

        elif source_info.type == "config":
            cls._require_file(parsed["config_path"], "Config file")

        # For web and github, validation happens during scraping
        # (URL accessibility, repo existence)
|