"""Source type detection for unified create command. Auto-detects source type from user input — supports web URLs, GitHub repos, local directories, and 14+ file types (PDF, DOCX, EPUB, IPYNB, HTML, YAML/OpenAPI, AsciiDoc, PPTX, RSS/Atom, man pages, video files, and config JSON). Note: Confluence, Notion, and Slack/Discord chat sources are API/export-based and cannot be auto-detected from a single argument. Use their dedicated subcommands (``skill-seekers confluence``, ``notion``, ``chat``) instead. """ import os import re from dataclasses import dataclass from typing import Any from urllib.parse import urlparse import logging logger = logging.getLogger(__name__) @dataclass class SourceInfo: """Information about a detected source. Attributes: type: Source type ('web', 'github', 'local', 'pdf', 'config') parsed: Parsed source information (e.g., {'url': '...'}, {'repo': '...'}) suggested_name: Auto-suggested name for the skill raw_input: Original user input """ type: str parsed: dict[str, Any] suggested_name: str raw_input: str class SourceDetector: """Detects source type from user input and extracts relevant information.""" # GitHub repo patterns GITHUB_REPO_PATTERN = re.compile(r"^([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)$") GITHUB_URL_PATTERN = re.compile( r"(?:https?://)?(?:www\.)?github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)(?:\.git)?" ) @classmethod def detect(cls, source: str) -> SourceInfo: """Detect source type and extract information. Args: source: User input (URL, path, repo, etc.) Returns: SourceInfo object with detected type and parsed data Raises: ValueError: If source type cannot be determined """ # 1. File extension detection if source.endswith(".json"): return cls._detect_config(source) if source.endswith(".pdf"): return cls._detect_pdf(source) if source.endswith(".docx"): return cls._detect_word(source) if source.endswith(".epub"): return cls._detect_epub(source) if source.endswith(".ipynb"): return cls._detect_jupyter(source) if source.lower().endswith((".html", ".htm")): return cls._detect_html(source) if source.endswith(".pptx"): return cls._detect_pptx(source) if source.lower().endswith((".adoc", ".asciidoc")): return cls._detect_asciidoc(source) # Man page file extensions (.1 through .8, .man) # Only match if the basename looks like a man page (e.g., "git.1", not "log.1") # Require basename without the extension to be a plausible command name if source.lower().endswith(".man"): return cls._detect_manpage(source) MAN_SECTION_EXTENSIONS = (".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8") if source.lower().endswith(MAN_SECTION_EXTENSIONS): # Heuristic: man pages have a simple basename (no dots before extension) # e.g., "git.1" is a man page, "access.log.1" is not basename_no_ext = os.path.splitext(os.path.basename(source))[0] if "." not in basename_no_ext: return cls._detect_manpage(source) # Video file extensions VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv") if source.lower().endswith(VIDEO_EXTENSIONS): return cls._detect_video_file(source) # RSS/Atom feed file extensions (only .rss and .atom — .xml is too generic) if source.lower().endswith((".rss", ".atom")): return cls._detect_rss(source) # OpenAPI/Swagger spec detection (YAML files with OpenAPI content) # Sniff file content for 'openapi:' or 'swagger:' keys before committing if ( source.lower().endswith((".yaml", ".yml")) and os.path.isfile(source) and cls._looks_like_openapi(source) ): return cls._detect_openapi(source) # 2. Video URL detection (before directory check) video_url_info = cls._detect_video_url(source) if video_url_info: return video_url_info # 3. Directory detection if os.path.isdir(source): return cls._detect_local(source) # 4. GitHub patterns github_info = cls._detect_github(source) if github_info: return github_info # 5. URL detection if source.startswith("http://") or source.startswith("https://"): return cls._detect_web(source) # 6. Domain inference (add https://) if "." in source and not source.startswith("/"): return cls._detect_web(f"https://{source}") # 7. Error - cannot determine raise ValueError( f"Cannot determine source type for: {source}\n\n" "Examples:\n" " Web: skill-seekers create https://docs.react.dev/\n" " GitHub: skill-seekers create facebook/react\n" " Local: skill-seekers create ./my-project\n" " PDF: skill-seekers create tutorial.pdf\n" " DOCX: skill-seekers create document.docx\n" " EPUB: skill-seekers create ebook.epub\n" " Jupyter: skill-seekers create notebook.ipynb\n" " HTML: skill-seekers create page.html\n" " OpenAPI: skill-seekers create openapi.yaml\n" " AsciiDoc: skill-seekers create document.adoc\n" " PowerPoint: skill-seekers create presentation.pptx\n" " RSS: skill-seekers create feed.rss\n" " Man page: skill-seekers create command.1\n" " Video: skill-seekers create https://youtube.com/watch?v=...\n" " Video: skill-seekers create recording.mp4\n" " Config: skill-seekers create configs/react.json" ) @classmethod def _detect_config(cls, source: str) -> SourceInfo: """Detect config file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="config", parsed={"config_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_pdf(cls, source: str) -> SourceInfo: """Detect PDF file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="pdf", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_word(cls, source: str) -> SourceInfo: """Detect Word document (.docx) source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_epub(cls, source: str) -> SourceInfo: """Detect EPUB file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="epub", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_jupyter(cls, source: str) -> SourceInfo: """Detect Jupyter Notebook file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="jupyter", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_html(cls, source: str) -> SourceInfo: """Detect local HTML file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="html", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_pptx(cls, source: str) -> SourceInfo: """Detect PowerPoint file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="pptx", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_asciidoc(cls, source: str) -> SourceInfo: """Detect AsciiDoc file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="asciidoc", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_manpage(cls, source: str) -> SourceInfo: """Detect man page file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="manpage", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_rss(cls, source: str) -> SourceInfo: """Detect RSS/Atom feed file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="rss", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _looks_like_openapi(cls, source: str) -> bool: """Check if a YAML/JSON file looks like an OpenAPI or Swagger spec. Reads the first few lines to look for 'openapi:' or 'swagger:' keys. Args: source: Path to the file Returns: True if the file appears to be an OpenAPI/Swagger spec """ try: with open(source, encoding="utf-8", errors="replace") as f: # Read first 20 lines — the openapi/swagger key is always near the top for _ in range(20): line = f.readline() if not line: break stripped = line.strip().lower() if stripped.startswith("openapi:") or stripped.startswith("swagger:"): return True if stripped.startswith('"openapi"') or stripped.startswith('"swagger"'): return True except OSError: pass return False @classmethod def _detect_openapi(cls, source: str) -> SourceInfo: """Detect OpenAPI/Swagger spec file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="openapi", parsed={"file_path": source}, suggested_name=name, raw_input=source ) @classmethod def _detect_video_file(cls, source: str) -> SourceInfo: """Detect local video file source.""" name = os.path.splitext(os.path.basename(source))[0] return SourceInfo( type="video", parsed={"file_path": source, "source_kind": "file"}, suggested_name=name, raw_input=source, ) @classmethod def _detect_video_url(cls, source: str) -> SourceInfo | None: """Detect video platform URL (YouTube, Vimeo). Returns SourceInfo if the source is a video URL, None otherwise. """ lower = source.lower() # YouTube patterns youtube_keywords = [ "youtube.com/watch", "youtu.be/", "youtube.com/playlist", "youtube.com/@", "youtube.com/channel/", "youtube.com/c/", "youtube.com/shorts/", "youtube.com/embed/", ] if any(kw in lower for kw in youtube_keywords): # Determine suggested name if "playlist" in lower: name = "youtube_playlist" elif "/@" in lower or "/channel/" in lower or "/c/" in lower: name = "youtube_channel" else: name = "youtube_video" return SourceInfo( type="video", parsed={"url": source, "source_kind": "url"}, suggested_name=name, raw_input=source, ) # Vimeo patterns if "vimeo.com/" in lower: return SourceInfo( type="video", parsed={"url": source, "source_kind": "url"}, suggested_name="vimeo_video", raw_input=source, ) return None @classmethod def _detect_local(cls, source: str) -> SourceInfo: """Detect local directory source.""" # Clean up path directory = os.path.abspath(source) name = os.path.basename(directory) return SourceInfo( type="local", parsed={"directory": directory}, suggested_name=name, raw_input=source ) @classmethod def _detect_github(cls, source: str) -> SourceInfo | None: """Detect GitHub repository source. Supports patterns: - owner/repo - github.com/owner/repo - https://github.com/owner/repo """ # Try simple owner/repo pattern first match = cls.GITHUB_REPO_PATTERN.match(source) if match: owner, repo = match.groups() return SourceInfo( type="github", parsed={"repo": f"{owner}/{repo}"}, suggested_name=repo, raw_input=source, ) # Try GitHub URL pattern match = cls.GITHUB_URL_PATTERN.search(source) if match: owner, repo = match.groups() # Clean up repo name (remove .git suffix if present) if repo.endswith(".git"): repo = repo[:-4] return SourceInfo( type="github", parsed={"repo": f"{owner}/{repo}"}, suggested_name=repo, raw_input=source, ) return None @classmethod def _detect_web(cls, source: str) -> SourceInfo: """Detect web documentation source.""" # Parse URL to extract domain for suggested name parsed_url = urlparse(source) domain = parsed_url.netloc or parsed_url.path # Clean up domain for name suggestion # docs.react.dev -> react # reactjs.org -> react name = domain.replace("www.", "").replace("docs.", "") name = name.split(".")[0] # Take first part before TLD return SourceInfo(type="web", parsed={"url": source}, suggested_name=name, raw_input=source) @classmethod def validate_source(cls, source_info: SourceInfo) -> None: """Validate that source is accessible. Args: source_info: Detected source information Raises: ValueError: If source is not accessible """ if source_info.type == "local": directory = source_info.parsed["directory"] if not os.path.exists(directory): raise ValueError(f"Directory does not exist: {directory}") if not os.path.isdir(directory): raise ValueError(f"Path is not a directory: {directory}") elif source_info.type == "pdf": file_path = source_info.parsed["file_path"] if not os.path.exists(file_path): raise ValueError(f"PDF file does not exist: {file_path}") if not os.path.isfile(file_path): raise ValueError(f"Path is not a file: {file_path}") elif source_info.type == "word": file_path = source_info.parsed["file_path"] if not os.path.exists(file_path): raise ValueError(f"Word document does not exist: {file_path}") if not os.path.isfile(file_path): raise ValueError(f"Path is not a file: {file_path}") elif source_info.type == "epub": file_path = source_info.parsed["file_path"] if not os.path.exists(file_path): raise ValueError(f"EPUB file does not exist: {file_path}") if not os.path.isfile(file_path): raise ValueError(f"Path is not a file: {file_path}") elif source_info.type == "video": if source_info.parsed.get("source_kind") == "file": file_path = source_info.parsed["file_path"] if not os.path.exists(file_path): raise ValueError(f"Video file does not exist: {file_path}") if not os.path.isfile(file_path): raise ValueError(f"Path is not a file: {file_path}") # URL-based video sources are validated during processing elif source_info.type == "config": config_path = source_info.parsed["config_path"] if not os.path.exists(config_path): raise ValueError(f"Config file does not exist: {config_path}") if not os.path.isfile(config_path): raise ValueError(f"Path is not a file: {config_path}") elif source_info.type in ("jupyter", "html", "pptx", "asciidoc", "manpage", "openapi"): file_path = source_info.parsed.get("file_path", "") if file_path: type_label = source_info.type.upper() if not os.path.exists(file_path): raise ValueError(f"{type_label} file does not exist: {file_path}") if not os.path.isfile(file_path) and not os.path.isdir(file_path): raise ValueError(f"Path is not a file or directory: {file_path}") elif source_info.type == "rss": file_path = source_info.parsed.get("file_path", "") if file_path and not os.path.exists(file_path): raise ValueError(f"RSS/Atom file does not exist: {file_path}") # For web, github, confluence, notion, chat, rss (URL), validation happens # during scraping (URL accessibility, API auth, etc.)