diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index 36f2e7c..1cbe67c 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -485,8 +485,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]: structure = { "title": doc.title, "headers": [ - {"level": h.level, "text": h.text, "line": h.source_line} - for h in doc.headings + {"level": h.level, "text": h.text, "line": h.source_line} for h in doc.headings ], "code_blocks": [ { @@ -508,12 +507,10 @@ def extract_rst_structure(content: str) -> dict[str, Any]: for t in doc.tables ], "links": [ - {"text": x.text or x.target, "url": x.target} - for x in doc.external_links + {"text": x.text or x.target, "url": x.target} for x in doc.external_links ], "cross_references": [ - {"type": x.ref_type.value, "target": x.target} - for x in doc.internal_links + {"type": x.ref_type.value, "target": x.target} for x in doc.internal_links ], "word_count": len(content.split()), "line_count": len(content.split("\n")), @@ -569,7 +566,9 @@ def extract_rst_structure(content: str) -> dict[str, Any]: structure["title"] = text # Basic code block extraction - code_block_pattern = re.compile(r"\.\.\s+code-block::\s+(\w+)\s*\n\s+(.*?)(?=\n\S|\Z)", re.DOTALL) + code_block_pattern = re.compile( + r"\.\.\s+code-block::\s+(\w+)\s*\n\s+(.*?)(?=\n\S|\Z)", re.DOTALL + ) for match in code_block_pattern.finditer(content): language = match.group(1) or "text" code = match.group(2).strip() @@ -585,9 +584,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]: # Basic link extraction link_pattern = re.compile(r"`([^<`]+)\s+<([^>]+)>`_") for match in link_pattern.finditer(content): - structure["links"].append( - {"text": match.group(1).strip(), "url": match.group(2)} - ) + structure["links"].append({"text": match.group(1).strip(), "url": match.group(2)}) return structure @@ -729,8 +726,12 @@ def process_markdown_docs( ], "tables": len(parsed_doc.tables), "cross_refs": len(parsed_doc.internal_links), - "directives": len([b for b in parsed_doc.blocks if b.type.value == "admonition"]), - "word_count": parsed_doc.stats.total_blocks if parsed_doc.stats else 0, + "directives": len( + [b for b in parsed_doc.blocks if b.type.value == "admonition"] + ), + "word_count": parsed_doc.stats.total_blocks + if parsed_doc.stats + else 0, "line_count": len(content.split("\n")), } else: @@ -752,7 +753,9 @@ def process_markdown_docs( "tables": len(parsed_doc.tables), "images": len(parsed_doc.images), "links": len(parsed_doc.external_links), - "word_count": parsed_doc.stats.total_blocks if parsed_doc.stats else 0, + "word_count": parsed_doc.stats.total_blocks + if parsed_doc.stats + else 0, "line_count": len(content.split("\n")), } except ImportError: @@ -789,10 +792,15 @@ def process_markdown_docs( "tables": len(parsed_doc.tables), "cross_references": len(parsed_doc.internal_links), "code_blocks": len(parsed_doc.code_blocks), - "images": len(getattr(parsed_doc, 'images', [])), + "images": len(getattr(parsed_doc, "images", [])), "quality_scores": { - "avg_code_quality": sum(cb.quality_score or 0 for cb in parsed_doc.code_blocks) / len(parsed_doc.code_blocks) if parsed_doc.code_blocks else 0, - } + "avg_code_quality": sum( + cb.quality_score or 0 for cb in parsed_doc.code_blocks + ) + / len(parsed_doc.code_blocks) + if parsed_doc.code_blocks + else 0, + }, } processed_docs.append(doc_data) @@ -850,8 +858,12 @@ def process_markdown_docs( enhanced_count = sum(1 for doc in processed_docs if doc.get("_enhanced", False)) if enhanced_count > 0: total_tables = sum(doc.get("parsed_data", {}).get("tables", 0) for doc in processed_docs) - total_xrefs = sum(doc.get("parsed_data", {}).get("cross_references", 0) for doc in processed_docs) - total_code_blocks = sum(doc.get("parsed_data", {}).get("code_blocks", 0) for doc in processed_docs) + total_xrefs = sum( + doc.get("parsed_data", {}).get("cross_references", 0) for doc in processed_docs + ) + total_code_blocks = sum( + doc.get("parsed_data", {}).get("code_blocks", 0) for doc in processed_docs + ) extraction_summary = { "enhanced_files": enhanced_count, diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 83caace..3981303 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -426,8 +426,7 @@ class DocToSkillConverter: "url": url, "title": doc.title or "", "content": "\n\n".join( - p for p in doc._extract_content_text().split("\n\n") - if len(p.strip()) >= 20 + p for p in doc._extract_content_text().split("\n\n") if len(p.strip()) >= 20 ), "headings": [ {"level": f"h{h.level}", "text": h.text, "id": h.id or ""} @@ -2309,9 +2308,7 @@ def execute_enhancement(config: dict[str, Any], args: argparse.Namespace, conver # Check if workflow was already executed (for logging context) workflow_executed = ( - converter - and hasattr(converter, 'workflow_executed') - and converter.workflow_executed + converter and hasattr(converter, "workflow_executed") and converter.workflow_executed ) workflow_name = converter.workflow_name if workflow_executed else None @@ -2328,7 +2325,9 @@ def execute_enhancement(config: dict[str, Any], args: argparse.Namespace, conver logger.info("=" * 80) if workflow_executed: logger.info(f" Running after workflow: {workflow_name}") - logger.info(" (Workflow provides specialized analysis, enhancement provides general improvements)") + logger.info( + " (Workflow provides specialized analysis, enhancement provides general improvements)" + ) logger.info("") try: diff --git a/src/skill_seekers/cli/enhancement_workflow.py b/src/skill_seekers/cli/enhancement_workflow.py index 85886e3..3658a17 100644 --- a/src/skill_seekers/cli/enhancement_workflow.py +++ b/src/skill_seekers/cli/enhancement_workflow.py @@ -197,9 +197,7 @@ class WorkflowEngine: extends=data.get("extends"), ) - def _merge_workflows( - self, parent: EnhancementWorkflow, child_data: dict - ) -> dict: + def _merge_workflows(self, parent: EnhancementWorkflow, child_data: dict) -> dict: """Merge child workflow with parent (inheritance).""" # Start with parent as dict merged = { @@ -239,12 +237,8 @@ class WorkflowEngine: parent_post = parent.post_process child_post = child_data.get("post_process", {}) merged["post_process"] = { - "remove_sections": child_post.get( - "remove_sections", parent_post.remove_sections - ), - "reorder_sections": child_post.get( - "reorder_sections", parent_post.reorder_sections - ), + "remove_sections": child_post.get("remove_sections", parent_post.remove_sections), + "reorder_sections": child_post.get("reorder_sections", parent_post.reorder_sections), "add_metadata": { **parent_post.add_metadata, **child_post.get("add_metadata", {}), @@ -285,9 +279,7 @@ class WorkflowEngine: logger.info(f"šŸ”„ Running stage {idx}/{len(self.workflow.stages)}: {stage.name}") # Build stage context - stage_context = self._build_stage_context( - stage, current_results, context - ) + stage_context = self._build_stage_context(stage, current_results, context) # Run stage try: @@ -408,9 +400,7 @@ class WorkflowEngine: return result - def _merge_stage_results( - self, current: dict, stage_results: dict, target: str - ) -> dict: + def _merge_stage_results(self, current: dict, stage_results: dict, target: str) -> dict: """Merge stage results into current results.""" if target == "all": # Merge everything diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index 421897c..baf2c8f 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -1454,7 +1454,9 @@ def main(): logger.info("=" * 80) if workflow_executed: logger.info(f" Running after workflow: {workflow_name}") - logger.info(" (Workflow provides specialized analysis, enhancement provides general improvements)") + logger.info( + " (Workflow provides specialized analysis, enhancement provides general improvements)" + ) logger.info("") if api_key: @@ -1491,7 +1493,9 @@ def main(): logger.info(f" skill-seekers enhance {skill_dir}/ --enhance-level 2") logger.info(" (auto-detects API vs LOCAL mode based on ANTHROPIC_API_KEY)") logger.info("\nšŸ’” Or use a workflow:") - logger.info(f" skill-seekers github --repo {config['repo']} --enhance-workflow architecture-comprehensive") + logger.info( + f" skill-seekers github --repo {config['repo']} --enhance-workflow architecture-comprehensive" + ) logger.info(f"\nNext step: skill-seekers package {skill_dir}/") diff --git a/src/skill_seekers/cli/parsers/extractors/base_parser.py b/src/skill_seekers/cli/parsers/extractors/base_parser.py index 362aa8d..beb0f27 100644 --- a/src/skill_seekers/cli/parsers/extractors/base_parser.py +++ b/src/skill_seekers/cli/parsers/extractors/base_parser.py @@ -20,6 +20,7 @@ logger = logging.getLogger(__name__) @dataclass class ParseResult: """Result of parsing a document.""" + document: Document | None = None success: bool = False errors: list[str] = field(default_factory=list) @@ -56,11 +57,11 @@ class BaseParser(ABC): - encoding: str = 'utf-8' """ self.options = options or {} - self._include_comments = self.options.get('include_comments', False) - self._extract_metadata = self.options.get('extract_metadata', True) - self._quality_scoring = self.options.get('quality_scoring', True) - self._max_file_size = self.options.get('max_file_size_mb', 50.0) * 1024 * 1024 - self._encoding = self.options.get('encoding', 'utf-8') + self._include_comments = self.options.get("include_comments", False) + self._extract_metadata = self.options.get("extract_metadata", True) + self._quality_scoring = self.options.get("quality_scoring", True) + self._max_file_size = self.options.get("max_file_size_mb", 50.0) * 1024 * 1024 + self._encoding = self.options.get("encoding", "utf-8") @property @abstractmethod @@ -149,15 +150,19 @@ class BaseParser(ABC): def parse_string(self, content: str, source_path: str = "") -> ParseResult: """Parse content from string.""" + # Create a wrapper that looks like a path class StringSource: def __init__(self, content: str, path: str): self._content = content self._path = path - def read_text(self, encoding: str = 'utf-8') -> str: + + def read_text(self, encoding: str = "utf-8") -> str: return self._content + def exists(self) -> bool: return True + def __str__(self): return self._path @@ -238,17 +243,20 @@ class BaseParser(ABC): document.stats.code_blocks = len(document.code_blocks) document.stats.tables = len(document.tables) document.stats.headings = len(document.headings) - document.stats.cross_references = len(document.internal_links) + len(document.external_links) + document.stats.cross_references = len(document.internal_links) + len( + document.external_links + ) return document def _extract_headings(self, document: Document) -> list: """Extract headings from content blocks.""" from .unified_structure import ContentBlockType + headings = [] for block in document.blocks: if block.type == ContentBlockType.HEADING: - heading_data = block.metadata.get('heading_data') + heading_data = block.metadata.get("heading_data") if heading_data: headings.append(heading_data) return headings @@ -257,22 +265,23 @@ class BaseParser(ABC): """Extract code blocks from content blocks.""" code_blocks = [] for block in document.blocks: - if block.metadata.get('code_data'): - code_blocks.append(block.metadata['code_data']) + if block.metadata.get("code_data"): + code_blocks.append(block.metadata["code_data"]) return code_blocks def _extract_tables(self, document: Document) -> list: """Extract tables from content blocks.""" tables = [] for block in document.blocks: - if block.metadata.get('table_data'): - tables.append(block.metadata['table_data']) + if block.metadata.get("table_data"): + tables.append(block.metadata["table_data"]) return tables def _create_quality_scorer(self): """Create a quality scorer if enabled.""" if self._quality_scoring: from .quality_scorer import QualityScorer + return QualityScorer() return None @@ -292,12 +301,14 @@ def get_parser_for_file(path: str | Path) -> BaseParser | None: # Try RST parser from .rst_parser import RstParser + rst_parser = RstParser() if suffix in rst_parser.supported_extensions: return rst_parser # Try Markdown parser from .markdown_parser import MarkdownParser + md_parser = MarkdownParser() if suffix in md_parser.supported_extensions: return md_parser @@ -320,11 +331,13 @@ def parse_document(source: str | Path, format_hint: str | None = None) -> ParseR """ # Use format hint if provided if format_hint: - if format_hint.lower() in ('rst', 'rest', 'restructuredtext'): + if format_hint.lower() in ("rst", "rest", "restructuredtext"): from .rst_parser import RstParser + return RstParser().parse(source) - elif format_hint.lower() in ('md', 'markdown'): + elif format_hint.lower() in ("md", "markdown"): from .markdown_parser import MarkdownParser + return MarkdownParser().parse(source) # Auto-detect from file extension @@ -336,11 +349,13 @@ def parse_document(source: str | Path, format_hint: str | None = None) -> ParseR content = source if isinstance(source, str) else Path(source).read_text() # Check for RST indicators - rst_indicators = ['.. ', '::\n', ':ref:`', '.. toctree::', '.. code-block::'] + rst_indicators = [".. ", "::\n", ":ref:`", ".. toctree::", ".. code-block::"] if any(ind in content for ind in rst_indicators): from .rst_parser import RstParser + return RstParser().parse_string(content) # Default to Markdown from .markdown_parser import MarkdownParser + return MarkdownParser().parse_string(content) diff --git a/src/skill_seekers/cli/parsers/extractors/formatters.py b/src/skill_seekers/cli/parsers/extractors/formatters.py index 5f4cc4e..5dd2261 100644 --- a/src/skill_seekers/cli/parsers/extractors/formatters.py +++ b/src/skill_seekers/cli/parsers/extractors/formatters.py @@ -7,7 +7,12 @@ Convert unified Document structure to various output formats. from typing import Any from .unified_structure import ( - Document, ContentBlock, ContentBlockType, AdmonitionType, ListType, Table + Document, + ContentBlock, + ContentBlockType, + AdmonitionType, + ListType, + Table, ) @@ -16,10 +21,10 @@ class MarkdownFormatter: def __init__(self, options: dict[str, Any] = None): self.options = options or {} - self.include_toc = self.options.get('include_toc', False) - self.max_heading_level = self.options.get('max_heading_level', 6) - self.code_block_style = self.options.get('code_block_style', 'fenced') - self.table_style = self.options.get('table_style', 'github') + self.include_toc = self.options.get("include_toc", False) + self.max_heading_level = self.options.get("max_heading_level", 6) + self.code_block_style = self.options.get("code_block_style", "fenced") + self.table_style = self.options.get("table_style", "github") def format(self, document: Document) -> str: """Convert document to markdown string.""" @@ -43,11 +48,11 @@ class MarkdownFormatter: if formatted: parts.append(formatted) - return '\n'.join(parts) + return "\n".join(parts) def _format_metadata(self, meta: dict) -> str: """Format metadata as YAML frontmatter.""" - lines = ['---'] + lines = ["---"] for key, value in meta.items(): if isinstance(value, list): lines.append(f"{key}:") @@ -55,19 +60,19 @@ class MarkdownFormatter: lines.append(f" - {item}") else: lines.append(f"{key}: {value}") - lines.append('---\n') - return '\n'.join(lines) + lines.append("---\n") + return "\n".join(lines) def _format_toc(self, headings: list) -> str: """Format table of contents.""" - lines = ['## Table of Contents\n'] + lines = ["## Table of Contents\n"] for h in headings: if h.level <= self.max_heading_level: - indent = ' ' * (h.level - 1) - anchor = h.id or h.text.lower().replace(' ', '-') + indent = " " * (h.level - 1) + anchor = h.id or h.text.lower().replace(" ", "-") lines.append(f"{indent}- [{h.text}](#{anchor})") - lines.append('') - return '\n'.join(lines) + lines.append("") + return "\n".join(lines) def _format_block(self, block: ContentBlock) -> str: """Format a single content block.""" @@ -91,16 +96,16 @@ class MarkdownFormatter: return handler(block) # Default: return content as-is - return block.content + '\n' + return block.content + "\n" def _format_heading(self, block: ContentBlock) -> str: """Format heading block.""" - heading_data = block.metadata.get('heading_data') + heading_data = block.metadata.get("heading_data") if heading_data: level = min(heading_data.level, 6) text = heading_data.text else: - level = block.metadata.get('level', 1) + level = block.metadata.get("level", 1) text = block.content if level > self.max_heading_level: @@ -110,38 +115,38 @@ class MarkdownFormatter: def _format_paragraph(self, block: ContentBlock) -> str: """Format paragraph block.""" - return block.content + '\n' + return block.content + "\n" def _format_code_block(self, block: ContentBlock) -> str: """Format code block.""" - code_data = block.metadata.get('code_data') + code_data = block.metadata.get("code_data") if code_data: code = code_data.code - lang = code_data.language or '' + lang = code_data.language or "" else: code = block.content - lang = block.metadata.get('language', '') + lang = block.metadata.get("language", "") - if self.code_block_style == 'fenced': + if self.code_block_style == "fenced": return f"```{lang}\n{code}\n```\n" else: # Indented style - indented = '\n'.join(' ' + line for line in code.split('\n')) - return indented + '\n' + indented = "\n".join(" " + line for line in code.split("\n")) + return indented + "\n" def _format_table(self, block: ContentBlock) -> str: """Format table block.""" - table_data = block.metadata.get('table_data') + table_data = block.metadata.get("table_data") if not table_data: - return '' + return "" return self._format_table_data(table_data) def _format_table_data(self, table: Table) -> str: """Format table data as markdown.""" if not table.rows: - return '' + return "" lines = [] @@ -151,92 +156,92 @@ class MarkdownFormatter: # Headers headers = table.headers or table.rows[0] - lines.append('| ' + ' | '.join(headers) + ' |') - lines.append('|' + '|'.join('---' for _ in headers) + '|') + lines.append("| " + " | ".join(headers) + " |") + lines.append("|" + "|".join("---" for _ in headers) + "|") # Rows (skip first if used as headers) start_row = 0 if table.headers else 1 for row in table.rows[start_row:]: # Pad row to match header count - padded_row = row + [''] * (len(headers) - len(row)) - lines.append('| ' + ' | '.join(padded_row[:len(headers)]) + ' |') + padded_row = row + [""] * (len(headers) - len(row)) + lines.append("| " + " | ".join(padded_row[: len(headers)]) + " |") - lines.append('') - return '\n'.join(lines) + lines.append("") + return "\n".join(lines) def _format_list(self, block: ContentBlock) -> str: """Format list block.""" - list_type = block.metadata.get('list_type', ListType.BULLET) - items = block.metadata.get('items', []) + list_type = block.metadata.get("list_type", ListType.BULLET) + items = block.metadata.get("items", []) if not items: - return block.content + '\n' + return block.content + "\n" lines = [] for i, item in enumerate(items): prefix = f"{i + 1}." if list_type == ListType.NUMBERED else "-" lines.append(f"{prefix} {item}") - lines.append('') - return '\n'.join(lines) + lines.append("") + return "\n".join(lines) def _format_image(self, block: ContentBlock) -> str: """Format image block.""" - image_data = block.metadata.get('image_data') + image_data = block.metadata.get("image_data") if image_data: src = image_data.source - alt = image_data.alt_text or '' + alt = image_data.alt_text or "" else: - src = block.metadata.get('src', '') - alt = block.metadata.get('alt', '') + src = block.metadata.get("src", "") + alt = block.metadata.get("alt", "") return f"![{alt}]({src})\n" def _format_cross_ref(self, block: ContentBlock) -> str: """Format cross-reference block.""" - xref_data = block.metadata.get('xref_data') + xref_data = block.metadata.get("xref_data") if xref_data: text = xref_data.text or xref_data.target target = xref_data.target return f"[{text}](#{target})\n" - return block.content + '\n' + return block.content + "\n" def _format_admonition(self, block: ContentBlock) -> str: """Format admonition/callout block.""" - admonition_type = block.metadata.get('admonition_type', AdmonitionType.NOTE) + admonition_type = block.metadata.get("admonition_type", AdmonitionType.NOTE) # GitHub-style admonitions type_map = { - AdmonitionType.NOTE: 'NOTE', - AdmonitionType.WARNING: 'WARNING', - AdmonitionType.TIP: 'TIP', - AdmonitionType.IMPORTANT: 'IMPORTANT', - AdmonitionType.CAUTION: 'CAUTION', + AdmonitionType.NOTE: "NOTE", + AdmonitionType.WARNING: "WARNING", + AdmonitionType.TIP: "TIP", + AdmonitionType.IMPORTANT: "IMPORTANT", + AdmonitionType.CAUTION: "CAUTION", } - type_str = type_map.get(admonition_type, 'NOTE') + type_str = type_map.get(admonition_type, "NOTE") content = block.content return f"> [!{type_str}]\n> {content.replace(chr(10), chr(10) + '> ')}\n" def _format_directive(self, block: ContentBlock) -> str: """Format directive block (RST-specific).""" - directive_name = block.metadata.get('directive_name', 'unknown') + directive_name = block.metadata.get("directive_name", "unknown") # Format as a blockquote with directive name content = block.content lines = [f"> **{directive_name}**"] - for line in content.split('\n'): + for line in content.split("\n"): lines.append(f"> {line}") - lines.append('') - return '\n'.join(lines) + lines.append("") + return "\n".join(lines) def _format_field_list(self, block: ContentBlock) -> str: """Format field list block.""" - fields = block.metadata.get('fields', []) + fields = block.metadata.get("fields", []) if not fields: - return block.content + '\n' + return block.content + "\n" lines = [] for field in fields: @@ -244,14 +249,14 @@ class MarkdownFormatter: lines.append(f"**{field.name}** (`{field.arg}`): {field.content}") else: lines.append(f"**{field.name}**: {field.content}") - lines.append('') - return '\n'.join(lines) + lines.append("") + return "\n".join(lines) def _format_definition_list(self, block: ContentBlock) -> str: """Format definition list block.""" - items = block.metadata.get('items', []) + items = block.metadata.get("items", []) if not items: - return block.content + '\n' + return block.content + "\n" lines = [] for item in items: @@ -260,12 +265,12 @@ class MarkdownFormatter: else: lines.append(f"**{item.term}**") lines.append(f": {item.definition}") - lines.append('') - return '\n'.join(lines) + lines.append("") + return "\n".join(lines) def _format_meta(self, block: ContentBlock) -> str: """Format metadata block (usually filtered out).""" - return '' # Metadata goes in YAML frontmatter + return "" # Metadata goes in YAML frontmatter class SkillFormatter: @@ -278,10 +283,7 @@ class SkillFormatter: "source_path": document.source_path, "format": document.format, "content_summary": self._extract_summary(document), - "headings": [ - {"level": h.level, "text": h.text, "id": h.id} - for h in document.headings - ], + "headings": [{"level": h.level, "text": h.text, "id": h.id} for h in document.headings], "code_samples": [ { "code": cb.code, @@ -318,7 +320,7 @@ class SkillFormatter: "headings": document.stats.headings, "cross_references": document.stats.cross_references, "processing_time_ms": document.stats.processing_time_ms, - } + }, } def _extract_summary(self, document: Document, max_length: int = 500) -> str: @@ -327,12 +329,12 @@ class SkillFormatter: for block in document.blocks: if block.type == ContentBlockType.PARAGRAPH: paragraphs.append(block.content) - if len(' '.join(paragraphs)) > max_length: + if len(" ".join(paragraphs)) > max_length: break - summary = ' '.join(paragraphs) + summary = " ".join(paragraphs) if len(summary) > max_length: - summary = summary[:max_length - 3] + '...' + summary = summary[: max_length - 3] + "..." return summary diff --git a/src/skill_seekers/cli/parsers/extractors/markdown_parser.py b/src/skill_seekers/cli/parsers/extractors/markdown_parser.py index e357569..5da8404 100644 --- a/src/skill_seekers/cli/parsers/extractors/markdown_parser.py +++ b/src/skill_seekers/cli/parsers/extractors/markdown_parser.py @@ -21,8 +21,17 @@ from typing import Any from .base_parser import BaseParser from .unified_structure import ( - Document, ContentBlock, ContentBlockType, CrossReference, CrossRefType, - AdmonitionType, Heading, CodeBlock, Table, Image, ListType + Document, + ContentBlock, + ContentBlockType, + CrossReference, + CrossRefType, + AdmonitionType, + Heading, + CodeBlock, + Table, + Image, + ListType, ) from .quality_scorer import QualityScorer @@ -36,14 +45,14 @@ class MarkdownParser(BaseParser): # Admonition types for GitHub-style callouts ADMONITION_TYPES = { - 'note': AdmonitionType.NOTE, - 'warning': AdmonitionType.WARNING, - 'tip': AdmonitionType.TIP, - 'hint': AdmonitionType.HINT, - 'important': AdmonitionType.IMPORTANT, - 'caution': AdmonitionType.CAUTION, - 'danger': AdmonitionType.DANGER, - 'attention': AdmonitionType.ATTENTION, + "note": AdmonitionType.NOTE, + "warning": AdmonitionType.WARNING, + "tip": AdmonitionType.TIP, + "hint": AdmonitionType.HINT, + "important": AdmonitionType.IMPORTANT, + "caution": AdmonitionType.CAUTION, + "danger": AdmonitionType.DANGER, + "attention": AdmonitionType.ATTENTION, } def __init__(self, options: dict[str, Any] | None = None): @@ -54,32 +63,32 @@ class MarkdownParser(BaseParser): @property def format_name(self) -> str: - return 'markdown' + return "markdown" @property def supported_extensions(self) -> list[str]: - return ['.md', '.markdown', '.mdown', '.mkd'] + return [".md", ".markdown", ".mdown", ".mkd"] def _detect_format(self, content: str) -> bool: """Detect if content is Markdown.""" md_indicators = [ - r'^#{1,6}\s+\S', # ATX headers - r'^\[.*?\]\(.*?\)', # Links - r'^```', # Code fences - r'^\|.+\|', # Tables - r'^\s*[-*+]\s+\S', # Lists - r'^>\s+\S', # Blockquotes + r"^#{1,6}\s+\S", # ATX headers + r"^\[.*?\]\(.*?\)", # Links + r"^```", # Code fences + r"^\|.+\|", # Tables + r"^\s*[-*+]\s+\S", # Lists + r"^>\s+\S", # Blockquotes ] return any(re.search(pattern, content, re.MULTILINE) for pattern in md_indicators) def _parse_content(self, content: str, source_path: str) -> Document: """Parse Markdown content into Document.""" - self._lines = content.split('\n') + self._lines = content.split("\n") self._current_line = 0 document = Document( - title='', - format='markdown', + title="", + format="markdown", source_path=source_path, ) @@ -96,12 +105,12 @@ class MarkdownParser(BaseParser): self._current_line += 1 # Extract title from first h1 or frontmatter - if document.meta.get('title'): - document.title = document.meta['title'] + if document.meta.get("title"): + document.title = document.meta["title"] else: for block in document.blocks: if block.type == ContentBlockType.HEADING: - heading_data = block.metadata.get('heading_data') + heading_data = block.metadata.get("heading_data") if heading_data and heading_data.level == 1: document.title = heading_data.text break @@ -117,13 +126,13 @@ class MarkdownParser(BaseParser): return None first_line = self._lines[self._current_line].strip() - if first_line != '---': + if first_line != "---": return None # Find closing --- end_line = None for i in range(self._current_line + 1, len(self._lines)): - if self._lines[i].strip() == '---': + if self._lines[i].strip() == "---": end_line = i break @@ -131,8 +140,8 @@ class MarkdownParser(BaseParser): return None # Extract frontmatter content - frontmatter_lines = self._lines[self._current_line + 1:end_line] - '\n'.join(frontmatter_lines) + frontmatter_lines = self._lines[self._current_line + 1 : end_line] + "\n".join(frontmatter_lines) # Simple key: value parsing (not full YAML) meta = {} @@ -145,11 +154,11 @@ class MarkdownParser(BaseParser): continue # Check for new key - match = re.match(r'^(\w+):\s*(.*)$', stripped) + match = re.match(r"^(\w+):\s*(.*)$", stripped) if match: # Save previous key if current_key: - meta[current_key] = '\n'.join(current_value).strip() + meta[current_key] = "\n".join(current_value).strip() current_key = match.group(1) value = match.group(2) @@ -157,27 +166,27 @@ class MarkdownParser(BaseParser): # Handle inline value if value: # Check if it's a list - if value.startswith('[') and value.endswith(']'): + if value.startswith("[") and value.endswith("]"): # Parse list - items = [item.strip().strip('"\'') for item in value[1:-1].split(',')] + items = [item.strip().strip("\"'") for item in value[1:-1].split(",")] meta[current_key] = items else: current_value = [value] else: current_value = [] - elif current_key and stripped.startswith('- '): + elif current_key and stripped.startswith("- "): # List item if current_key not in meta: meta[current_key] = [] if not isinstance(meta[current_key], list): meta[current_key] = [meta[current_key]] - meta[current_key].append(stripped[2:].strip().strip('"\'')) + meta[current_key].append(stripped[2:].strip().strip("\"'")) elif current_key: current_value.append(stripped) # Save last key if current_key: - meta[current_key] = '\n'.join(current_value).strip() + meta[current_key] = "\n".join(current_value).strip() # Advance past frontmatter self._current_line = end_line + 1 @@ -198,11 +207,11 @@ class MarkdownParser(BaseParser): return None # Skip HTML comments - if stripped.startswith('' in line: + if "-->" in line: break self._current_line += 1 @@ -531,16 +540,16 @@ class MarkdownParser(BaseParser): """Parse horizontal rule.""" return ContentBlock( type=ContentBlockType.RAW, - content='---', - metadata={'element': 'horizontal_rule'}, + content="---", + metadata={"element": "horizontal_rule"}, source_line=self._current_line + 1, ) def _detect_list_type(self, stripped: str) -> ListType | None: """Detect if line starts a list and which type.""" - if re.match(r'^[-*+]\s+', stripped): + if re.match(r"^[-*+]\s+", stripped): return ListType.BULLET - if re.match(r'^\d+\.\s+', stripped): + if re.match(r"^\d+\.\s+", stripped): return ListType.NUMBERED return None @@ -559,13 +568,13 @@ class MarkdownParser(BaseParser): # Check if still in list if list_type == ListType.BULLET: - match = re.match(r'^[-*+]\s+(.+)$', stripped) + match = re.match(r"^[-*+]\s+(.+)$", stripped) if not match: self._current_line -= 1 break items.append(match.group(1)) else: # NUMBERED - match = re.match(r'^\d+\.\s+(.+)$', stripped) + match = re.match(r"^\d+\.\s+(.+)$", stripped) if not match: self._current_line -= 1 break @@ -577,8 +586,8 @@ class MarkdownParser(BaseParser): type=ContentBlockType.LIST, content=f"{len(items)} items", metadata={ - 'list_type': list_type, - 'items': items, + "list_type": list_type, + "items": items, }, source_line=start_line + 1, ) @@ -597,15 +606,15 @@ class MarkdownParser(BaseParser): break # Check for block-level elements - if stripped.startswith('#'): + if stripped.startswith("#"): break - if stripped.startswith('```'): + if stripped.startswith("```"): break - if stripped.startswith('>'): + if stripped.startswith(">"): break - if stripped.startswith('---') or stripped.startswith('***'): + if stripped.startswith("---") or stripped.startswith("***"): break - if stripped.startswith('|') and self._is_table(self._current_line): + if stripped.startswith("|") and self._is_table(self._current_line): break if self._detect_list_type(stripped): break @@ -615,7 +624,7 @@ class MarkdownParser(BaseParser): lines.append(stripped) self._current_line += 1 - content = ' '.join(lines) + content = " ".join(lines) # Process inline elements content = self._process_inline(content) @@ -629,60 +638,60 @@ class MarkdownParser(BaseParser): def _process_inline(self, text: str) -> str: """Process inline Markdown elements.""" # Links [text](url) - text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'[\1](\2)', text) + text = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"[\1](\2)", text) # Images ![alt](url) - text = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', r'![\1](\2)', text) + text = re.sub(r"!\[([^\]]*)\]\(([^)]+)\)", r"![\1](\2)", text) # Code `code` - text = re.sub(r'`([^`]+)`', r'`\1`', text) + text = re.sub(r"`([^`]+)`", r"`\1`", text) # Bold **text** or __text__ - text = re.sub(r'\*\*([^*]+)\*\*', r'**\1**', text) - text = re.sub(r'__([^_]+)__', r'**\1**', text) + text = re.sub(r"\*\*([^*]+)\*\*", r"**\1**", text) + text = re.sub(r"__([^_]+)__", r"**\1**", text) # Italic *text* or _text_ - text = re.sub(r'(? str: """Create URL anchor from heading text.""" anchor = text.lower() - anchor = re.sub(r'[^\w\s-]', '', anchor) - anchor = anchor.replace(' ', '-') - anchor = re.sub(r'-+', '-', anchor) - return anchor.strip('-') + anchor = re.sub(r"[^\w\s-]", "", anchor) + anchor = anchor.replace(" ", "-") + anchor = re.sub(r"-+", "-", anchor) + return anchor.strip("-") def _extract_specialized_content(self, document: Document): """Extract specialized content lists from blocks.""" for block in document.blocks: # Extract headings if block.type == ContentBlockType.HEADING: - heading_data = block.metadata.get('heading_data') + heading_data = block.metadata.get("heading_data") if heading_data: document.headings.append(heading_data) # Extract code blocks elif block.type == ContentBlockType.CODE_BLOCK: - code_data = block.metadata.get('code_data') + code_data = block.metadata.get("code_data") if code_data: document.code_blocks.append(code_data) # Extract tables elif block.type == ContentBlockType.TABLE: - table_data = block.metadata.get('table_data') + table_data = block.metadata.get("table_data") if table_data: document.tables.append(table_data) # Extract images from paragraphs (simplified) elif block.type == ContentBlockType.PARAGRAPH: content = block.content - img_matches = re.findall(r'!\[([^\]]*)\]\(([^)]+)\)', content) + img_matches = re.findall(r"!\[([^\]]*)\]\(([^)]+)\)", content) for alt, src in img_matches: image = Image( source=src, @@ -692,12 +701,12 @@ class MarkdownParser(BaseParser): document.images.append(image) # Extract links - link_matches = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content) + link_matches = re.findall(r"\[([^\]]+)\]\(([^)]+)\)", content) for text, url in link_matches: # Determine if internal or external - if url.startswith('#'): + if url.startswith("#"): ref_type = CrossRefType.INTERNAL - elif url.startswith('http'): + elif url.startswith("http"): ref_type = CrossRefType.EXTERNAL else: ref_type = CrossRefType.INTERNAL diff --git a/src/skill_seekers/cli/parsers/extractors/pdf_parser.py b/src/skill_seekers/cli/parsers/extractors/pdf_parser.py index 4490b51..4b877d6 100644 --- a/src/skill_seekers/cli/parsers/extractors/pdf_parser.py +++ b/src/skill_seekers/cli/parsers/extractors/pdf_parser.py @@ -25,6 +25,7 @@ try: except ImportError: # Fallback for relative import import sys + sys.path.insert(0, str(Path(__file__).parent.parent)) from pdf_extractor_poc import PDFExtractor @@ -75,9 +76,7 @@ class PdfParser(BaseParser): This method is mainly for API compatibility. """ # For PDF, we need to use parse_file - raise NotImplementedError( - "PDF parsing requires file path. Use parse_file() instead." - ) + raise NotImplementedError("PDF parsing requires file path. Use parse_file() instead.") def parse_file(self, path: str | Path) -> ParseResult: """ diff --git a/src/skill_seekers/cli/parsers/extractors/quality_scorer.py b/src/skill_seekers/cli/parsers/extractors/quality_scorer.py index 4c377d6..9602644 100644 --- a/src/skill_seekers/cli/parsers/extractors/quality_scorer.py +++ b/src/skill_seekers/cli/parsers/extractors/quality_scorer.py @@ -17,107 +17,133 @@ class QualityScorer: # Language patterns for detection and validation LANGUAGE_PATTERNS = { - 'python': { - 'keywords': ['def ', 'class ', 'import ', 'from ', 'return ', 'if ', 'for ', 'while'], - 'syntax_checks': [ - (r':\s*$', 'colon_ending'), # Python uses colons for blocks - (r'def\s+\w+\s*\([^)]*\)\s*:', 'function_def'), - (r'class\s+\w+', 'class_def'), + "python": { + "keywords": ["def ", "class ", "import ", "from ", "return ", "if ", "for ", "while"], + "syntax_checks": [ + (r":\s*$", "colon_ending"), # Python uses colons for blocks + (r"def\s+\w+\s*\([^)]*\)\s*:", "function_def"), + (r"class\s+\w+", "class_def"), ], }, - 'javascript': { - 'keywords': ['function', 'const ', 'let ', 'var ', '=>', 'return ', 'if(', 'for('], - 'syntax_checks': [ - (r'function\s+\w+\s*\(', 'function_def'), - (r'const\s+\w+\s*=', 'const_decl'), - (r'=>', 'arrow_function'), + "javascript": { + "keywords": ["function", "const ", "let ", "var ", "=>", "return ", "if(", "for("], + "syntax_checks": [ + (r"function\s+\w+\s*\(", "function_def"), + (r"const\s+\w+\s*=", "const_decl"), + (r"=>", "arrow_function"), ], }, - 'typescript': { - 'keywords': ['interface ', 'type ', ': string', ': number', ': boolean', 'implements'], - 'syntax_checks': [ - (r'interface\s+\w+', 'interface_def'), - (r':\s*(string|number|boolean|any)', 'type_annotation'), + "typescript": { + "keywords": ["interface ", "type ", ": string", ": number", ": boolean", "implements"], + "syntax_checks": [ + (r"interface\s+\w+", "interface_def"), + (r":\s*(string|number|boolean|any)", "type_annotation"), ], }, - 'java': { - 'keywords': ['public ', 'private ', 'class ', 'void ', 'String ', 'int ', 'return '], - 'syntax_checks': [ - (r'public\s+class\s+\w+', 'class_def'), - (r'public\s+\w+\s+\w+\s*\(', 'method_def'), + "java": { + "keywords": ["public ", "private ", "class ", "void ", "String ", "int ", "return "], + "syntax_checks": [ + (r"public\s+class\s+\w+", "class_def"), + (r"public\s+\w+\s+\w+\s*\(", "method_def"), ], }, - 'cpp': { - 'keywords': ['#include', 'using namespace', 'std::', 'cout', 'cin', 'public:', 'private:'], - 'syntax_checks': [ - (r'#include\s*[<"]', 'include'), - (r'std::', 'std_namespace'), + "cpp": { + "keywords": [ + "#include", + "using namespace", + "std::", + "cout", + "cin", + "public:", + "private:", + ], + "syntax_checks": [ + (r'#include\s*[<"]', "include"), + (r"std::", "std_namespace"), ], }, - 'csharp': { - 'keywords': ['namespace ', 'public class', 'private ', 'void ', 'string ', 'int '], - 'syntax_checks': [ - (r'namespace\s+\w+', 'namespace'), - (r'public\s+class\s+\w+', 'class_def'), + "csharp": { + "keywords": ["namespace ", "public class", "private ", "void ", "string ", "int "], + "syntax_checks": [ + (r"namespace\s+\w+", "namespace"), + (r"public\s+class\s+\w+", "class_def"), ], }, - 'go': { - 'keywords': ['package ', 'func ', 'import ', 'return ', 'if ', 'for ', 'range '], - 'syntax_checks': [ - (r'func\s+\w+\s*\(', 'function_def'), - (r'package\s+\w+', 'package_decl'), + "go": { + "keywords": ["package ", "func ", "import ", "return ", "if ", "for ", "range "], + "syntax_checks": [ + (r"func\s+\w+\s*\(", "function_def"), + (r"package\s+\w+", "package_decl"), ], }, - 'rust': { - 'keywords': ['fn ', 'let ', 'mut ', 'impl ', 'struct ', 'enum ', 'match ', 'use '], - 'syntax_checks': [ - (r'fn\s+\w+\s*\(', 'function_def'), - (r'impl\s+\w+', 'impl_block'), + "rust": { + "keywords": ["fn ", "let ", "mut ", "impl ", "struct ", "enum ", "match ", "use "], + "syntax_checks": [ + (r"fn\s+\w+\s*\(", "function_def"), + (r"impl\s+\w+", "impl_block"), ], }, - 'gdscript': { # Godot - 'keywords': ['extends ', 'class_name ', 'func ', 'var ', 'const ', 'signal ', 'export', 'onready'], - 'syntax_checks': [ - (r'extends\s+\w+', 'extends'), - (r'func\s+_\w+', 'built_in_method'), - (r'signal\s+\w+', 'signal_def'), - (r'@export', 'export_annotation'), + "gdscript": { # Godot + "keywords": [ + "extends ", + "class_name ", + "func ", + "var ", + "const ", + "signal ", + "export", + "onready", + ], + "syntax_checks": [ + (r"extends\s+\w+", "extends"), + (r"func\s+_\w+", "built_in_method"), + (r"signal\s+\w+", "signal_def"), + (r"@export", "export_annotation"), ], }, - 'yaml': { - 'keywords': [], - 'syntax_checks': [ - (r'^\w+:\s*', 'key_value'), - (r'^-\s+\w+', 'list_item'), + "yaml": { + "keywords": [], + "syntax_checks": [ + (r"^\w+:\s*", "key_value"), + (r"^-\s+\w+", "list_item"), ], }, - 'json': { - 'keywords': [], - 'syntax_checks': [ - (r'["\']\w+["\']\s*:', 'key_value'), - (r'\{[^}]*\}', 'object'), - (r'\[[^\]]*\]', 'array'), + "json": { + "keywords": [], + "syntax_checks": [ + (r'["\']\w+["\']\s*:', "key_value"), + (r"\{[^}]*\}", "object"), + (r"\[[^\]]*\]", "array"), ], }, - 'xml': { - 'keywords': [], - 'syntax_checks': [ - (r'<\w+[^>]*>', 'opening_tag'), - (r'', 'closing_tag'), + "xml": { + "keywords": [], + "syntax_checks": [ + (r"<\w+[^>]*>", "opening_tag"), + (r"", "closing_tag"), ], }, - 'sql': { - 'keywords': ['SELECT', 'FROM', 'WHERE', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'TABLE'], - 'syntax_checks': [ - (r'SELECT\s+.+\s+FROM', 'select_statement'), - (r'CREATE\s+TABLE', 'create_table'), + "sql": { + "keywords": [ + "SELECT", + "FROM", + "WHERE", + "INSERT", + "UPDATE", + "DELETE", + "CREATE", + "TABLE", + ], + "syntax_checks": [ + (r"SELECT\s+.+\s+FROM", "select_statement"), + (r"CREATE\s+TABLE", "create_table"), ], }, - 'bash': { - 'keywords': ['#!/bin/', 'echo ', 'if [', 'then', 'fi', 'for ', 'do', 'done'], - 'syntax_checks': [ - (r'#!/bin/\w+', 'shebang'), - (r'\$\w+', 'variable'), + "bash": { + "keywords": ["#!/bin/", "echo ", "if [", "then", "fi", "for ", "do", "done"], + "syntax_checks": [ + (r"#!/bin/\w+", "shebang"), + (r"\$\w+", "variable"), ], }, } @@ -139,7 +165,7 @@ class QualityScorer: return 0.0 code = code.strip() - lines = [line for line in code.split('\n') if line.strip()] + lines = [line for line in code.split("\n") if line.strip()] # Factor 1: Length appropriateness code_len = len(code) @@ -161,13 +187,14 @@ class QualityScorer: lang_patterns = self.LANGUAGE_PATTERNS[language] # Check for keywords - keyword_matches = sum(1 for kw in lang_patterns['keywords'] if kw in code) + keyword_matches = sum(1 for kw in lang_patterns["keywords"] if kw in code) if keyword_matches >= 2: score += 1.0 # Check for syntax patterns syntax_matches = sum( - 1 for pattern, _ in lang_patterns['syntax_checks'] + 1 + for pattern, _ in lang_patterns["syntax_checks"] if re.search(pattern, code, re.MULTILINE) ) if syntax_matches >= 1: @@ -175,11 +202,11 @@ class QualityScorer: # Factor 4: Structural quality # Check for function/class definitions - if re.search(r'\b(def|function|func|fn|class|public class)\b', code): + if re.search(r"\b(def|function|func|fn|class|public class)\b", code): score += 1.5 # Check for meaningful variable names (not just x, y, i) - meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower()) + meaningful_vars = re.findall(r"\b[a-z_][a-z0-9_]{3,}\b", code.lower()) if len(meaningful_vars) >= 3: score += 0.5 @@ -192,8 +219,7 @@ class QualityScorer: # Factor 6: Comment/code ratio comment_lines = sum( - 1 for line in lines - if line.strip().startswith(('#', '//', '/*', '*', '--', '