From 0878ad3ef68808cb7def165adadc4dd8780589c6 Mon Sep 17 00:00:00 2001 From: yusyus Date: Wed, 18 Feb 2026 22:44:41 +0300 Subject: [PATCH] fix: resolve all ruff linting errors (W293, F401, B904, UP007, UP045, E741, SIM102, SIM117, ARG) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-fixed (whitespace, imports, type annotations): - codebase_scraper.py: W293 blank lines with whitespace - doc_scraper.py: W293 blank lines with whitespace - parsers/extractors/__init__.py: W293 - parsers/extractors/base_parser.py: W293, UP007, UP045, F401 Manual fixes: - enhancement_workflow.py: B904 raise without `from exc`, remove unused `os` import - parsers/extractors/quality_scorer.py: E741 ambiguous var `l` → `line` - parsers/extractors/rst_parser.py: SIM102 nested if → combined conditions (x2) - pdf_scraper.py: F821 undefined `logger` → `print()` (consistent with file style) - mcp/tools/workflow_tools.py: ARG001 unused `args` → `_args` - tests/test_workflow_runner.py: ARG005 unused lambda args → `_a`/`_kw`, ARG001 `kwargs` → `_kwargs` - tests/test_workflows_command.py: SIM117 nested with → combined with (x2) All 1922 tests pass. Co-Authored-By: Claude Sonnet 4.5 --- src/skill_seekers/cli/codebase_scraper.py | 10 +- src/skill_seekers/cli/doc_scraper.py | 8 +- src/skill_seekers/cli/enhancement_workflow.py | 5 +- .../cli/parsers/extractors/__init__.py | 10 +- .../cli/parsers/extractors/base_parser.py | 132 +++--- .../cli/parsers/extractors/formatters.py | 114 +++--- .../cli/parsers/extractors/markdown_parser.py | 282 +++++++------ .../cli/parsers/extractors/pdf_parser.py | 27 +- .../cli/parsers/extractors/quality_scorer.py | 112 +++--- .../cli/parsers/extractors/rst_parser.py | 379 +++++++++--------- .../parsers/extractors/unified_structure.py | 128 +++--- src/skill_seekers/cli/pdf_scraper.py | 14 +- src/skill_seekers/cli/unified_enhancer.py | 7 +- src/skill_seekers/cli/unified_scraper.py | 2 +- src/skill_seekers/mcp/tools/workflow_tools.py | 2 +- tests/test_create_integration_basic.py | 44 +- tests/test_unified_parsers.py | 4 +- tests/test_workflow_runner.py | 17 +- tests/test_workflow_tools_mcp.py | 3 +- tests/test_workflows_command.py | 52 +-- 20 files changed, 657 insertions(+), 695 deletions(-) diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index 0643cfe..36f2e7c 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -444,7 +444,7 @@ def extract_markdown_structure(content: str) -> dict[str, Any]: def extract_rst_structure(content: str) -> dict[str, Any]: """ Extract structure from ReStructuredText (RST) content. - + Uses the enhanced unified RST parser for comprehensive extraction. RST uses underline-style headers: @@ -474,13 +474,13 @@ def extract_rst_structure(content: str) -> dict[str, Any]: # Use the enhanced unified RST parser try: from skill_seekers.cli.parsers.extractors import RstParser - + parser = RstParser() result = parser.parse_string(content, "") - + if result.success and result.document: doc = result.document - + # Convert to legacy structure format for backward compatibility structure = { "title": doc.title, @@ -531,7 +531,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]: except Exception as e: # Fall back to basic extraction if unified parser fails logger.warning(f"Enhanced RST parser failed: {e}, using basic parser") - + # Legacy basic extraction (fallback) import re diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index b50adea..83caace 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -401,13 +401,13 @@ class DocToSkillConverter: # Try enhanced unified parser first try: from skill_seekers.cli.parsers.extractors import MarkdownParser - + parser = MarkdownParser() result = parser.parse_string(content, url) - + if result.success and result.document: doc = result.document - + # Extract links from the document links = [] for link in doc.external_links: @@ -421,7 +421,7 @@ class DocToSkillConverter: full_url = full_url.split("#")[0] if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links: links.append(full_url) - + return { "url": url, "title": doc.title or "", diff --git a/src/skill_seekers/cli/enhancement_workflow.py b/src/skill_seekers/cli/enhancement_workflow.py index b535217..85886e3 100644 --- a/src/skill_seekers/cli/enhancement_workflow.py +++ b/src/skill_seekers/cli/enhancement_workflow.py @@ -24,7 +24,6 @@ Usage: import json import logging -import os from dataclasses import dataclass, field from datetime import datetime from importlib.resources import files as importlib_files @@ -145,11 +144,11 @@ class WorkflowEngine: pkg_ref = importlib_files("skill_seekers.workflows").joinpath(bare_name) yaml_text = pkg_ref.read_text(encoding="utf-8") logger.info(f"📋 Loading bundled workflow: {bare_name}") - except (FileNotFoundError, TypeError, ModuleNotFoundError): + except (FileNotFoundError, TypeError, ModuleNotFoundError) as exc: raise FileNotFoundError( f"Workflow '{yaml_ref.stem}' not found. " "Use 'skill-seekers workflows list' to see available workflows." - ) + ) from exc if resolved_path is not None: logger.info(f"📋 Loading workflow: {resolved_path}") diff --git a/src/skill_seekers/cli/parsers/extractors/__init__.py b/src/skill_seekers/cli/parsers/extractors/__init__.py index 575d5a3..12a01df 100644 --- a/src/skill_seekers/cli/parsers/extractors/__init__.py +++ b/src/skill_seekers/cli/parsers/extractors/__init__.py @@ -6,20 +6,20 @@ a standardized Document structure. Usage: from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser - + # Parse RST file parser = RstParser() result = parser.parse_file("docs/class_node.rst") - + if result.success: doc = result.document print(f"Title: {doc.title}") print(f"Tables: {len(doc.tables)}") print(f"Code blocks: {len(doc.code_blocks)}") - + # Convert to markdown markdown = doc.to_markdown() - + # Convert to skill format skill_data = doc.to_skill_format() @@ -29,7 +29,7 @@ Available Parsers: Auto-Detection: from skill_seekers.cli.parsers.extractors import parse_document - + # Automatically detects format result = parse_document("file.rst") """ diff --git a/src/skill_seekers/cli/parsers/extractors/base_parser.py b/src/skill_seekers/cli/parsers/extractors/base_parser.py index 7eb1237..362aa8d 100644 --- a/src/skill_seekers/cli/parsers/extractors/base_parser.py +++ b/src/skill_seekers/cli/parsers/extractors/base_parser.py @@ -8,11 +8,11 @@ and implement the same interface for consistent usage. from abc import ABC, abstractmethod from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Optional, Union +from typing import Any import time import logging -from .unified_structure import Document, ExtractionStats +from .unified_structure import Document logger = logging.getLogger(__name__) @@ -20,11 +20,11 @@ logger = logging.getLogger(__name__) @dataclass class ParseResult: """Result of parsing a document.""" - document: Optional[Document] = None + document: Document | None = None success: bool = False errors: list[str] = field(default_factory=list) warnings: list[str] = field(default_factory=list) - + @property def is_ok(self) -> bool: """Check if parsing succeeded.""" @@ -34,18 +34,18 @@ class ParseResult: class BaseParser(ABC): """ Abstract base class for all document parsers. - + Implementations: - RstParser: ReStructuredText documents - MarkdownParser: Markdown documents - PdfParser: PDF documents - HtmlParser: HTML documents (future) """ - - def __init__(self, options: Optional[dict[str, Any]] = None): + + def __init__(self, options: dict[str, Any] | None = None): """ Initialize parser with options. - + Args: options: Parser-specific options Common options: @@ -61,26 +61,26 @@ class BaseParser(ABC): self._quality_scoring = self.options.get('quality_scoring', True) self._max_file_size = self.options.get('max_file_size_mb', 50.0) * 1024 * 1024 self._encoding = self.options.get('encoding', 'utf-8') - + @property @abstractmethod def format_name(self) -> str: """Return the format name this parser handles.""" pass - + @property @abstractmethod def supported_extensions(self) -> list[str]: """Return list of supported file extensions.""" pass - - def can_parse(self, source: Union[str, Path]) -> bool: + + def can_parse(self, source: str | Path) -> bool: """ Check if this parser can handle the given source. - + Args: source: File path or content string - + Returns: True if this parser can handle the source """ @@ -95,58 +95,58 @@ class BaseParser(ABC): except Exception: return False return False - - def parse(self, source: Union[str, Path]) -> ParseResult: + + def parse(self, source: str | Path) -> ParseResult: """ Parse a document from file path or content string. - + Args: source: File path (str/Path) or content string - + Returns: ParseResult with document or error info """ start_time = time.time() result = ParseResult() - + try: # Read source content, source_path = self._read_source_with_path(source) - + # Check file size if len(content.encode(self._encoding)) > self._max_file_size: result.errors.append(f"File too large: {source_path}") return result - + # Validate format if not self._detect_format(content): result.warnings.append(f"Content may not be valid {self.format_name}") - + # Parse content document = self._parse_content(content, source_path) - + # Post-process document = self._post_process(document) - + # Record stats processing_time = (time.time() - start_time) * 1000 if document.stats: document.stats.processing_time_ms = processing_time - + result.document = document result.success = True result.warnings.extend(document.stats.warnings) - + except Exception as e: result.errors.append(f"Parse error: {str(e)}") logger.exception(f"Error parsing {source}") - + return result - - def parse_file(self, path: Union[str, Path]) -> ParseResult: + + def parse_file(self, path: str | Path) -> ParseResult: """Parse a file from path.""" return self.parse(path) - + def parse_string(self, content: str, source_path: str = "") -> ParseResult: """Parse content from string.""" # Create a wrapper that looks like a path @@ -160,46 +160,46 @@ class BaseParser(ABC): return True def __str__(self): return self._path - + source = StringSource(content, source_path) result = self.parse(source) if result.document: result.document.source_path = source_path return result - + @abstractmethod def _parse_content(self, content: str, source_path: str) -> Document: """ Parse content string into Document. - + Args: content: Raw content to parse source_path: Original source path (for reference) - + Returns: Parsed Document """ pass - + @abstractmethod def _detect_format(self, content: str) -> bool: """ Detect if content matches this parser's format. - + Args: content: Content to check - + Returns: True if content appears to be this format """ pass - - def _read_source(self, source: Union[str, Path]) -> str: + + def _read_source(self, source: str | Path) -> str: """Read content from source.""" content, _ = self._read_source_with_path(source) return content - - def _read_source_with_path(self, source: Union[str, Path]) -> tuple[str, str]: + + def _read_source_with_path(self, source: str | Path) -> tuple[str, str]: """Read content and return with path.""" if isinstance(source, str): # Check if it's a path or content @@ -214,37 +214,37 @@ class BaseParser(ABC): else: # Assume it's a file-like object return source.read_text(encoding=self._encoding), str(source) - + def _post_process(self, document: Document) -> Document: """ Post-process document after parsing. - + Override to add cross-references, validate, etc. """ # Build heading list from blocks if not document.headings: document.headings = self._extract_headings(document) - + # Extract code blocks from blocks if not document.code_blocks: document.code_blocks = self._extract_code_blocks(document) - + # Extract tables from blocks if not document.tables: document.tables = self._extract_tables(document) - + # Update stats document.stats.total_blocks = len(document.blocks) document.stats.code_blocks = len(document.code_blocks) document.stats.tables = len(document.tables) document.stats.headings = len(document.headings) document.stats.cross_references = len(document.internal_links) + len(document.external_links) - + return document - + def _extract_headings(self, document: Document) -> list: """Extract headings from content blocks.""" - from .unified_structure import ContentBlockType, Heading + from .unified_structure import ContentBlockType headings = [] for block in document.blocks: if block.type == ContentBlockType.HEADING: @@ -252,7 +252,7 @@ class BaseParser(ABC): if heading_data: headings.append(heading_data) return headings - + def _extract_code_blocks(self, document: Document) -> list: """Extract code blocks from content blocks.""" code_blocks = [] @@ -260,7 +260,7 @@ class BaseParser(ABC): if block.metadata.get('code_data'): code_blocks.append(block.metadata['code_data']) return code_blocks - + def _extract_tables(self, document: Document) -> list: """Extract tables from content blocks.""" tables = [] @@ -268,7 +268,7 @@ class BaseParser(ABC): if block.metadata.get('table_data'): tables.append(block.metadata['table_data']) return tables - + def _create_quality_scorer(self): """Create a quality scorer if enabled.""" if self._quality_scoring: @@ -277,44 +277,44 @@ class BaseParser(ABC): return None -def get_parser_for_file(path: Union[str, Path]) -> Optional[BaseParser]: +def get_parser_for_file(path: str | Path) -> BaseParser | None: """ Get the appropriate parser for a file. - + Args: path: File path - + Returns: Appropriate parser instance or None """ path = Path(path) suffix = path.suffix.lower() - + # Try RST parser from .rst_parser import RstParser rst_parser = RstParser() if suffix in rst_parser.supported_extensions: return rst_parser - + # Try Markdown parser from .markdown_parser import MarkdownParser md_parser = MarkdownParser() if suffix in md_parser.supported_extensions: return md_parser - + # Could add PDF, HTML parsers here - + return None -def parse_document(source: Union[str, Path], format_hint: Optional[str] = None) -> ParseResult: +def parse_document(source: str | Path, format_hint: str | None = None) -> ParseResult: """ Parse a document, auto-detecting the format. - + Args: source: File path or content string format_hint: Optional format hint ('rst', 'markdown', etc.) - + Returns: ParseResult """ @@ -326,21 +326,21 @@ def parse_document(source: Union[str, Path], format_hint: Optional[str] = None) elif format_hint.lower() in ('md', 'markdown'): from .markdown_parser import MarkdownParser return MarkdownParser().parse(source) - + # Auto-detect from file extension parser = get_parser_for_file(source) if parser: return parser.parse(source) - + # Try content-based detection content = source if isinstance(source, str) else Path(source).read_text() - + # Check for RST indicators rst_indicators = ['.. ', '::\n', ':ref:`', '.. toctree::', '.. code-block::'] if any(ind in content for ind in rst_indicators): from .rst_parser import RstParser return RstParser().parse_string(content) - + # Default to Markdown from .markdown_parser import MarkdownParser return MarkdownParser().parse_string(content) diff --git a/src/skill_seekers/cli/parsers/extractors/formatters.py b/src/skill_seekers/cli/parsers/extractors/formatters.py index db92f5f..5f4cc4e 100644 --- a/src/skill_seekers/cli/parsers/extractors/formatters.py +++ b/src/skill_seekers/cli/parsers/extractors/formatters.py @@ -7,45 +7,44 @@ Convert unified Document structure to various output formats. from typing import Any from .unified_structure import ( - Document, ContentBlock, ContentBlockType, CrossRefType, - AdmonitionType, ListType, Table, CodeBlock + Document, ContentBlock, ContentBlockType, AdmonitionType, ListType, Table ) class MarkdownFormatter: """Format Document as Markdown.""" - + def __init__(self, options: dict[str, Any] = None): self.options = options or {} self.include_toc = self.options.get('include_toc', False) self.max_heading_level = self.options.get('max_heading_level', 6) self.code_block_style = self.options.get('code_block_style', 'fenced') self.table_style = self.options.get('table_style', 'github') - + def format(self, document: Document) -> str: """Convert document to markdown string.""" parts = [] - + # Title if document.title: parts.append(f"# {document.title}\n") - + # Metadata as YAML frontmatter if document.meta: parts.append(self._format_metadata(document.meta)) - + # Table of contents if self.include_toc and document.headings: parts.append(self._format_toc(document.headings)) - + # Content blocks for block in document.blocks: formatted = self._format_block(block) if formatted: parts.append(formatted) - + return '\n'.join(parts) - + def _format_metadata(self, meta: dict) -> str: """Format metadata as YAML frontmatter.""" lines = ['---'] @@ -58,7 +57,7 @@ class MarkdownFormatter: lines.append(f"{key}: {value}") lines.append('---\n') return '\n'.join(lines) - + def _format_toc(self, headings: list) -> str: """Format table of contents.""" lines = ['## Table of Contents\n'] @@ -69,7 +68,7 @@ class MarkdownFormatter: lines.append(f"{indent}- [{h.text}](#{anchor})") lines.append('') return '\n'.join(lines) - + def _format_block(self, block: ContentBlock) -> str: """Format a single content block.""" handlers = { @@ -86,14 +85,14 @@ class MarkdownFormatter: ContentBlockType.DEFINITION_LIST: self._format_definition_list, ContentBlockType.META: self._format_meta, } - + handler = handlers.get(block.type) if handler: return handler(block) - + # Default: return content as-is return block.content + '\n' - + def _format_heading(self, block: ContentBlock) -> str: """Format heading block.""" heading_data = block.metadata.get('heading_data') @@ -103,87 +102,84 @@ class MarkdownFormatter: else: level = block.metadata.get('level', 1) text = block.content - + if level > self.max_heading_level: return f"**{text}**\n" - + return f"{'#' * level} {text}\n" - + def _format_paragraph(self, block: ContentBlock) -> str: """Format paragraph block.""" return block.content + '\n' - + def _format_code_block(self, block: ContentBlock) -> str: """Format code block.""" code_data = block.metadata.get('code_data') - + if code_data: code = code_data.code lang = code_data.language or '' else: code = block.content lang = block.metadata.get('language', '') - + if self.code_block_style == 'fenced': return f"```{lang}\n{code}\n```\n" else: # Indented style indented = '\n'.join(' ' + line for line in code.split('\n')) return indented + '\n' - + def _format_table(self, block: ContentBlock) -> str: """Format table block.""" table_data = block.metadata.get('table_data') if not table_data: return '' - + return self._format_table_data(table_data) - + def _format_table_data(self, table: Table) -> str: """Format table data as markdown.""" if not table.rows: return '' - + lines = [] - + # Caption if table.caption: lines.append(f"**{table.caption}**\n") - + # Headers headers = table.headers or table.rows[0] lines.append('| ' + ' | '.join(headers) + ' |') lines.append('|' + '|'.join('---' for _ in headers) + '|') - + # Rows (skip first if used as headers) start_row = 0 if table.headers else 1 for row in table.rows[start_row:]: # Pad row to match header count padded_row = row + [''] * (len(headers) - len(row)) lines.append('| ' + ' | '.join(padded_row[:len(headers)]) + ' |') - + lines.append('') return '\n'.join(lines) - + def _format_list(self, block: ContentBlock) -> str: """Format list block.""" list_type = block.metadata.get('list_type', ListType.BULLET) items = block.metadata.get('items', []) - + if not items: return block.content + '\n' - + lines = [] for i, item in enumerate(items): - if list_type == ListType.NUMBERED: - prefix = f"{i + 1}." - else: - prefix = "-" + prefix = f"{i + 1}." if list_type == ListType.NUMBERED else "-" lines.append(f"{prefix} {item}") - + lines.append('') return '\n'.join(lines) - + def _format_image(self, block: ContentBlock) -> str: """Format image block.""" image_data = block.metadata.get('image_data') @@ -193,9 +189,9 @@ class MarkdownFormatter: else: src = block.metadata.get('src', '') alt = block.metadata.get('alt', '') - + return f"![{alt}]({src})\n" - + def _format_cross_ref(self, block: ContentBlock) -> str: """Format cross-reference block.""" xref_data = block.metadata.get('xref_data') @@ -203,13 +199,13 @@ class MarkdownFormatter: text = xref_data.text or xref_data.target target = xref_data.target return f"[{text}](#{target})\n" - + return block.content + '\n' - + def _format_admonition(self, block: ContentBlock) -> str: """Format admonition/callout block.""" admonition_type = block.metadata.get('admonition_type', AdmonitionType.NOTE) - + # GitHub-style admonitions type_map = { AdmonitionType.NOTE: 'NOTE', @@ -218,16 +214,16 @@ class MarkdownFormatter: AdmonitionType.IMPORTANT: 'IMPORTANT', AdmonitionType.CAUTION: 'CAUTION', } - + type_str = type_map.get(admonition_type, 'NOTE') content = block.content - + return f"> [!{type_str}]\n> {content.replace(chr(10), chr(10) + '> ')}\n" - + def _format_directive(self, block: ContentBlock) -> str: """Format directive block (RST-specific).""" directive_name = block.metadata.get('directive_name', 'unknown') - + # Format as a blockquote with directive name content = block.content lines = [f"> **{directive_name}**"] @@ -235,13 +231,13 @@ class MarkdownFormatter: lines.append(f"> {line}") lines.append('') return '\n'.join(lines) - + def _format_field_list(self, block: ContentBlock) -> str: """Format field list block.""" fields = block.metadata.get('fields', []) if not fields: return block.content + '\n' - + lines = [] for field in fields: if field.arg: @@ -250,13 +246,13 @@ class MarkdownFormatter: lines.append(f"**{field.name}**: {field.content}") lines.append('') return '\n'.join(lines) - + def _format_definition_list(self, block: ContentBlock) -> str: """Format definition list block.""" items = block.metadata.get('items', []) if not items: return block.content + '\n' - + lines = [] for item in items: if item.classifier: @@ -266,7 +262,7 @@ class MarkdownFormatter: lines.append(f": {item.definition}") lines.append('') return '\n'.join(lines) - + def _format_meta(self, block: ContentBlock) -> str: """Format metadata block (usually filtered out).""" return '' # Metadata goes in YAML frontmatter @@ -274,7 +270,7 @@ class MarkdownFormatter: class SkillFormatter: """Format Document for skill-seekers internal use.""" - + def format(self, document: Document) -> dict[str, Any]: """Format document for skill output.""" return { @@ -324,7 +320,7 @@ class SkillFormatter: "processing_time_ms": document.stats.processing_time_ms, } } - + def _extract_summary(self, document: Document, max_length: int = 500) -> str: """Extract a text summary from the document.""" paragraphs = [] @@ -333,22 +329,22 @@ class SkillFormatter: paragraphs.append(block.content) if len(' '.join(paragraphs)) > max_length: break - + summary = ' '.join(paragraphs) if len(summary) > max_length: summary = summary[:max_length - 3] + '...' - + return summary - + def _score_table(self, table: Table) -> float: """Quick table quality score.""" if not table.rows: return 0.0 - + score = 5.0 if table.headers: score += 2.0 if 2 <= len(table.rows) <= 50: score += 1.0 - + return min(10.0, score) diff --git a/src/skill_seekers/cli/parsers/extractors/markdown_parser.py b/src/skill_seekers/cli/parsers/extractors/markdown_parser.py index 4d68d47..e357569 100644 --- a/src/skill_seekers/cli/parsers/extractors/markdown_parser.py +++ b/src/skill_seekers/cli/parsers/extractors/markdown_parser.py @@ -17,13 +17,12 @@ Enhanced with quality scoring and table support. """ import re -from pathlib import Path -from typing import Any, Optional +from typing import Any from .base_parser import BaseParser from .unified_structure import ( Document, ContentBlock, ContentBlockType, CrossReference, CrossRefType, - AdmonitionType, Heading, CodeBlock, Table, Image, ListType, ExtractionStats + AdmonitionType, Heading, CodeBlock, Table, Image, ListType ) from .quality_scorer import QualityScorer @@ -31,10 +30,10 @@ from .quality_scorer import QualityScorer class MarkdownParser(BaseParser): """ Parser for Markdown documents. - + Supports standard Markdown and GitHub-flavored Markdown (GFM). """ - + # Admonition types for GitHub-style callouts ADMONITION_TYPES = { 'note': AdmonitionType.NOTE, @@ -46,21 +45,21 @@ class MarkdownParser(BaseParser): 'danger': AdmonitionType.DANGER, 'attention': AdmonitionType.ATTENTION, } - - def __init__(self, options: Optional[dict[str, Any]] = None): + + def __init__(self, options: dict[str, Any] | None = None): super().__init__(options) self.quality_scorer = QualityScorer() self._lines: list[str] = [] self._current_line = 0 - + @property def format_name(self) -> str: return 'markdown' - + @property def supported_extensions(self) -> list[str]: return ['.md', '.markdown', '.mdown', '.mkd'] - + def _detect_format(self, content: str) -> bool: """Detect if content is Markdown.""" md_indicators = [ @@ -71,34 +70,31 @@ class MarkdownParser(BaseParser): r'^\s*[-*+]\s+\S', # Lists r'^>\s+\S', # Blockquotes ] - for pattern in md_indicators: - if re.search(pattern, content, re.MULTILINE): - return True - return False - + return any(re.search(pattern, content, re.MULTILINE) for pattern in md_indicators) + def _parse_content(self, content: str, source_path: str) -> Document: """Parse Markdown content into Document.""" self._lines = content.split('\n') self._current_line = 0 - + document = Document( title='', format='markdown', source_path=source_path, ) - + # Parse frontmatter if present frontmatter = self._parse_frontmatter() if frontmatter: document.meta.update(frontmatter) - + # Parse content blocks while self._current_line < len(self._lines): block = self._parse_block() if block: document.blocks.append(block) self._current_line += 1 - + # Extract title from first h1 or frontmatter if document.meta.get('title'): document.title = document.meta['title'] @@ -109,55 +105,55 @@ class MarkdownParser(BaseParser): if heading_data and heading_data.level == 1: document.title = heading_data.text break - + # Extract specialized content self._extract_specialized_content(document) - + return document - - def _parse_frontmatter(self) -> Optional[dict]: + + def _parse_frontmatter(self) -> dict | None: """Parse YAML frontmatter if present.""" if self._current_line >= len(self._lines): return None - + first_line = self._lines[self._current_line].strip() if first_line != '---': return None - + # Find closing --- end_line = None for i in range(self._current_line + 1, len(self._lines)): if self._lines[i].strip() == '---': end_line = i break - + if end_line is None: return None - + # Extract frontmatter content frontmatter_lines = self._lines[self._current_line + 1:end_line] - frontmatter_content = '\n'.join(frontmatter_lines) - + '\n'.join(frontmatter_lines) + # Simple key: value parsing (not full YAML) meta = {} current_key = None current_value = [] - + for line in frontmatter_lines: stripped = line.strip() if not stripped: continue - + # Check for new key match = re.match(r'^(\w+):\s*(.*)$', stripped) if match: # Save previous key if current_key: meta[current_key] = '\n'.join(current_value).strip() - + current_key = match.group(1) value = match.group(2) - + # Handle inline value if value: # Check if it's a list @@ -178,146 +174,146 @@ class MarkdownParser(BaseParser): meta[current_key].append(stripped[2:].strip().strip('"\'')) elif current_key: current_value.append(stripped) - + # Save last key if current_key: meta[current_key] = '\n'.join(current_value).strip() - + # Advance past frontmatter self._current_line = end_line + 1 - + return meta - - def _parse_block(self) -> Optional[ContentBlock]: + + def _parse_block(self) -> ContentBlock | None: """Parse a single block at current position.""" line = self._current_line if line >= len(self._lines): return None - + current = self._lines[line] stripped = current.strip() - + # Skip empty lines if not stripped: return None - + # Skip HTML comments if stripped.startswith('' in line: break - + self._current_line += 1 - + # Skip comments in output (could optionally include) return None - + def _parse_horizontal_rule(self) -> ContentBlock: """Parse horizontal rule.""" return ContentBlock( @@ -543,28 +535,28 @@ class MarkdownParser(BaseParser): metadata={'element': 'horizontal_rule'}, source_line=self._current_line + 1, ) - - def _detect_list_type(self, stripped: str) -> Optional[ListType]: + + def _detect_list_type(self, stripped: str) -> ListType | None: """Detect if line starts a list and which type.""" if re.match(r'^[-*+]\s+', stripped): return ListType.BULLET if re.match(r'^\d+\.\s+', stripped): return ListType.NUMBERED return None - + def _parse_list(self, list_type: ListType) -> ContentBlock: """Parse a list.""" items = [] start_line = self._current_line - + while self._current_line < len(self._lines): line = self._lines[self._current_line] stripped = line.strip() - + if not stripped: self._current_line += 1 continue - + # Check if still in list if list_type == ListType.BULLET: match = re.match(r'^[-*+]\s+(.+)$', stripped) @@ -578,9 +570,9 @@ class MarkdownParser(BaseParser): self._current_line -= 1 break items.append(match.group(1)) - + self._current_line += 1 - + return ContentBlock( type=ContentBlockType.LIST, content=f"{len(items)} items", @@ -590,20 +582,20 @@ class MarkdownParser(BaseParser): }, source_line=start_line + 1, ) - + def _parse_paragraph(self) -> ContentBlock: """Parse a paragraph.""" lines = [] start_line = self._current_line - + while self._current_line < len(self._lines): line = self._lines[self._current_line] stripped = line.strip() - + # End of paragraph if not stripped: break - + # Check for block-level elements if stripped.startswith('#'): break @@ -619,45 +611,45 @@ class MarkdownParser(BaseParser): break if self._is_setext_header(self._current_line): break - + lines.append(stripped) self._current_line += 1 - + content = ' '.join(lines) - + # Process inline elements content = self._process_inline(content) - + return ContentBlock( type=ContentBlockType.PARAGRAPH, content=content, source_line=start_line + 1, ) - + def _process_inline(self, text: str) -> str: """Process inline Markdown elements.""" # Links [text](url) text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'[\1](\2)', text) - + # Images ![alt](url) text = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', r'![\1](\2)', text) - + # Code `code` text = re.sub(r'`([^`]+)`', r'`\1`', text) - + # Bold **text** or __text__ text = re.sub(r'\*\*([^*]+)\*\*', r'**\1**', text) text = re.sub(r'__([^_]+)__', r'**\1**', text) - + # Italic *text* or _text_ text = re.sub(r'(? str: """Create URL anchor from heading text.""" anchor = text.lower() @@ -665,7 +657,7 @@ class MarkdownParser(BaseParser): anchor = anchor.replace(' ', '-') anchor = re.sub(r'-+', '-', anchor) return anchor.strip('-') - + def _extract_specialized_content(self, document: Document): """Extract specialized content lists from blocks.""" for block in document.blocks: @@ -674,19 +666,19 @@ class MarkdownParser(BaseParser): heading_data = block.metadata.get('heading_data') if heading_data: document.headings.append(heading_data) - + # Extract code blocks elif block.type == ContentBlockType.CODE_BLOCK: code_data = block.metadata.get('code_data') if code_data: document.code_blocks.append(code_data) - + # Extract tables elif block.type == ContentBlockType.TABLE: table_data = block.metadata.get('table_data') if table_data: document.tables.append(table_data) - + # Extract images from paragraphs (simplified) elif block.type == ContentBlockType.PARAGRAPH: content = block.content @@ -698,7 +690,7 @@ class MarkdownParser(BaseParser): source_line=block.source_line, ) document.images.append(image) - + # Extract links link_matches = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content) for text, url in link_matches: @@ -709,14 +701,14 @@ class MarkdownParser(BaseParser): ref_type = CrossRefType.EXTERNAL else: ref_type = CrossRefType.INTERNAL - + xref = CrossReference( ref_type=ref_type, target=url, text=text, source_line=block.source_line, ) - + if ref_type == CrossRefType.EXTERNAL: document.external_links.append(xref) else: diff --git a/src/skill_seekers/cli/parsers/extractors/pdf_parser.py b/src/skill_seekers/cli/parsers/extractors/pdf_parser.py index 0d260ae..4490b51 100644 --- a/src/skill_seekers/cli/parsers/extractors/pdf_parser.py +++ b/src/skill_seekers/cli/parsers/extractors/pdf_parser.py @@ -5,7 +5,7 @@ Wraps PDFExtractor to provide unified Document output. """ from pathlib import Path -from typing import Any, Optional +from typing import Any from .base_parser import BaseParser, ParseResult from .quality_scorer import QualityScorer @@ -14,7 +14,6 @@ from .unified_structure import ( ContentBlock, ContentBlockType, Document, - ExtractionStats, Heading, Image, Table, @@ -33,13 +32,13 @@ except ImportError: class PdfParser(BaseParser): """ Parser for PDF documents. - + Wraps the existing PDFExtractor to provide unified Document output while maintaining all PDF-specific features (OCR, image extraction, table extraction, etc.). """ - def __init__(self, options: Optional[dict[str, Any]] = None): + def __init__(self, options: dict[str, Any] | None = None): super().__init__(options) self.pdf_options = { "verbose": self.options.get("verbose", False), @@ -71,7 +70,7 @@ class PdfParser(BaseParser): def _parse_content(self, content: str, source_path: str) -> Document: """ Parse PDF content into Document. - + Note: For PDF, we need the file path, not content string. This method is mainly for API compatibility. """ @@ -83,10 +82,10 @@ class PdfParser(BaseParser): def parse_file(self, path: str | Path) -> ParseResult: """ Parse a PDF file. - + Args: path: Path to PDF file - + Returns: ParseResult with Document or error info """ @@ -97,7 +96,7 @@ class PdfParser(BaseParser): result.errors.append(f"File not found: {path}") return result - if not path.suffix.lower() == ".pdf": + if path.suffix.lower() != ".pdf": result.errors.append(f"Not a PDF file: {path}") return result @@ -127,7 +126,7 @@ class PdfParser(BaseParser): # Convert to unified Document document = self._convert_to_document(extraction_result, str(path)) - + result.document = document result.success = True result.warnings.extend(document.stats.warnings) @@ -157,13 +156,13 @@ class PdfParser(BaseParser): # Process pages pages = extraction_result.get("pages", []) - + for page_num, page_data in enumerate(pages): # Add page heading page_heading = f"Page {page_num + 1}" if page_data.get("headings"): page_heading = page_data["headings"][0].get("text", page_heading) - + document.blocks.append( ContentBlock( type=ContentBlockType.HEADING, @@ -200,7 +199,7 @@ class PdfParser(BaseParser): source_line=page_num + 1, ) document.code_blocks.append(code_block) - + document.blocks.append( ContentBlock( type=ContentBlockType.CODE_BLOCK, @@ -224,7 +223,7 @@ class PdfParser(BaseParser): source_line=page_num + 1, ) document.tables.append(table) - + quality = self.quality_scorer.score_table(table) document.blocks.append( ContentBlock( @@ -268,7 +267,7 @@ class PdfParser(BaseParser): def parse(self, source: str | Path) -> ParseResult: """ Parse PDF from source. - + For PDF files, source should be a file path. """ if isinstance(source, str) and Path(source).exists(): diff --git a/src/skill_seekers/cli/parsers/extractors/quality_scorer.py b/src/skill_seekers/cli/parsers/extractors/quality_scorer.py index f2bc836..4c377d6 100644 --- a/src/skill_seekers/cli/parsers/extractors/quality_scorer.py +++ b/src/skill_seekers/cli/parsers/extractors/quality_scorer.py @@ -8,14 +8,13 @@ Provides consistent quality scoring across all parsers for: """ import re -from typing import Optional -from .unified_structure import CodeBlock, Table, ContentBlock +from .unified_structure import Table, ContentBlock class QualityScorer: """Score the quality of extracted content.""" - + # Language patterns for detection and validation LANGUAGE_PATTERNS = { 'python': { @@ -122,26 +121,26 @@ class QualityScorer: ], }, } - - def score_code_block(self, code: str, language: Optional[str] = None) -> float: + + def score_code_block(self, code: str, language: str | None = None) -> float: """ Score a code block for quality (0-10). - + Args: code: The code content language: Detected or specified language - + Returns: Quality score from 0-10 """ score = 5.0 # Start neutral - + if not code or not code.strip(): return 0.0 - + code = code.strip() - lines = [l for l in code.split('\n') if l.strip()] - + lines = [line for line in code.split('\n') if line.strip()] + # Factor 1: Length appropriateness code_len = len(code) if 50 <= code_len <= 1000: @@ -150,22 +149,22 @@ class QualityScorer: score -= 1.0 # Too long elif code_len < 20: score -= 2.0 # Too short - + # Factor 2: Line count if 3 <= len(lines) <= 50: score += 1.0 elif len(lines) > 100: score -= 0.5 - + # Factor 3: Language-specific validation if language and language in self.LANGUAGE_PATTERNS: lang_patterns = self.LANGUAGE_PATTERNS[language] - + # Check for keywords keyword_matches = sum(1 for kw in lang_patterns['keywords'] if kw in code) if keyword_matches >= 2: score += 1.0 - + # Check for syntax patterns syntax_matches = sum( 1 for pattern, _ in lang_patterns['syntax_checks'] @@ -173,27 +172,27 @@ class QualityScorer: ) if syntax_matches >= 1: score += 1.0 - + # Factor 4: Structural quality # Check for function/class definitions if re.search(r'\b(def|function|func|fn|class|public class)\b', code): score += 1.5 - + # Check for meaningful variable names (not just x, y, i) meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower()) if len(meaningful_vars) >= 3: score += 0.5 - + # Factor 5: Syntax validation (generic) is_valid, issues = self._validate_syntax(code, language) if is_valid: score += 1.0 else: score -= len(issues) * 0.3 - + # Factor 6: Comment/code ratio comment_lines = sum( - 1 for line in lines + 1 for line in lines if line.strip().startswith(('#', '//', '/*', '*', '--', '