fix: resolve all ruff linting errors (W293, F401, F821, B904, UP007, UP045, E741, SIM102, SIM117, ARG)

Auto-fixed (whitespace, imports, type annotations):
- codebase_scraper.py: W293 blank lines with whitespace
- doc_scraper.py: W293 blank lines with whitespace
- parsers/extractors/__init__.py: W293
- parsers/extractors/base_parser.py: W293, UP007, UP045, F401

Manual fixes:
- enhancement_workflow.py: B904 raise without `from exc`, remove unused `os` import
- parsers/extractors/quality_scorer.py: E741 ambiguous var `l` → `line`
- parsers/extractors/rst_parser.py: SIM102 nested if → combined conditions (x2)
- pdf_scraper.py: F821 undefined `logger` → `print()` (consistent with file style)
- mcp/tools/workflow_tools.py: ARG001 unused `args` → `_args`
- tests/test_workflow_runner.py: ARG005 unused lambda args → `_a`/`_kw`, ARG001 `kwargs` → `_kwargs`
- tests/test_workflows_command.py: SIM117 nested with → combined with (x2)

All 1922 tests pass.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-18 22:44:41 +03:00
parent c44b88e801
commit 0878ad3ef6
20 changed files with 657 additions and 695 deletions

View File

@@ -444,7 +444,7 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
def extract_rst_structure(content: str) -> dict[str, Any]:
"""
Extract structure from ReStructuredText (RST) content.
Uses the enhanced unified RST parser for comprehensive extraction.
RST uses underline-style headers:
@@ -474,13 +474,13 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
# Use the enhanced unified RST parser
try:
from skill_seekers.cli.parsers.extractors import RstParser
parser = RstParser()
result = parser.parse_string(content, "<string>")
if result.success and result.document:
doc = result.document
# Convert to legacy structure format for backward compatibility
structure = {
"title": doc.title,
@@ -531,7 +531,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
except Exception as e:
# Fall back to basic extraction if unified parser fails
logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")
# Legacy basic extraction (fallback)
import re

View File

@@ -401,13 +401,13 @@ class DocToSkillConverter:
# Try enhanced unified parser first
try:
from skill_seekers.cli.parsers.extractors import MarkdownParser
parser = MarkdownParser()
result = parser.parse_string(content, url)
if result.success and result.document:
doc = result.document
# Extract links from the document
links = []
for link in doc.external_links:
@@ -421,7 +421,7 @@ class DocToSkillConverter:
full_url = full_url.split("#")[0]
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links:
links.append(full_url)
return {
"url": url,
"title": doc.title or "",

View File

@@ -24,7 +24,6 @@ Usage:
import json
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime
from importlib.resources import files as importlib_files
@@ -145,11 +144,11 @@ class WorkflowEngine:
pkg_ref = importlib_files("skill_seekers.workflows").joinpath(bare_name)
yaml_text = pkg_ref.read_text(encoding="utf-8")
logger.info(f"📋 Loading bundled workflow: {bare_name}")
except (FileNotFoundError, TypeError, ModuleNotFoundError):
except (FileNotFoundError, TypeError, ModuleNotFoundError) as exc:
raise FileNotFoundError(
f"Workflow '{yaml_ref.stem}' not found. "
"Use 'skill-seekers workflows list' to see available workflows."
)
) from exc
if resolved_path is not None:
logger.info(f"📋 Loading workflow: {resolved_path}")

View File

@@ -6,20 +6,20 @@ a standardized Document structure.
Usage:
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
# Parse RST file
parser = RstParser()
result = parser.parse_file("docs/class_node.rst")
if result.success:
doc = result.document
print(f"Title: {doc.title}")
print(f"Tables: {len(doc.tables)}")
print(f"Code blocks: {len(doc.code_blocks)}")
# Convert to markdown
markdown = doc.to_markdown()
# Convert to skill format
skill_data = doc.to_skill_format()
@@ -29,7 +29,7 @@ Available Parsers:
Auto-Detection:
from skill_seekers.cli.parsers.extractors import parse_document
# Automatically detects format
result = parse_document("file.rst")
"""

View File

@@ -8,11 +8,11 @@ and implement the same interface for consistent usage.
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional, Union
from typing import Any
import time
import logging
from .unified_structure import Document, ExtractionStats
from .unified_structure import Document
logger = logging.getLogger(__name__)
@@ -20,11 +20,11 @@ logger = logging.getLogger(__name__)
@dataclass
class ParseResult:
"""Result of parsing a document."""
document: Optional[Document] = None
document: Document | None = None
success: bool = False
errors: list[str] = field(default_factory=list)
warnings: list[str] = field(default_factory=list)
@property
def is_ok(self) -> bool:
"""Check if parsing succeeded."""
@@ -34,18 +34,18 @@ class ParseResult:
class BaseParser(ABC):
"""
Abstract base class for all document parsers.
Implementations:
- RstParser: ReStructuredText documents
- MarkdownParser: Markdown documents
- PdfParser: PDF documents
- HtmlParser: HTML documents (future)
"""
def __init__(self, options: Optional[dict[str, Any]] = None):
def __init__(self, options: dict[str, Any] | None = None):
"""
Initialize parser with options.
Args:
options: Parser-specific options
Common options:
@@ -61,26 +61,26 @@ class BaseParser(ABC):
self._quality_scoring = self.options.get('quality_scoring', True)
self._max_file_size = self.options.get('max_file_size_mb', 50.0) * 1024 * 1024
self._encoding = self.options.get('encoding', 'utf-8')
@property
@abstractmethod
def format_name(self) -> str:
"""Return the format name this parser handles."""
pass
@property
@abstractmethod
def supported_extensions(self) -> list[str]:
"""Return list of supported file extensions."""
pass
def can_parse(self, source: Union[str, Path]) -> bool:
def can_parse(self, source: str | Path) -> bool:
"""
Check if this parser can handle the given source.
Args:
source: File path or content string
Returns:
True if this parser can handle the source
"""
@@ -95,58 +95,58 @@ class BaseParser(ABC):
except Exception:
return False
return False
def parse(self, source: str | Path) -> ParseResult:
    """
    Parse a document from file path or content string.

    Args:
        source: File path (str/Path) or content string

    Returns:
        ParseResult carrying the parsed document on success; failures are
        collected in result.errors rather than raised.
    """
    start_time = time.time()
    result = ParseResult()

    try:
        # Read source
        content, source_path = self._read_source_with_path(source)

        # Check file size against the configured limit
        if len(content.encode(self._encoding)) > self._max_file_size:
            result.errors.append(f"File too large: {source_path}")
            return result

        # Validate format — a mismatch only warns, parsing still proceeds
        if not self._detect_format(content):
            result.warnings.append(f"Content may not be valid {self.format_name}")

        # Parse content via the subclass hook
        document = self._parse_content(content, source_path)

        # Post-process (headings/code blocks/tables extraction, stats)
        document = self._post_process(document)

        # Record elapsed parse time in milliseconds
        processing_time = (time.time() - start_time) * 1000
        if document.stats:
            document.stats.processing_time_ms = processing_time

        result.document = document
        result.success = True
        result.warnings.extend(document.stats.warnings)

    except Exception as e:
        result.errors.append(f"Parse error: {str(e)}")
        logger.exception(f"Error parsing {source}")

    return result
def parse_file(self, path: str | Path) -> ParseResult:
    """Parse a document from a file path (thin wrapper over parse())."""
    return self.parse(path)
def parse_string(self, content: str, source_path: str = "<string>") -> ParseResult:
"""Parse content from string."""
# Create a wrapper that looks like a path
@@ -160,46 +160,46 @@ class BaseParser(ABC):
return True
def __str__(self):
return self._path
source = StringSource(content, source_path)
result = self.parse(source)
if result.document:
result.document.source_path = source_path
return result
@abstractmethod
def _parse_content(self, content: str, source_path: str) -> Document:
    """
    Parse content string into Document.

    Format-specific parsing hook; parse() wraps this with source reading,
    size checks, and post-processing.

    Args:
        content: Raw content to parse
        source_path: Original source path (for reference)

    Returns:
        Parsed Document
    """
    pass
@abstractmethod
def _detect_format(self, content: str) -> bool:
    """
    Detect if content matches this parser's format.

    parse() treats a False result as a warning, not an error.

    Args:
        content: Content to check

    Returns:
        True if content appears to be this format
    """
    pass
def _read_source(self, source: str | Path) -> str:
    """Read content from source, discarding the resolved path."""
    content, _ = self._read_source_with_path(source)
    return content
def _read_source_with_path(self, source: Union[str, Path]) -> tuple[str, str]:
def _read_source_with_path(self, source: str | Path) -> tuple[str, str]:
"""Read content and return with path."""
if isinstance(source, str):
# Check if it's a path or content
@@ -214,37 +214,37 @@ class BaseParser(ABC):
else:
# Assume it's a file-like object
return source.read_text(encoding=self._encoding), str(source)
def _post_process(self, document: Document) -> Document:
    """
    Post-process a freshly parsed document.

    Derives headings, code blocks, and tables from the block list when the
    parser did not populate them directly, then refreshes the summary
    counters on document.stats. Override to add cross-references,
    validation, etc.
    """
    # Only derive from blocks when the collection is still empty;
    # a truthy collection is kept untouched.
    document.headings = document.headings or self._extract_headings(document)
    document.code_blocks = document.code_blocks or self._extract_code_blocks(document)
    document.tables = document.tables or self._extract_tables(document)

    # Refresh the aggregate counters.
    stats = document.stats
    stats.total_blocks = len(document.blocks)
    stats.code_blocks = len(document.code_blocks)
    stats.tables = len(document.tables)
    stats.headings = len(document.headings)
    stats.cross_references = len(document.internal_links) + len(document.external_links)

    return document
def _extract_headings(self, document: Document) -> list:
"""Extract headings from content blocks."""
from .unified_structure import ContentBlockType, Heading
from .unified_structure import ContentBlockType
headings = []
for block in document.blocks:
if block.type == ContentBlockType.HEADING:
@@ -252,7 +252,7 @@ class BaseParser(ABC):
if heading_data:
headings.append(heading_data)
return headings
def _extract_code_blocks(self, document: Document) -> list:
"""Extract code blocks from content blocks."""
code_blocks = []
@@ -260,7 +260,7 @@ class BaseParser(ABC):
if block.metadata.get('code_data'):
code_blocks.append(block.metadata['code_data'])
return code_blocks
def _extract_tables(self, document: Document) -> list:
"""Extract tables from content blocks."""
tables = []
@@ -268,7 +268,7 @@ class BaseParser(ABC):
if block.metadata.get('table_data'):
tables.append(block.metadata['table_data'])
return tables
def _create_quality_scorer(self):
"""Create a quality scorer if enabled."""
if self._quality_scoring:
@@ -277,44 +277,44 @@ class BaseParser(ABC):
return None
def get_parser_for_file(path: str | Path) -> BaseParser | None:
    """
    Get the appropriate parser for a file based on its extension.

    Args:
        path: File path

    Returns:
        Appropriate parser instance, or None when no parser claims the
        extension.
    """
    suffix = Path(path).suffix.lower()

    # Try RST parser (imports are local — presumably to avoid circular
    # imports between parser modules; confirm before hoisting)
    from .rst_parser import RstParser
    rst_parser = RstParser()
    if suffix in rst_parser.supported_extensions:
        return rst_parser

    # Try Markdown parser
    from .markdown_parser import MarkdownParser
    md_parser = MarkdownParser()
    if suffix in md_parser.supported_extensions:
        return md_parser

    # Could add PDF, HTML parsers here
    return None
def parse_document(source: Union[str, Path], format_hint: Optional[str] = None) -> ParseResult:
def parse_document(source: str | Path, format_hint: str | None = None) -> ParseResult:
"""
Parse a document, auto-detecting the format.
Args:
source: File path or content string
format_hint: Optional format hint ('rst', 'markdown', etc.)
Returns:
ParseResult
"""
@@ -326,21 +326,21 @@ def parse_document(source: Union[str, Path], format_hint: Optional[str] = None)
elif format_hint.lower() in ('md', 'markdown'):
from .markdown_parser import MarkdownParser
return MarkdownParser().parse(source)
# Auto-detect from file extension
parser = get_parser_for_file(source)
if parser:
return parser.parse(source)
# Try content-based detection
content = source if isinstance(source, str) else Path(source).read_text()
# Check for RST indicators
rst_indicators = ['.. ', '::\n', ':ref:`', '.. toctree::', '.. code-block::']
if any(ind in content for ind in rst_indicators):
from .rst_parser import RstParser
return RstParser().parse_string(content)
# Default to Markdown
from .markdown_parser import MarkdownParser
return MarkdownParser().parse_string(content)

View File

@@ -7,45 +7,44 @@ Convert unified Document structure to various output formats.
from typing import Any
from .unified_structure import (
Document, ContentBlock, ContentBlockType, CrossRefType,
AdmonitionType, ListType, Table, CodeBlock
Document, ContentBlock, ContentBlockType, AdmonitionType, ListType, Table
)
class MarkdownFormatter:
"""Format Document as Markdown."""
def __init__(self, options: dict[str, Any] | None = None):
    """
    Initialize the Markdown formatter.

    Args:
        options: Optional settings — 'include_toc' (bool, default False),
            'max_heading_level' (int, default 6), 'code_block_style'
            ('fenced' or indented, default 'fenced'), 'table_style'
            (default 'github').
    """
    # Annotation fixed to `| None`: the default is None (UP045-style,
    # matching the rest of this commit's typing cleanup).
    self.options = options or {}
    self.include_toc = self.options.get('include_toc', False)
    self.max_heading_level = self.options.get('max_heading_level', 6)
    self.code_block_style = self.options.get('code_block_style', 'fenced')
    self.table_style = self.options.get('table_style', 'github')
def format(self, document: Document) -> str:
    """
    Render *document* as a single Markdown string.

    Sections are emitted in a fixed order: title, YAML frontmatter,
    optional table of contents, then every content block.
    """
    pieces: list[str] = []

    if document.title:
        pieces.append(f"# {document.title}\n")

    if document.meta:
        pieces.append(self._format_metadata(document.meta))

    if self.include_toc and document.headings:
        pieces.append(self._format_toc(document.headings))

    # Blocks that format to an empty string are dropped.
    pieces.extend(
        text
        for text in (self._format_block(b) for b in document.blocks)
        if text
    )

    return '\n'.join(pieces)
def _format_metadata(self, meta: dict) -> str:
"""Format metadata as YAML frontmatter."""
lines = ['---']
@@ -58,7 +57,7 @@ class MarkdownFormatter:
lines.append(f"{key}: {value}")
lines.append('---\n')
return '\n'.join(lines)
def _format_toc(self, headings: list) -> str:
"""Format table of contents."""
lines = ['## Table of Contents\n']
@@ -69,7 +68,7 @@ class MarkdownFormatter:
lines.append(f"{indent}- [{h.text}](#{anchor})")
lines.append('')
return '\n'.join(lines)
def _format_block(self, block: ContentBlock) -> str:
"""Format a single content block."""
handlers = {
@@ -86,14 +85,14 @@ class MarkdownFormatter:
ContentBlockType.DEFINITION_LIST: self._format_definition_list,
ContentBlockType.META: self._format_meta,
}
handler = handlers.get(block.type)
if handler:
return handler(block)
# Default: return content as-is
return block.content + '\n'
def _format_heading(self, block: ContentBlock) -> str:
"""Format heading block."""
heading_data = block.metadata.get('heading_data')
@@ -103,87 +102,84 @@ class MarkdownFormatter:
else:
level = block.metadata.get('level', 1)
text = block.content
if level > self.max_heading_level:
return f"**{text}**\n"
return f"{'#' * level} {text}\n"
def _format_paragraph(self, block: ContentBlock) -> str:
    """Format a plain paragraph: the block text passed through verbatim."""
    return ''.join((block.content, '\n'))
def _format_code_block(self, block: ContentBlock) -> str:
    """Format a code block, honoring the configured fenced/indented style."""
    data = block.metadata.get('code_data')
    # Prefer the structured CodeBlock payload; fall back to raw content.
    if data:
        code, lang = data.code, data.language or ''
    else:
        code, lang = block.content, block.metadata.get('language', '')

    if self.code_block_style != 'fenced':
        # Classic four-space indented style (no language tag possible).
        body = '\n'.join('    ' + line for line in code.split('\n'))
        return body + '\n'

    return f"```{lang}\n{code}\n```\n"
def _format_table(self, block: ContentBlock) -> str:
    """Format a table block; empty string when no table payload is attached."""
    table = block.metadata.get('table_data')
    return self._format_table_data(table) if table else ''
def _format_table_data(self, table: Table) -> str:
    """
    Render a Table as a GitHub-style pipe table.

    When the table has no explicit headers, the first data row is promoted
    to the header row. Short rows are padded and long rows truncated so
    every line matches the header width. Empty tables render as ''.
    """
    if not table.rows:
        return ''

    out = []
    if table.caption:
        out.append(f"**{table.caption}**\n")

    headers = table.headers if table.headers else table.rows[0]
    width = len(headers)
    out.append('| ' + ' | '.join(headers) + ' |')
    out.append('|' + '|'.join(['---'] * width) + '|')

    # First data row is consumed as headers only when none were given.
    body_rows = table.rows if table.headers else table.rows[1:]
    for row in body_rows:
        cells = (row + [''] * (width - len(row)))[:width]
        out.append('| ' + ' | '.join(cells) + ' |')

    out.append('')
    return '\n'.join(out)
def _format_list(self, block: ContentBlock) -> str:
    """
    Format a list block as Markdown bullets or numbered items.

    Falls back to the raw block content when no parsed items are attached.
    """
    list_type = block.metadata.get('list_type', ListType.BULLET)
    items = block.metadata.get('items', [])

    if not items:
        return block.content + '\n'

    lines = []
    for i, item in enumerate(items):
        # Numbered lists count from 1; everything else renders as "-".
        # (Removed leftover duplicate if/else that computed the same prefix.)
        prefix = f"{i + 1}." if list_type == ListType.NUMBERED else "-"
        lines.append(f"{prefix} {item}")
    lines.append('')
    return '\n'.join(lines)
def _format_image(self, block: ContentBlock) -> str:
"""Format image block."""
image_data = block.metadata.get('image_data')
@@ -193,9 +189,9 @@ class MarkdownFormatter:
else:
src = block.metadata.get('src', '')
alt = block.metadata.get('alt', '')
return f"![{alt}]({src})\n"
def _format_cross_ref(self, block: ContentBlock) -> str:
"""Format cross-reference block."""
xref_data = block.metadata.get('xref_data')
@@ -203,13 +199,13 @@ class MarkdownFormatter:
text = xref_data.text or xref_data.target
target = xref_data.target
return f"[{text}](#{target})\n"
return block.content + '\n'
def _format_admonition(self, block: ContentBlock) -> str:
"""Format admonition/callout block."""
admonition_type = block.metadata.get('admonition_type', AdmonitionType.NOTE)
# GitHub-style admonitions
type_map = {
AdmonitionType.NOTE: 'NOTE',
@@ -218,16 +214,16 @@ class MarkdownFormatter:
AdmonitionType.IMPORTANT: 'IMPORTANT',
AdmonitionType.CAUTION: 'CAUTION',
}
type_str = type_map.get(admonition_type, 'NOTE')
content = block.content
return f"> [!{type_str}]\n> {content.replace(chr(10), chr(10) + '> ')}\n"
def _format_directive(self, block: ContentBlock) -> str:
"""Format directive block (RST-specific)."""
directive_name = block.metadata.get('directive_name', 'unknown')
# Format as a blockquote with directive name
content = block.content
lines = [f"> **{directive_name}**"]
@@ -235,13 +231,13 @@ class MarkdownFormatter:
lines.append(f"> {line}")
lines.append('')
return '\n'.join(lines)
def _format_field_list(self, block: ContentBlock) -> str:
"""Format field list block."""
fields = block.metadata.get('fields', [])
if not fields:
return block.content + '\n'
lines = []
for field in fields:
if field.arg:
@@ -250,13 +246,13 @@ class MarkdownFormatter:
lines.append(f"**{field.name}**: {field.content}")
lines.append('')
return '\n'.join(lines)
def _format_definition_list(self, block: ContentBlock) -> str:
"""Format definition list block."""
items = block.metadata.get('items', [])
if not items:
return block.content + '\n'
lines = []
for item in items:
if item.classifier:
@@ -266,7 +262,7 @@ class MarkdownFormatter:
lines.append(f": {item.definition}")
lines.append('')
return '\n'.join(lines)
def _format_meta(self, block: ContentBlock) -> str:
    """Drop metadata blocks from the body output.

    Intentionally returns '': metadata is emitted as YAML frontmatter
    by format() instead of inline.
    """
    return ''
@@ -274,7 +270,7 @@ class MarkdownFormatter:
class SkillFormatter:
"""Format Document for skill-seekers internal use."""
def format(self, document: Document) -> dict[str, Any]:
"""Format document for skill output."""
return {
@@ -324,7 +320,7 @@ class SkillFormatter:
"processing_time_ms": document.stats.processing_time_ms,
}
}
def _extract_summary(self, document: Document, max_length: int = 500) -> str:
"""Extract a text summary from the document."""
paragraphs = []
@@ -333,22 +329,22 @@ class SkillFormatter:
paragraphs.append(block.content)
if len(' '.join(paragraphs)) > max_length:
break
summary = ' '.join(paragraphs)
if len(summary) > max_length:
summary = summary[:max_length - 3] + '...'
return summary
def _score_table(self, table: Table) -> float:
    """Heuristic 0-10 quality score for a table (empty tables score 0)."""
    if not table.rows:
        return 0.0
    # Base 5.0, +2 for explicit headers, +1 for a sane row count (2..50),
    # capped at 10.
    header_bonus = 2.0 if table.headers else 0.0
    size_bonus = 1.0 if 2 <= len(table.rows) <= 50 else 0.0
    return min(10.0, 5.0 + header_bonus + size_bonus)

View File

@@ -17,13 +17,12 @@ Enhanced with quality scoring and table support.
"""
import re
from pathlib import Path
from typing import Any, Optional
from typing import Any
from .base_parser import BaseParser
from .unified_structure import (
Document, ContentBlock, ContentBlockType, CrossReference, CrossRefType,
AdmonitionType, Heading, CodeBlock, Table, Image, ListType, ExtractionStats
AdmonitionType, Heading, CodeBlock, Table, Image, ListType
)
from .quality_scorer import QualityScorer
@@ -31,10 +30,10 @@ from .quality_scorer import QualityScorer
class MarkdownParser(BaseParser):
"""
Parser for Markdown documents.
Supports standard Markdown and GitHub-flavored Markdown (GFM).
"""
# Admonition types for GitHub-style callouts
ADMONITION_TYPES = {
'note': AdmonitionType.NOTE,
@@ -46,21 +45,21 @@ class MarkdownParser(BaseParser):
'danger': AdmonitionType.DANGER,
'attention': AdmonitionType.ATTENTION,
}
def __init__(self, options: dict[str, Any] | None = None):
    """
    Initialize the Markdown parser.

    Args:
        options: Parser options forwarded to BaseParser.
    """
    # Removed leftover duplicate signature line (Optional[...] variant)
    # from the UP045 migration; the `| None` form is kept.
    super().__init__(options)
    self.quality_scorer = QualityScorer()
    # Per-parse cursor state, populated by _parse_content.
    self._lines: list[str] = []
    self._current_line = 0
@property
def format_name(self) -> str:
    """Canonical name of the format this parser handles."""
    return 'markdown'
@property
def supported_extensions(self) -> list[str]:
    """File extensions (lowercase, with leading dot) treated as Markdown."""
    return ['.md', '.markdown', '.mdown', '.mkd']
def _detect_format(self, content: str) -> bool:
"""Detect if content is Markdown."""
md_indicators = [
@@ -71,34 +70,31 @@ class MarkdownParser(BaseParser):
r'^\s*[-*+]\s+\S', # Lists
r'^>\s+\S', # Blockquotes
]
for pattern in md_indicators:
if re.search(pattern, content, re.MULTILINE):
return True
return False
return any(re.search(pattern, content, re.MULTILINE) for pattern in md_indicators)
def _parse_content(self, content: str, source_path: str) -> Document:
"""Parse Markdown content into Document."""
self._lines = content.split('\n')
self._current_line = 0
document = Document(
title='',
format='markdown',
source_path=source_path,
)
# Parse frontmatter if present
frontmatter = self._parse_frontmatter()
if frontmatter:
document.meta.update(frontmatter)
# Parse content blocks
while self._current_line < len(self._lines):
block = self._parse_block()
if block:
document.blocks.append(block)
self._current_line += 1
# Extract title from first h1 or frontmatter
if document.meta.get('title'):
document.title = document.meta['title']
@@ -109,55 +105,55 @@ class MarkdownParser(BaseParser):
if heading_data and heading_data.level == 1:
document.title = heading_data.text
break
# Extract specialized content
self._extract_specialized_content(document)
return document
def _parse_frontmatter(self) -> Optional[dict]:
def _parse_frontmatter(self) -> dict | None:
"""Parse YAML frontmatter if present."""
if self._current_line >= len(self._lines):
return None
first_line = self._lines[self._current_line].strip()
if first_line != '---':
return None
# Find closing ---
end_line = None
for i in range(self._current_line + 1, len(self._lines)):
if self._lines[i].strip() == '---':
end_line = i
break
if end_line is None:
return None
# Extract frontmatter content
frontmatter_lines = self._lines[self._current_line + 1:end_line]
frontmatter_content = '\n'.join(frontmatter_lines)
'\n'.join(frontmatter_lines)
# Simple key: value parsing (not full YAML)
meta = {}
current_key = None
current_value = []
for line in frontmatter_lines:
stripped = line.strip()
if not stripped:
continue
# Check for new key
match = re.match(r'^(\w+):\s*(.*)$', stripped)
if match:
# Save previous key
if current_key:
meta[current_key] = '\n'.join(current_value).strip()
current_key = match.group(1)
value = match.group(2)
# Handle inline value
if value:
# Check if it's a list
@@ -178,146 +174,146 @@ class MarkdownParser(BaseParser):
meta[current_key].append(stripped[2:].strip().strip('"\''))
elif current_key:
current_value.append(stripped)
# Save last key
if current_key:
meta[current_key] = '\n'.join(current_value).strip()
# Advance past frontmatter
self._current_line = end_line + 1
return meta
def _parse_block(self) -> ContentBlock | None:
    """
    Dispatch on the line at the current cursor and parse one block.

    Returns None for empty lines or a cursor past the end of input;
    otherwise delegates to the first specialized _parse_* helper whose
    syntax matches. Paragraph is the fallback.
    """
    # Removed leftover duplicate signature line (Optional[...] variant).
    line = self._current_line
    if line >= len(self._lines):
        return None

    current = self._lines[line]
    stripped = current.strip()

    # Skip empty lines
    if not stripped:
        return None

    # Skip HTML comments
    if stripped.startswith('<!--'):
        return self._parse_html_comment()

    # ATX Headers
    if stripped.startswith('#'):
        return self._parse_atx_header()

    # Setext headers (underline style)
    if self._is_setext_header(line):
        return self._parse_setext_header()

    # Code fence
    if stripped.startswith('```'):
        return self._parse_code_fence()

    # Indented code block (four spaces or a tab)
    if current.startswith(('    ', '\t')):
        return self._parse_indented_code()

    # Table
    if '|' in stripped and self._is_table(line):
        return self._parse_table()

    # Blockquote (check for admonition)
    if stripped.startswith('>'):
        return self._parse_blockquote()

    # Horizontal rule
    if re.match(r'^[\-*_]{3,}\s*$', stripped):
        return self._parse_horizontal_rule()

    # List
    list_type = self._detect_list_type(stripped)
    if list_type:
        return self._parse_list(list_type)

    # Paragraph (default)
    return self._parse_paragraph()
def _is_setext_header(self, line: int) -> bool:
    """Return True when line is text underlined by ===/--- (Setext style)."""
    if line + 1 >= len(self._lines):
        return False
    text = self._lines[line].strip()
    underline = self._lines[line + 1].strip()
    # Both the text line and its underline must be non-empty, and the
    # underline may contain only '=' (H1) or '-' (H2) characters.
    return bool(text) and bool(underline) and re.match(r'^[=-]+$', underline) is not None
def _parse_atx_header(self) -> ContentBlock:
    """Parse an ATX heading ("# Title"); fall back to a paragraph."""
    raw = self._lines[self._current_line].strip()
    m = re.match(r'^(#{1,6})\s+(.+)$', raw)
    if not m:
        # Not a well-formed heading after all — treat as plain text.
        return self._parse_paragraph()

    level = len(m.group(1))
    # Strip any closing run of hashes ("## Title ##").
    text = re.sub(r'\s+#+$', '', m.group(2).strip())
    line_no = self._current_line + 1

    heading = Heading(
        level=level,
        text=text,
        id=self._create_anchor(text),
        source_line=line_no,
    )
    return ContentBlock(
        type=ContentBlockType.HEADING,
        content=text,
        metadata={'heading_data': heading},
        source_line=line_no,
    )
def _parse_setext_header(self) -> ContentBlock:
    """Parse a Setext heading (text underlined with === for H1, --- for H2)."""
    cursor = self._current_line
    text = self._lines[cursor].strip()
    underline = self._lines[cursor + 1].strip()

    # '=' underline means H1; otherwise (a '-' underline) H2.
    level = 1 if underline.startswith('=') else 2

    heading = Heading(
        level=level,
        text=text,
        id=self._create_anchor(text),
        source_line=cursor + 1,
    )

    # Consume the underline so the main loop does not revisit it.
    self._current_line = cursor + 1

    return ContentBlock(
        type=ContentBlockType.HEADING,
        content=text,
        metadata={'heading_data': heading},
        source_line=cursor + 1,
    )
def _parse_code_fence(self) -> ContentBlock:
"""Parse fenced code block."""
line = self._lines[self._current_line]
match = re.match(r'^```(\w+)?\s*$', line.strip())
language = match.group(1) if match else None
start_line = self._current_line
self._current_line += 1
code_lines = []
while self._current_line < len(self._lines):
current_line = self._lines[self._current_line]
@@ -325,19 +321,19 @@ class MarkdownParser(BaseParser):
break
code_lines.append(current_line)
self._current_line += 1
code = '\n'.join(code_lines)
# Detect language if not specified
detected_lang, confidence = self.quality_scorer.detect_language(code)
if not language and confidence > 0.6:
language = detected_lang
elif not language:
language = 'text'
# Score code quality
quality = self.quality_scorer.score_code_block(code, language)
code_block = CodeBlock(
code=code,
language=language,
@@ -345,7 +341,7 @@ class MarkdownParser(BaseParser):
confidence=confidence if language == detected_lang else 1.0,
source_line=start_line + 1,
)
return ContentBlock(
type=ContentBlockType.CODE_BLOCK,
content=code,
@@ -356,19 +352,19 @@ class MarkdownParser(BaseParser):
source_line=start_line + 1,
quality_score=quality,
)
def _parse_indented_code(self) -> ContentBlock:
"""Parse indented code block."""
code_lines = []
start_line = self._current_line
while self._current_line < len(self._lines):
line = self._lines[self._current_line]
if not line.strip():
code_lines.append('')
self._current_line += 1
continue
if line.startswith(' '):
code_lines.append(line[4:])
elif line.startswith('\t'):
@@ -376,15 +372,15 @@ class MarkdownParser(BaseParser):
else:
self._current_line -= 1
break
self._current_line += 1
code = '\n'.join(code_lines).rstrip()
# Detect language
detected_lang, confidence = self.quality_scorer.detect_language(code)
quality = self.quality_scorer.score_code_block(code, detected_lang)
code_block = CodeBlock(
code=code,
language=detected_lang if confidence > 0.6 else 'text',
@@ -392,7 +388,7 @@ class MarkdownParser(BaseParser):
confidence=confidence,
source_line=start_line + 1,
)
return ContentBlock(
type=ContentBlockType.CODE_BLOCK,
content=code,
@@ -403,52 +399,49 @@ class MarkdownParser(BaseParser):
source_line=start_line + 1,
quality_score=quality,
)
def _is_table(self, line: int) -> bool:
    """
    Check whether the given line starts a GFM table.

    A table is a row containing '|' followed by a separator line made up
    only of '|', ':' and '-' characters (e.g. |---|:--:|).
    """
    # Removed unreachable duplicate return left over from the SIM refactor
    # (old if/return pair and new bool(...) return were both present).
    if line + 1 >= len(self._lines):
        return False

    current = self._lines[line].strip()
    next_line = self._lines[line + 1].strip()

    return bool(re.match(r'^[\|:-]+$', next_line) and '|' in current)
def _parse_table(self) -> ContentBlock:
"""Parse a GFM table."""
rows = []
headers = None
start_line = self._current_line
# Parse header row
header_line = self._lines[self._current_line].strip()
headers = [cell.strip() for cell in header_line.split('|')]
headers = [h for h in headers if h] # Remove empty
self._current_line += 1
# Skip separator line (|:--:| etc.)
if self._current_line < len(self._lines):
self._current_line += 1
# Parse data rows
while self._current_line < len(self._lines):
line = self._lines[self._current_line].strip()
if not line or '|' not in line:
self._current_line -= 1
break
cells = [cell.strip() for cell in line.split('|')]
cells = [c for c in cells if c]
if cells:
rows.append(cells)
self._current_line += 1
table = Table(
rows=rows,
headers=headers,
@@ -456,9 +449,9 @@ class MarkdownParser(BaseParser):
source_format='markdown',
source_line=start_line + 1,
)
quality = self.quality_scorer.score_table(table)
return ContentBlock(
type=ContentBlockType.TABLE,
content=f"[Table: {len(rows)} rows]",
@@ -466,25 +459,25 @@ class MarkdownParser(BaseParser):
source_line=start_line + 1,
quality_score=quality,
)
def _parse_blockquote(self) -> ContentBlock:
"""Parse a blockquote, checking for admonitions."""
lines = []
start_line = self._current_line
admonition_type = None
admonition_content = []
while self._current_line < len(self._lines):
line = self._lines[self._current_line]
stripped = line.strip()
if not stripped.startswith('>'):
self._current_line -= 1
break
# Remove > prefix
content = line[1:].strip() if line.startswith('> ') else line[1:].strip()
# Check for GitHub-style admonition: > [!NOTE]
admonition_match = re.match(r'^\[!([\w]+)\]\s*(.*)$', content)
if admonition_match and not admonition_type:
@@ -497,9 +490,9 @@ class MarkdownParser(BaseParser):
admonition_content.append(content)
else:
lines.append(content)
self._current_line += 1
# Return as admonition if detected
if admonition_type:
return ContentBlock(
@@ -508,7 +501,7 @@ class MarkdownParser(BaseParser):
metadata={'admonition_type': admonition_type},
source_line=start_line + 1,
)
# Regular blockquote
content = '\n'.join(lines)
return ContentBlock(
@@ -517,24 +510,23 @@ class MarkdownParser(BaseParser):
metadata={'block_type': 'blockquote'},
source_line=start_line + 1,
)
def _parse_html_comment(self) -> Optional[ContentBlock]:
def _parse_html_comment(self) -> ContentBlock | None:
"""Parse HTML comment (usually skip)."""
start_line = self._current_line
content_lines = []
while self._current_line < len(self._lines):
line = self._lines[self._current_line]
content_lines.append(line)
if '-->' in line:
break
self._current_line += 1
# Skip comments in output (could optionally include)
return None
def _parse_horizontal_rule(self) -> ContentBlock:
"""Parse horizontal rule."""
return ContentBlock(
@@ -543,28 +535,28 @@ class MarkdownParser(BaseParser):
metadata={'element': 'horizontal_rule'},
source_line=self._current_line + 1,
)
def _detect_list_type(self, stripped: str) -> Optional[ListType]:
def _detect_list_type(self, stripped: str) -> ListType | None:
"""Detect if line starts a list and which type."""
if re.match(r'^[-*+]\s+', stripped):
return ListType.BULLET
if re.match(r'^\d+\.\s+', stripped):
return ListType.NUMBERED
return None
def _parse_list(self, list_type: ListType) -> ContentBlock:
"""Parse a list."""
items = []
start_line = self._current_line
while self._current_line < len(self._lines):
line = self._lines[self._current_line]
stripped = line.strip()
if not stripped:
self._current_line += 1
continue
# Check if still in list
if list_type == ListType.BULLET:
match = re.match(r'^[-*+]\s+(.+)$', stripped)
@@ -578,9 +570,9 @@ class MarkdownParser(BaseParser):
self._current_line -= 1
break
items.append(match.group(1))
self._current_line += 1
return ContentBlock(
type=ContentBlockType.LIST,
content=f"{len(items)} items",
@@ -590,20 +582,20 @@ class MarkdownParser(BaseParser):
},
source_line=start_line + 1,
)
def _parse_paragraph(self) -> ContentBlock:
"""Parse a paragraph."""
lines = []
start_line = self._current_line
while self._current_line < len(self._lines):
line = self._lines[self._current_line]
stripped = line.strip()
# End of paragraph
if not stripped:
break
# Check for block-level elements
if stripped.startswith('#'):
break
@@ -619,45 +611,45 @@ class MarkdownParser(BaseParser):
break
if self._is_setext_header(self._current_line):
break
lines.append(stripped)
self._current_line += 1
content = ' '.join(lines)
# Process inline elements
content = self._process_inline(content)
return ContentBlock(
type=ContentBlockType.PARAGRAPH,
content=content,
source_line=start_line + 1,
)
def _process_inline(self, text: str) -> str:
"""Process inline Markdown elements."""
# Links [text](url)
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'[\1](\2)', text)
# Images ![alt](url)
text = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', r'![\1](\2)', text)
# Code `code`
text = re.sub(r'`([^`]+)`', r'`\1`', text)
# Bold **text** or __text__
text = re.sub(r'\*\*([^*]+)\*\*', r'**\1**', text)
text = re.sub(r'__([^_]+)__', r'**\1**', text)
# Italic *text* or _text_
text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'*\1*', text)
text = re.sub(r'(?<!_)_([^_]+)_(?!_)', r'*\1*', text)
# Strikethrough ~~text~~
text = re.sub(r'~~([^~]+)~~', r'~~\1~~', text)
return text
def _create_anchor(self, text: str) -> str:
"""Create URL anchor from heading text."""
anchor = text.lower()
@@ -665,7 +657,7 @@ class MarkdownParser(BaseParser):
anchor = anchor.replace(' ', '-')
anchor = re.sub(r'-+', '-', anchor)
return anchor.strip('-')
def _extract_specialized_content(self, document: Document):
"""Extract specialized content lists from blocks."""
for block in document.blocks:
@@ -674,19 +666,19 @@ class MarkdownParser(BaseParser):
heading_data = block.metadata.get('heading_data')
if heading_data:
document.headings.append(heading_data)
# Extract code blocks
elif block.type == ContentBlockType.CODE_BLOCK:
code_data = block.metadata.get('code_data')
if code_data:
document.code_blocks.append(code_data)
# Extract tables
elif block.type == ContentBlockType.TABLE:
table_data = block.metadata.get('table_data')
if table_data:
document.tables.append(table_data)
# Extract images from paragraphs (simplified)
elif block.type == ContentBlockType.PARAGRAPH:
content = block.content
@@ -698,7 +690,7 @@ class MarkdownParser(BaseParser):
source_line=block.source_line,
)
document.images.append(image)
# Extract links
link_matches = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content)
for text, url in link_matches:
@@ -709,14 +701,14 @@ class MarkdownParser(BaseParser):
ref_type = CrossRefType.EXTERNAL
else:
ref_type = CrossRefType.INTERNAL
xref = CrossReference(
ref_type=ref_type,
target=url,
text=text,
source_line=block.source_line,
)
if ref_type == CrossRefType.EXTERNAL:
document.external_links.append(xref)
else:

View File

@@ -5,7 +5,7 @@ Wraps PDFExtractor to provide unified Document output.
"""
from pathlib import Path
from typing import Any, Optional
from typing import Any
from .base_parser import BaseParser, ParseResult
from .quality_scorer import QualityScorer
@@ -14,7 +14,6 @@ from .unified_structure import (
ContentBlock,
ContentBlockType,
Document,
ExtractionStats,
Heading,
Image,
Table,
@@ -33,13 +32,13 @@ except ImportError:
class PdfParser(BaseParser):
"""
Parser for PDF documents.
Wraps the existing PDFExtractor to provide unified Document output
while maintaining all PDF-specific features (OCR, image extraction,
table extraction, etc.).
"""
def __init__(self, options: Optional[dict[str, Any]] = None):
def __init__(self, options: dict[str, Any] | None = None):
super().__init__(options)
self.pdf_options = {
"verbose": self.options.get("verbose", False),
@@ -71,7 +70,7 @@ class PdfParser(BaseParser):
def _parse_content(self, content: str, source_path: str) -> Document:
"""
Parse PDF content into Document.
Note: For PDF, we need the file path, not content string.
This method is mainly for API compatibility.
"""
@@ -83,10 +82,10 @@ class PdfParser(BaseParser):
def parse_file(self, path: str | Path) -> ParseResult:
"""
Parse a PDF file.
Args:
path: Path to PDF file
Returns:
ParseResult with Document or error info
"""
@@ -97,7 +96,7 @@ class PdfParser(BaseParser):
result.errors.append(f"File not found: {path}")
return result
if not path.suffix.lower() == ".pdf":
if path.suffix.lower() != ".pdf":
result.errors.append(f"Not a PDF file: {path}")
return result
@@ -127,7 +126,7 @@ class PdfParser(BaseParser):
# Convert to unified Document
document = self._convert_to_document(extraction_result, str(path))
result.document = document
result.success = True
result.warnings.extend(document.stats.warnings)
@@ -157,13 +156,13 @@ class PdfParser(BaseParser):
# Process pages
pages = extraction_result.get("pages", [])
for page_num, page_data in enumerate(pages):
# Add page heading
page_heading = f"Page {page_num + 1}"
if page_data.get("headings"):
page_heading = page_data["headings"][0].get("text", page_heading)
document.blocks.append(
ContentBlock(
type=ContentBlockType.HEADING,
@@ -200,7 +199,7 @@ class PdfParser(BaseParser):
source_line=page_num + 1,
)
document.code_blocks.append(code_block)
document.blocks.append(
ContentBlock(
type=ContentBlockType.CODE_BLOCK,
@@ -224,7 +223,7 @@ class PdfParser(BaseParser):
source_line=page_num + 1,
)
document.tables.append(table)
quality = self.quality_scorer.score_table(table)
document.blocks.append(
ContentBlock(
@@ -268,7 +267,7 @@ class PdfParser(BaseParser):
def parse(self, source: str | Path) -> ParseResult:
"""
Parse PDF from source.
For PDF files, source should be a file path.
"""
if isinstance(source, str) and Path(source).exists():

View File

@@ -8,14 +8,13 @@ Provides consistent quality scoring across all parsers for:
"""
import re
from typing import Optional
from .unified_structure import CodeBlock, Table, ContentBlock
from .unified_structure import Table, ContentBlock
class QualityScorer:
"""Score the quality of extracted content."""
# Language patterns for detection and validation
LANGUAGE_PATTERNS = {
'python': {
@@ -122,26 +121,26 @@ class QualityScorer:
],
},
}
def score_code_block(self, code: str, language: Optional[str] = None) -> float:
def score_code_block(self, code: str, language: str | None = None) -> float:
"""
Score a code block for quality (0-10).
Args:
code: The code content
language: Detected or specified language
Returns:
Quality score from 0-10
"""
score = 5.0 # Start neutral
if not code or not code.strip():
return 0.0
code = code.strip()
lines = [l for l in code.split('\n') if l.strip()]
lines = [line for line in code.split('\n') if line.strip()]
# Factor 1: Length appropriateness
code_len = len(code)
if 50 <= code_len <= 1000:
@@ -150,22 +149,22 @@ class QualityScorer:
score -= 1.0 # Too long
elif code_len < 20:
score -= 2.0 # Too short
# Factor 2: Line count
if 3 <= len(lines) <= 50:
score += 1.0
elif len(lines) > 100:
score -= 0.5
# Factor 3: Language-specific validation
if language and language in self.LANGUAGE_PATTERNS:
lang_patterns = self.LANGUAGE_PATTERNS[language]
# Check for keywords
keyword_matches = sum(1 for kw in lang_patterns['keywords'] if kw in code)
if keyword_matches >= 2:
score += 1.0
# Check for syntax patterns
syntax_matches = sum(
1 for pattern, _ in lang_patterns['syntax_checks']
@@ -173,27 +172,27 @@ class QualityScorer:
)
if syntax_matches >= 1:
score += 1.0
# Factor 4: Structural quality
# Check for function/class definitions
if re.search(r'\b(def|function|func|fn|class|public class)\b', code):
score += 1.5
# Check for meaningful variable names (not just x, y, i)
meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower())
if len(meaningful_vars) >= 3:
score += 0.5
# Factor 5: Syntax validation (generic)
is_valid, issues = self._validate_syntax(code, language)
if is_valid:
score += 1.0
else:
score -= len(issues) * 0.3
# Factor 6: Comment/code ratio
comment_lines = sum(
1 for line in lines
1 for line in lines
if line.strip().startswith(('#', '//', '/*', '*', '--', '<!--'))
)
if len(lines) > 0:
@@ -202,14 +201,14 @@ class QualityScorer:
score += 0.5 # Good comment ratio
elif comment_ratio > 0.6:
score -= 1.0 # Too many comments
# Clamp to 0-10
return max(0.0, min(10.0, score))
def _validate_syntax(self, code: str, language: Optional[str]) -> tuple[bool, list[str]]:
def _validate_syntax(self, code: str, language: str | None) -> tuple[bool, list[str]]:
"""Basic syntax validation."""
issues = []
# Check for balanced braces/brackets
pairs = [('{', '}'), ('[', ']'), ('(', ')')]
for open_char, close_char in pairs:
@@ -217,13 +216,13 @@ class QualityScorer:
close_count = code.count(close_char)
if abs(open_count - close_count) > 2:
issues.append(f"Unbalanced {open_char}{close_char}")
# Check for common natural language indicators
common_words = ['the', 'and', 'for', 'with', 'this', 'that', 'have', 'from', 'they']
word_count = sum(1 for word in common_words if f' {word} ' in code.lower())
if word_count > 5 and len(code.split()) < 100:
issues.append("May be natural language")
# Language-specific checks
if language == 'python':
# Check for mixed indentation
@@ -235,32 +234,32 @@ class QualityScorer:
indent_chars.add('tab')
if len(indent_chars) > 1:
issues.append("Mixed tabs and spaces")
elif language == 'json':
try:
import json
json.loads(code)
except Exception as e:
issues.append(f"Invalid JSON: {str(e)[:50]}")
return len(issues) == 0, issues
def score_table(self, table: Table) -> float:
"""
Score a table for quality (0-10).
Args:
table: The table to score
Returns:
Quality score from 0-10
"""
score = 5.0
# Factor 1: Has headers
if table.headers:
score += 1.0
# Factor 2: Consistent column count
if table.rows:
col_counts = [len(row) for row in table.rows]
@@ -268,18 +267,18 @@ class QualityScorer:
score += 1.0 # Consistent
else:
score -= 1.0 # Inconsistent
# Factor 3: Reasonable size
if 2 <= table.num_rows <= 100:
score += 0.5
elif table.num_rows > 500:
score -= 0.5
if 2 <= table.num_cols <= 10:
score += 0.5
elif table.num_cols > 20:
score -= 0.5
# Factor 4: Non-empty cells
if table.rows:
total_cells = sum(len(row) for row in table.rows)
@@ -290,72 +289,69 @@ class QualityScorer:
score += 1.0
elif empty_ratio > 0.5:
score -= 1.0
# Factor 5: Has caption (good for API docs)
if table.caption:
score += 0.5
return max(0.0, min(10.0, score))
def score_content_block(self, block: ContentBlock) -> float:
"""Score a generic content block."""
score = 5.0
content = block.content
if not content:
return 0.0
# Length check
if len(content) < 10:
score -= 2.0
elif len(content) > 1000:
score += 0.5
# Structure check
if '.' in content: # Has sentences
score += 0.5
if content[0].isupper(): # Starts with capital
score += 0.5
return max(0.0, min(10.0, score))
def detect_language(self, code: str) -> tuple[str, float]:
"""
Detect programming language from code.
Returns:
Tuple of (language, confidence)
"""
code = code.strip()
if not code:
return 'unknown', 0.0
scores = {}
for lang, patterns in self.LANGUAGE_PATTERNS.items():
score = 0.0
# Check keywords
keyword_hits = sum(1 for kw in patterns['keywords'] if kw in code)
score += keyword_hits * 0.5
# Check syntax patterns
for pattern, _ in patterns['syntax_checks']:
if re.search(pattern, code, re.MULTILINE):
score += 1.0
scores[lang] = score
if not scores:
return 'unknown', 0.0
best_lang = max(scores, key=scores.get)
best_score = scores[best_lang]
# Normalize confidence
if best_score >= 3:
confidence = min(1.0, best_score / 5)
else:
confidence = best_score / 10
confidence = min(1.0, best_score / 5) if best_score >= 3 else best_score / 10
return best_lang, confidence

File diff suppressed because it is too large Load Diff

View File

@@ -7,8 +7,8 @@ with a consistent structure.
"""
from dataclasses import dataclass, field
from typing import Any, Optional
from enum import Enum, auto
from typing import Any
from enum import Enum
class ContentBlockType(Enum):
@@ -76,20 +76,20 @@ class Heading:
"""A document heading/section title."""
level: int # 1-6 for h1-h6, or 1+ for RST underline levels
text: str
id: Optional[str] = None # Anchor ID
source_line: Optional[int] = None
id: str | None = None # Anchor ID
source_line: int | None = None
@dataclass
class CodeBlock:
"""A code block with metadata."""
code: str
language: Optional[str] = None
quality_score: Optional[float] = None # 0-10
confidence: Optional[float] = None # Language detection confidence
is_valid: Optional[bool] = None # Syntax validation result
language: str | None = None
quality_score: float | None = None # 0-10
confidence: float | None = None # Language detection confidence
is_valid: bool | None = None # Syntax validation result
validation_issues: list[str] = field(default_factory=list)
source_line: Optional[int] = None
source_line: int | None = None
metadata: dict[str, Any] = field(default_factory=dict)
@@ -97,11 +97,11 @@ class CodeBlock:
class Table:
"""A table with rows and cells."""
rows: list[list[str]] # 2D array of cell content
headers: Optional[list[str]] = None
caption: Optional[str] = None
col_widths: Optional[list[int]] = None
headers: list[str] | None = None
caption: str | None = None
col_widths: list[int] | None = None
source_format: str = "unknown" # 'simple', 'grid', 'list-table', 'markdown', 'pdf'
source_line: Optional[int] = None
source_line: int | None = None
metadata: dict[str, Any] = field(default_factory=dict)
@property
@@ -120,8 +120,8 @@ class CrossReference:
"""A cross-reference link."""
ref_type: CrossRefType
target: str # Target ID, URL, or path
text: Optional[str] = None # Display text (if different from target)
source_line: Optional[int] = None
text: str | None = None # Display text (if different from target)
source_line: int | None = None
resolved: bool = False # Whether target was resolved
@@ -129,9 +129,9 @@ class CrossReference:
class Field:
"""A field in a field list (RST :param:, :returns:, etc.)."""
name: str # Field name (e.g., 'param', 'returns', 'type')
arg: Optional[str] = None # Field argument (e.g., parameter name)
arg: str | None = None # Field argument (e.g., parameter name)
content: str = "" # Field content
source_line: Optional[int] = None
source_line: int | None = None
@dataclass
@@ -139,19 +139,19 @@ class DefinitionItem:
"""A definition list item (term + definition)."""
term: str
definition: str
classifier: Optional[str] = None # RST classifier (term : classifier)
source_line: Optional[int] = None
classifier: str | None = None # RST classifier (term : classifier)
source_line: int | None = None
@dataclass
class Image:
"""An image reference or embedded image."""
source: str # URL, path, or base64 data
alt_text: Optional[str] = None
width: Optional[int] = None
height: Optional[int] = None
alt_text: str | None = None
width: int | None = None
height: int | None = None
is_embedded: bool = False # True if data is embedded
source_line: Optional[int] = None
source_line: int | None = None
@dataclass
@@ -160,8 +160,8 @@ class ContentBlock:
type: ContentBlockType
content: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
source_line: Optional[int] = None
quality_score: Optional[float] = None # 0-10
source_line: int | None = None
quality_score: float | None = None # 0-10
# Type-specific data (stored in metadata for flexibility)
# For CODE_BLOCK: 'code_data' -> CodeBlock
@@ -183,71 +183,71 @@ class ExtractionStats:
cross_references: int = 0
images: int = 0
warnings: list[str] = field(default_factory=list)
processing_time_ms: Optional[float] = None
processing_time_ms: float | None = None
@dataclass
class Document:
"""
Unified document structure - output of ALL parsers.
This class provides a standardized representation of document content
regardless of the source format (RST, Markdown, PDF, HTML).
"""
title: str = ""
format: str = "" # 'markdown', 'rst', 'pdf', 'html', 'unknown'
source_path: str = ""
# Core content as blocks
blocks: list[ContentBlock] = field(default_factory=list)
# Navigation/Structure (derived from blocks for convenience)
headings: list[Heading] = field(default_factory=list)
sections: list[dict] = field(default_factory=list) # Hierarchical structure
# References
internal_links: list[CrossReference] = field(default_factory=list)
external_links: list[CrossReference] = field(default_factory=list)
# Specialized content (also in blocks, but extracted for easy access)
code_blocks: list[CodeBlock] = field(default_factory=list)
tables: list[Table] = field(default_factory=list)
images: list[Image] = field(default_factory=list)
# RST-specific (may be empty for other formats)
field_lists: list[list[Field]] = field(default_factory=list)
definition_lists: list[list[DefinitionItem]] = field(default_factory=list)
substitutions: dict[str, str] = field(default_factory=dict)
toc_trees: list[list[str]] = field(default_factory=list)
# Metadata
meta: dict[str, Any] = field(default_factory=dict)
# Extraction info
stats: ExtractionStats = field(default_factory=ExtractionStats)
def to_markdown(self, options: Optional[dict] = None) -> str:
def to_markdown(self, options: dict | None = None) -> str:
"""
Convert unified structure to markdown output.
Args:
options: Optional formatting options
- include_toc: bool = False
- max_heading_level: int = 6
- code_block_style: str = 'fenced' # or 'indented'
- table_style: str = 'github' # or 'simple'
Returns:
Markdown-formatted string
"""
from .formatters import MarkdownFormatter
formatter = MarkdownFormatter(options or {})
return formatter.format(self)
def to_skill_format(self) -> dict[str, Any]:
"""
Convert to skill-seekers internal format.
Returns:
Dictionary compatible with existing skill-seekers pipelines
"""
@@ -292,7 +292,7 @@ class Document:
"headings": self.stats.headings,
}
}
def _extract_content_text(self) -> str:
"""Extract plain text content from paragraphs."""
paragraphs = []
@@ -300,21 +300,21 @@ class Document:
if block.type == ContentBlockType.PARAGRAPH:
paragraphs.append(block.content)
return "\n\n".join(paragraphs)
def get_section_content(self, heading_text: str) -> list[ContentBlock]:
"""
Get all content blocks under a specific section heading.
Args:
heading_text: The section heading to find
Returns:
List of ContentBlock objects in that section
"""
result = []
in_section = False
section_level = None
for block in self.blocks:
if block.type == ContentBlockType.HEADING:
heading_data = block.metadata.get('heading_data')
@@ -325,29 +325,29 @@ class Document:
elif in_section and heading_data.level <= section_level:
# New section at same or higher level
break
if in_section:
result.append(block)
return result
def find_blocks_by_type(self, block_type: ContentBlockType) -> list[ContentBlock]:
"""Find all blocks of a specific type."""
return [b for b in self.blocks if b.type == block_type]
def find_code_by_language(self, language: str) -> list[CodeBlock]:
"""Find all code blocks in a specific language."""
return [cb for cb in self.code_blocks if cb.language == language]
def find_tables_by_caption(self, pattern: str) -> list[Table]:
"""Find tables with captions matching a pattern."""
import re
return [t for t in self.tables if t.caption and re.search(pattern, t.caption, re.I)]
def get_api_summary(self) -> dict[str, Any]:
"""
Extract API summary if this is API documentation.
Returns:
Dictionary with 'properties', 'methods', 'signals', etc.
"""
@@ -355,7 +355,7 @@ class Document:
properties_table = None
methods_table = None
signals_table = None
for table in self.tables:
if table.caption:
cap_lower = table.caption.lower()
@@ -365,21 +365,21 @@ class Document:
methods_table = table
elif 'signal' in cap_lower:
signals_table = table
return {
"properties": self._parse_api_table(properties_table) if properties_table else [],
"methods": self._parse_api_table(methods_table) if methods_table else [],
"signals": self._parse_api_table(signals_table) if signals_table else [],
}
def _parse_api_table(self, table: Optional[Table]) -> list[dict]:
def _parse_api_table(self, table: Table | None) -> list[dict]:
"""Parse an API table into structured data."""
if not table or not table.rows:
return []
results = []
headers = table.headers or []
for row in table.rows:
if len(row) >= 2:
item = {"name": row[0]}
@@ -387,25 +387,25 @@ class Document:
if i < len(row):
item[header.lower().replace(' ', '_')] = row[i]
results.append(item)
return results
def merge_documents(docs: list[Document]) -> Document:
"""
Merge multiple documents into one.
Useful for combining multiple source files into a single skill.
"""
if not docs:
return Document()
merged = Document(
title=docs[0].title,
format=docs[0].format,
source_path="merged",
)
for doc in docs:
merged.blocks.extend(doc.blocks)
merged.headings.extend(doc.headings)
@@ -418,12 +418,12 @@ def merge_documents(docs: list[Document]) -> Document:
merged.definition_lists.extend(doc.definition_lists)
merged.toc_trees.extend(doc.toc_trees)
merged.meta.update(doc.meta)
# Merge stats
merged.stats.total_blocks = sum(d.stats.total_blocks for d in docs)
merged.stats.code_blocks = sum(d.stats.code_blocks for d in docs)
merged.stats.tables = sum(d.stats.tables for d in docs)
merged.stats.headings = sum(d.stats.headings for d in docs)
merged.stats.cross_references = sum(d.stats.cross_references for d in docs)
return merged

View File

@@ -707,14 +707,14 @@ def main():
# Note: Runs independently of workflow system (they complement each other)
if getattr(args, "enhance_level", 0) > 0:
# Traditional AI enhancement (API or LOCAL mode)
logger.info("\n" + "=" * 80)
logger.info("🤖 Traditional AI Enhancement")
logger.info("=" * 80)
print("\n" + "=" * 80)
print("🤖 Traditional AI Enhancement")
print("=" * 80)
if workflow_executed:
logger.info(f" Running after workflow: {workflow_name}")
logger.info(" (Workflow provides specialized analysis, enhancement provides general improvements)")
logger.info(" (Use --enhance-workflow for more control)")
logger.info("")
print(f" Running after workflow: {workflow_name}")
print(" (Workflow provides specialized analysis, enhancement provides general improvements)")
print(" (Use --enhance-workflow for more control)")
print("")
# Note: PDF scraper uses enhance_level instead of enhance/enhance_local
# This is consistent with the new unified enhancement system

View File

@@ -25,7 +25,7 @@ import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Literal
from typing import Literal
logger = logging.getLogger(__name__)
@@ -166,10 +166,7 @@ class UnifiedEnhancer:
return items
# Get appropriate prompt
if custom_prompt:
prompt_template = custom_prompt
else:
prompt_template = self._get_default_prompt(enhancement_type)
prompt_template = custom_prompt or self._get_default_prompt(enhancement_type)
# Batch processing
batch_size = (

View File

@@ -571,7 +571,7 @@ class UnifiedScraper:
if file_patterns:
logger.info(f" File patterns: {', '.join(file_patterns)}")
results = analyze_codebase(
analyze_codebase(
directory=Path(local_path),
output_dir=temp_output,
depth=analysis_depth,

View File

@@ -91,7 +91,7 @@ def _validate_yaml(text: str) -> dict:
# ──────────────────────────────────────────────────────────────────────────────
def list_workflows_tool(args: dict) -> list:
def list_workflows_tool(_args: dict) -> list:
"""Return all workflows with name, description, and source."""
result: list[dict[str, str]] = []

View File

@@ -122,28 +122,28 @@ class TestCreateCommandArgvForwarding:
def _make_args(self, **kwargs):
import argparse
defaults = dict(
enhance_workflow=None,
enhance_stage=None,
var=None,
workflow_dry_run=False,
enhance_level=0,
output=None,
name=None,
description=None,
config=None,
api_key=None,
dry_run=False,
verbose=False,
quiet=False,
chunk_for_rag=False,
chunk_size=512,
chunk_overlap=50,
preset=None,
no_preserve_code_blocks=False,
no_preserve_paragraphs=False,
interactive_enhancement=False,
)
defaults = {
"enhance_workflow": None,
"enhance_stage": None,
"var": None,
"workflow_dry_run": False,
"enhance_level": 0,
"output": None,
"name": None,
"description": None,
"config": None,
"api_key": None,
"dry_run": False,
"verbose": False,
"quiet": False,
"chunk_for_rag": False,
"chunk_size": 512,
"chunk_overlap": 50,
"preset": None,
"no_preserve_code_blocks": False,
"no_preserve_paragraphs": False,
"interactive_enhancement": False,
}
defaults.update(kwargs)
return argparse.Namespace(**defaults)

View File

@@ -86,7 +86,7 @@ Basic usage:
.. code-block:: gdscript
extends Node
func _ready():
print("Hello, World!")
position = Vector2(100, 100)
@@ -414,7 +414,7 @@ def calculate_average(numbers):
def test_good_table_score(self):
"""Test quality score for good table."""
from skill_seekers.cli.parsers.extractors import QualityScorer, Table
from skill_seekers.cli.parsers.extractors import QualityScorer
scorer = QualityScorer()
good_table = Table(

View File

@@ -12,8 +12,7 @@ Covers:
"""
import argparse
import sys
from unittest.mock import MagicMock, patch, call
from unittest.mock import MagicMock, patch
import pytest
@@ -186,7 +185,7 @@ class TestRunWorkflowsMultiple:
m.workflow.description = "desc"
m.workflow.stages = []
# Track call order
m.run.side_effect = lambda *a, _n=wf_name, **kw: run_order.append(_n)
m.run.side_effect = lambda *_a, _n=wf_name, **_kw: run_order.append(_n)
engines.append(m)
with patch(
@@ -208,7 +207,7 @@ class TestRunWorkflowsMultiple:
good_engine.workflow.description = "desc"
good_engine.workflow.stages = []
def side_effect(name, **kwargs):
def side_effect(name, **_kwargs):
if name == "bad-workflow":
raise FileNotFoundError("not found")
return good_engine
@@ -341,9 +340,8 @@ class TestRunWorkflowsDryRun:
with patch(
"skill_seekers.cli.enhancement_workflow.WorkflowEngine",
return_value=mock_engine,
):
with pytest.raises(SystemExit) as exc:
run_workflows(args)
), pytest.raises(SystemExit) as exc:
run_workflows(args)
assert exc.value.code == 0
mock_engine.preview.assert_called_once()
@@ -366,9 +364,8 @@ class TestRunWorkflowsDryRun:
with patch(
"skill_seekers.cli.enhancement_workflow.WorkflowEngine",
side_effect=engines,
):
with pytest.raises(SystemExit):
run_workflows(args)
), pytest.raises(SystemExit):
run_workflows(args)
for engine in engines:
engine.preview.assert_called_once()

View File

@@ -9,7 +9,6 @@ Covers:
"""
import textwrap
from pathlib import Path
from unittest.mock import patch
import pytest
@@ -290,7 +289,7 @@ class TestDeleteWorkflowTool:
wf.write_text(MINIMAL_YAML, encoding="utf-8")
with _mock_bundled_names([]):
result = delete_workflow_tool({"name": "my-wf"})
delete_workflow_tool({"name": "my-wf"})
assert not wf.exists()

View File

@@ -10,11 +10,9 @@ Covers:
"""
import textwrap
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
import yaml
# Import the MODULE object (not just individual symbols) so we can patch it
# directly via patch.object(). This survives any sys.modules manipulation by
@@ -168,9 +166,8 @@ class TestCmdCopy:
assert dest.read_text(encoding="utf-8") == MINIMAL_YAML
def test_copy_nonexistent(self, capsys, tmp_user_dir):
with _mock_bundled_text({}):
with _mock_bundled([]):
rc = cmd_copy(["ghost-workflow"])
with _mock_bundled_text({}), _mock_bundled([]):
rc = cmd_copy(["ghost-workflow"])
assert rc == 1
assert "not found" in capsys.readouterr().err.lower()
@@ -403,9 +400,8 @@ class TestMain:
from skill_seekers.cli.workflows_command import main
# tmp_user_dir is empty; mock bundled to return nothing
with _mock_bundled([]):
with pytest.raises(SystemExit) as exc:
main(["list"])
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
main(["list"])
assert exc.value.code == 0
def test_main_validate_success(self, capsys, sample_yaml_file):
@@ -423,31 +419,27 @@ class TestMain:
assert "name: test-workflow" in capsys.readouterr().out
def test_main_show_not_found_exits_1(self, capsys, tmp_user_dir):
with patch.object(_wf_cmd, "_workflow_yaml_text", return_value=None):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["show", "ghost"])
with patch.object(_wf_cmd, "_workflow_yaml_text", return_value=None), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["show", "ghost"])
assert exc.value.code == 1
def test_main_copy_single(self, capsys, tmp_user_dir):
with _mock_bundled_text({"default": MINIMAL_YAML}):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["copy", "default"])
with _mock_bundled_text({"default": MINIMAL_YAML}), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["copy", "default"])
assert exc.value.code == 0
assert (tmp_user_dir / "default.yaml").exists()
def test_main_copy_multiple(self, capsys, tmp_user_dir):
texts = {"default": MINIMAL_YAML, "minimal": MINIMAL_YAML}
with _mock_bundled_text(texts):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["copy", "default", "minimal"])
with _mock_bundled_text(texts), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["copy", "default", "minimal"])
assert exc.value.code == 0
assert (tmp_user_dir / "default.yaml").exists()
assert (tmp_user_dir / "minimal.yaml").exists()
def test_main_copy_not_found_exits_1(self, capsys, tmp_user_dir):
with _mock_bundled_text({}), _mock_bundled([]):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["copy", "ghost"])
with _mock_bundled_text({}), _mock_bundled([]), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["copy", "ghost"])
assert exc.value.code == 1
def test_main_add_single_file(self, capsys, tmp_user_dir, sample_yaml_file):
@@ -484,32 +476,28 @@ class TestMain:
def test_main_remove_single(self, capsys, tmp_user_dir):
(tmp_user_dir / "my-wf.yaml").write_text(MINIMAL_YAML, encoding="utf-8")
with _mock_bundled([]):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "my-wf"])
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "my-wf"])
assert exc.value.code == 0
assert not (tmp_user_dir / "my-wf.yaml").exists()
def test_main_remove_multiple(self, capsys, tmp_user_dir):
(tmp_user_dir / "wf-a.yaml").write_text(MINIMAL_YAML, encoding="utf-8")
(tmp_user_dir / "wf-b.yaml").write_text(MINIMAL_YAML, encoding="utf-8")
with _mock_bundled([]):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "wf-a", "wf-b"])
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "wf-a", "wf-b"])
assert exc.value.code == 0
assert not (tmp_user_dir / "wf-a.yaml").exists()
assert not (tmp_user_dir / "wf-b.yaml").exists()
def test_main_remove_bundled_refused(self, capsys, tmp_user_dir):
with _mock_bundled(["default"]):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "default"])
with _mock_bundled(["default"]), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "default"])
assert exc.value.code == 1
def test_main_remove_not_found_exits_1(self, capsys, tmp_user_dir):
with _mock_bundled([]):
with pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "ghost"])
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
_wf_cmd.main(["remove", "ghost"])
assert exc.value.code == 1