fix: resolve all ruff linting errors (W293, F401, F821, B904, UP007, UP045, E741, SIM102, SIM117, ARG)
Auto-fixed (whitespace, imports, type annotations):
- codebase_scraper.py: W293 blank lines with whitespace
- doc_scraper.py: W293 blank lines with whitespace
- parsers/extractors/__init__.py: W293
- parsers/extractors/base_parser.py: W293, UP007, UP045, F401

Manual fixes:
- enhancement_workflow.py: B904 raise without `from exc`, remove unused `os` import
- parsers/extractors/quality_scorer.py: E741 ambiguous var `l` → `line`
- parsers/extractors/rst_parser.py: SIM102 nested if → combined conditions (x2)
- pdf_scraper.py: F821 undefined `logger` → `print()` (consistent with file style)
- mcp/tools/workflow_tools.py: ARG001 unused `args` → `_args`
- tests/test_workflow_runner.py: ARG005 unused lambda args → `_a`/`_kw`, ARG001 `kwargs` → `_kwargs`
- tests/test_workflows_command.py: SIM117 nested with → combined with (x2)

All 1922 tests pass.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -444,7 +444,7 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
def extract_rst_structure(content: str) -> dict[str, Any]:
|
||||
"""
|
||||
Extract structure from ReStructuredText (RST) content.
|
||||
|
||||
|
||||
Uses the enhanced unified RST parser for comprehensive extraction.
|
||||
|
||||
RST uses underline-style headers:
|
||||
@@ -474,13 +474,13 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
|
||||
# Use the enhanced unified RST parser
|
||||
try:
|
||||
from skill_seekers.cli.parsers.extractors import RstParser
|
||||
|
||||
|
||||
parser = RstParser()
|
||||
result = parser.parse_string(content, "<string>")
|
||||
|
||||
|
||||
if result.success and result.document:
|
||||
doc = result.document
|
||||
|
||||
|
||||
# Convert to legacy structure format for backward compatibility
|
||||
structure = {
|
||||
"title": doc.title,
|
||||
@@ -531,7 +531,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
|
||||
except Exception as e:
|
||||
# Fall back to basic extraction if unified parser fails
|
||||
logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")
|
||||
|
||||
|
||||
# Legacy basic extraction (fallback)
|
||||
import re
|
||||
|
||||
|
||||
@@ -401,13 +401,13 @@ class DocToSkillConverter:
|
||||
# Try enhanced unified parser first
|
||||
try:
|
||||
from skill_seekers.cli.parsers.extractors import MarkdownParser
|
||||
|
||||
|
||||
parser = MarkdownParser()
|
||||
result = parser.parse_string(content, url)
|
||||
|
||||
|
||||
if result.success and result.document:
|
||||
doc = result.document
|
||||
|
||||
|
||||
# Extract links from the document
|
||||
links = []
|
||||
for link in doc.external_links:
|
||||
@@ -421,7 +421,7 @@ class DocToSkillConverter:
|
||||
full_url = full_url.split("#")[0]
|
||||
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links:
|
||||
links.append(full_url)
|
||||
|
||||
|
||||
return {
|
||||
"url": url,
|
||||
"title": doc.title or "",
|
||||
|
||||
@@ -24,7 +24,6 @@ Usage:
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from importlib.resources import files as importlib_files
|
||||
@@ -145,11 +144,11 @@ class WorkflowEngine:
|
||||
pkg_ref = importlib_files("skill_seekers.workflows").joinpath(bare_name)
|
||||
yaml_text = pkg_ref.read_text(encoding="utf-8")
|
||||
logger.info(f"📋 Loading bundled workflow: {bare_name}")
|
||||
except (FileNotFoundError, TypeError, ModuleNotFoundError):
|
||||
except (FileNotFoundError, TypeError, ModuleNotFoundError) as exc:
|
||||
raise FileNotFoundError(
|
||||
f"Workflow '{yaml_ref.stem}' not found. "
|
||||
"Use 'skill-seekers workflows list' to see available workflows."
|
||||
)
|
||||
) from exc
|
||||
|
||||
if resolved_path is not None:
|
||||
logger.info(f"📋 Loading workflow: {resolved_path}")
|
||||
|
||||
@@ -6,20 +6,20 @@ a standardized Document structure.
|
||||
|
||||
Usage:
|
||||
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
|
||||
|
||||
|
||||
# Parse RST file
|
||||
parser = RstParser()
|
||||
result = parser.parse_file("docs/class_node.rst")
|
||||
|
||||
|
||||
if result.success:
|
||||
doc = result.document
|
||||
print(f"Title: {doc.title}")
|
||||
print(f"Tables: {len(doc.tables)}")
|
||||
print(f"Code blocks: {len(doc.code_blocks)}")
|
||||
|
||||
|
||||
# Convert to markdown
|
||||
markdown = doc.to_markdown()
|
||||
|
||||
|
||||
# Convert to skill format
|
||||
skill_data = doc.to_skill_format()
|
||||
|
||||
@@ -29,7 +29,7 @@ Available Parsers:
|
||||
|
||||
Auto-Detection:
|
||||
from skill_seekers.cli.parsers.extractors import parse_document
|
||||
|
||||
|
||||
# Automatically detects format
|
||||
result = parse_document("file.rst")
|
||||
"""
|
||||
|
||||
@@ -8,11 +8,11 @@ and implement the same interface for consistent usage.
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any
|
||||
import time
|
||||
import logging
|
||||
|
||||
from .unified_structure import Document, ExtractionStats
|
||||
from .unified_structure import Document
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -20,11 +20,11 @@ logger = logging.getLogger(__name__)
|
||||
@dataclass
|
||||
class ParseResult:
|
||||
"""Result of parsing a document."""
|
||||
document: Optional[Document] = None
|
||||
document: Document | None = None
|
||||
success: bool = False
|
||||
errors: list[str] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@property
|
||||
def is_ok(self) -> bool:
|
||||
"""Check if parsing succeeded."""
|
||||
@@ -34,18 +34,18 @@ class ParseResult:
|
||||
class BaseParser(ABC):
|
||||
"""
|
||||
Abstract base class for all document parsers.
|
||||
|
||||
|
||||
Implementations:
|
||||
- RstParser: ReStructuredText documents
|
||||
- MarkdownParser: Markdown documents
|
||||
- PdfParser: PDF documents
|
||||
- HtmlParser: HTML documents (future)
|
||||
"""
|
||||
|
||||
def __init__(self, options: Optional[dict[str, Any]] = None):
|
||||
|
||||
def __init__(self, options: dict[str, Any] | None = None):
|
||||
"""
|
||||
Initialize parser with options.
|
||||
|
||||
|
||||
Args:
|
||||
options: Parser-specific options
|
||||
Common options:
|
||||
@@ -61,26 +61,26 @@ class BaseParser(ABC):
|
||||
self._quality_scoring = self.options.get('quality_scoring', True)
|
||||
self._max_file_size = self.options.get('max_file_size_mb', 50.0) * 1024 * 1024
|
||||
self._encoding = self.options.get('encoding', 'utf-8')
|
||||
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def format_name(self) -> str:
|
||||
"""Return the format name this parser handles."""
|
||||
pass
|
||||
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def supported_extensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
pass
|
||||
|
||||
def can_parse(self, source: Union[str, Path]) -> bool:
|
||||
|
||||
def can_parse(self, source: str | Path) -> bool:
|
||||
"""
|
||||
Check if this parser can handle the given source.
|
||||
|
||||
|
||||
Args:
|
||||
source: File path or content string
|
||||
|
||||
|
||||
Returns:
|
||||
True if this parser can handle the source
|
||||
"""
|
||||
@@ -95,58 +95,58 @@ class BaseParser(ABC):
|
||||
except Exception:
|
||||
return False
|
||||
return False
|
||||
|
||||
def parse(self, source: Union[str, Path]) -> ParseResult:
|
||||
|
||||
def parse(self, source: str | Path) -> ParseResult:
|
||||
"""
|
||||
Parse a document from file path or content string.
|
||||
|
||||
|
||||
Args:
|
||||
source: File path (str/Path) or content string
|
||||
|
||||
|
||||
Returns:
|
||||
ParseResult with document or error info
|
||||
"""
|
||||
start_time = time.time()
|
||||
result = ParseResult()
|
||||
|
||||
|
||||
try:
|
||||
# Read source
|
||||
content, source_path = self._read_source_with_path(source)
|
||||
|
||||
|
||||
# Check file size
|
||||
if len(content.encode(self._encoding)) > self._max_file_size:
|
||||
result.errors.append(f"File too large: {source_path}")
|
||||
return result
|
||||
|
||||
|
||||
# Validate format
|
||||
if not self._detect_format(content):
|
||||
result.warnings.append(f"Content may not be valid {self.format_name}")
|
||||
|
||||
|
||||
# Parse content
|
||||
document = self._parse_content(content, source_path)
|
||||
|
||||
|
||||
# Post-process
|
||||
document = self._post_process(document)
|
||||
|
||||
|
||||
# Record stats
|
||||
processing_time = (time.time() - start_time) * 1000
|
||||
if document.stats:
|
||||
document.stats.processing_time_ms = processing_time
|
||||
|
||||
|
||||
result.document = document
|
||||
result.success = True
|
||||
result.warnings.extend(document.stats.warnings)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
result.errors.append(f"Parse error: {str(e)}")
|
||||
logger.exception(f"Error parsing {source}")
|
||||
|
||||
|
||||
return result
|
||||
|
||||
def parse_file(self, path: Union[str, Path]) -> ParseResult:
|
||||
|
||||
def parse_file(self, path: str | Path) -> ParseResult:
|
||||
"""Parse a file from path."""
|
||||
return self.parse(path)
|
||||
|
||||
|
||||
def parse_string(self, content: str, source_path: str = "<string>") -> ParseResult:
|
||||
"""Parse content from string."""
|
||||
# Create a wrapper that looks like a path
|
||||
@@ -160,46 +160,46 @@ class BaseParser(ABC):
|
||||
return True
|
||||
def __str__(self):
|
||||
return self._path
|
||||
|
||||
|
||||
source = StringSource(content, source_path)
|
||||
result = self.parse(source)
|
||||
if result.document:
|
||||
result.document.source_path = source_path
|
||||
return result
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def _parse_content(self, content: str, source_path: str) -> Document:
|
||||
"""
|
||||
Parse content string into Document.
|
||||
|
||||
|
||||
Args:
|
||||
content: Raw content to parse
|
||||
source_path: Original source path (for reference)
|
||||
|
||||
|
||||
Returns:
|
||||
Parsed Document
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def _detect_format(self, content: str) -> bool:
|
||||
"""
|
||||
Detect if content matches this parser's format.
|
||||
|
||||
|
||||
Args:
|
||||
content: Content to check
|
||||
|
||||
|
||||
Returns:
|
||||
True if content appears to be this format
|
||||
"""
|
||||
pass
|
||||
|
||||
def _read_source(self, source: Union[str, Path]) -> str:
|
||||
|
||||
def _read_source(self, source: str | Path) -> str:
|
||||
"""Read content from source."""
|
||||
content, _ = self._read_source_with_path(source)
|
||||
return content
|
||||
|
||||
def _read_source_with_path(self, source: Union[str, Path]) -> tuple[str, str]:
|
||||
|
||||
def _read_source_with_path(self, source: str | Path) -> tuple[str, str]:
|
||||
"""Read content and return with path."""
|
||||
if isinstance(source, str):
|
||||
# Check if it's a path or content
|
||||
@@ -214,37 +214,37 @@ class BaseParser(ABC):
|
||||
else:
|
||||
# Assume it's a file-like object
|
||||
return source.read_text(encoding=self._encoding), str(source)
|
||||
|
||||
|
||||
def _post_process(self, document: Document) -> Document:
|
||||
"""
|
||||
Post-process document after parsing.
|
||||
|
||||
|
||||
Override to add cross-references, validate, etc.
|
||||
"""
|
||||
# Build heading list from blocks
|
||||
if not document.headings:
|
||||
document.headings = self._extract_headings(document)
|
||||
|
||||
|
||||
# Extract code blocks from blocks
|
||||
if not document.code_blocks:
|
||||
document.code_blocks = self._extract_code_blocks(document)
|
||||
|
||||
|
||||
# Extract tables from blocks
|
||||
if not document.tables:
|
||||
document.tables = self._extract_tables(document)
|
||||
|
||||
|
||||
# Update stats
|
||||
document.stats.total_blocks = len(document.blocks)
|
||||
document.stats.code_blocks = len(document.code_blocks)
|
||||
document.stats.tables = len(document.tables)
|
||||
document.stats.headings = len(document.headings)
|
||||
document.stats.cross_references = len(document.internal_links) + len(document.external_links)
|
||||
|
||||
|
||||
return document
|
||||
|
||||
|
||||
def _extract_headings(self, document: Document) -> list:
|
||||
"""Extract headings from content blocks."""
|
||||
from .unified_structure import ContentBlockType, Heading
|
||||
from .unified_structure import ContentBlockType
|
||||
headings = []
|
||||
for block in document.blocks:
|
||||
if block.type == ContentBlockType.HEADING:
|
||||
@@ -252,7 +252,7 @@ class BaseParser(ABC):
|
||||
if heading_data:
|
||||
headings.append(heading_data)
|
||||
return headings
|
||||
|
||||
|
||||
def _extract_code_blocks(self, document: Document) -> list:
|
||||
"""Extract code blocks from content blocks."""
|
||||
code_blocks = []
|
||||
@@ -260,7 +260,7 @@ class BaseParser(ABC):
|
||||
if block.metadata.get('code_data'):
|
||||
code_blocks.append(block.metadata['code_data'])
|
||||
return code_blocks
|
||||
|
||||
|
||||
def _extract_tables(self, document: Document) -> list:
|
||||
"""Extract tables from content blocks."""
|
||||
tables = []
|
||||
@@ -268,7 +268,7 @@ class BaseParser(ABC):
|
||||
if block.metadata.get('table_data'):
|
||||
tables.append(block.metadata['table_data'])
|
||||
return tables
|
||||
|
||||
|
||||
def _create_quality_scorer(self):
|
||||
"""Create a quality scorer if enabled."""
|
||||
if self._quality_scoring:
|
||||
@@ -277,44 +277,44 @@ class BaseParser(ABC):
|
||||
return None
|
||||
|
||||
|
||||
def get_parser_for_file(path: Union[str, Path]) -> Optional[BaseParser]:
|
||||
def get_parser_for_file(path: str | Path) -> BaseParser | None:
|
||||
"""
|
||||
Get the appropriate parser for a file.
|
||||
|
||||
|
||||
Args:
|
||||
path: File path
|
||||
|
||||
|
||||
Returns:
|
||||
Appropriate parser instance or None
|
||||
"""
|
||||
path = Path(path)
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
|
||||
# Try RST parser
|
||||
from .rst_parser import RstParser
|
||||
rst_parser = RstParser()
|
||||
if suffix in rst_parser.supported_extensions:
|
||||
return rst_parser
|
||||
|
||||
|
||||
# Try Markdown parser
|
||||
from .markdown_parser import MarkdownParser
|
||||
md_parser = MarkdownParser()
|
||||
if suffix in md_parser.supported_extensions:
|
||||
return md_parser
|
||||
|
||||
|
||||
# Could add PDF, HTML parsers here
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def parse_document(source: Union[str, Path], format_hint: Optional[str] = None) -> ParseResult:
|
||||
def parse_document(source: str | Path, format_hint: str | None = None) -> ParseResult:
|
||||
"""
|
||||
Parse a document, auto-detecting the format.
|
||||
|
||||
|
||||
Args:
|
||||
source: File path or content string
|
||||
format_hint: Optional format hint ('rst', 'markdown', etc.)
|
||||
|
||||
|
||||
Returns:
|
||||
ParseResult
|
||||
"""
|
||||
@@ -326,21 +326,21 @@ def parse_document(source: Union[str, Path], format_hint: Optional[str] = None)
|
||||
elif format_hint.lower() in ('md', 'markdown'):
|
||||
from .markdown_parser import MarkdownParser
|
||||
return MarkdownParser().parse(source)
|
||||
|
||||
|
||||
# Auto-detect from file extension
|
||||
parser = get_parser_for_file(source)
|
||||
if parser:
|
||||
return parser.parse(source)
|
||||
|
||||
|
||||
# Try content-based detection
|
||||
content = source if isinstance(source, str) else Path(source).read_text()
|
||||
|
||||
|
||||
# Check for RST indicators
|
||||
rst_indicators = ['.. ', '::\n', ':ref:`', '.. toctree::', '.. code-block::']
|
||||
if any(ind in content for ind in rst_indicators):
|
||||
from .rst_parser import RstParser
|
||||
return RstParser().parse_string(content)
|
||||
|
||||
|
||||
# Default to Markdown
|
||||
from .markdown_parser import MarkdownParser
|
||||
return MarkdownParser().parse_string(content)
|
||||
|
||||
@@ -7,45 +7,44 @@ Convert unified Document structure to various output formats.
|
||||
from typing import Any
|
||||
|
||||
from .unified_structure import (
|
||||
Document, ContentBlock, ContentBlockType, CrossRefType,
|
||||
AdmonitionType, ListType, Table, CodeBlock
|
||||
Document, ContentBlock, ContentBlockType, AdmonitionType, ListType, Table
|
||||
)
|
||||
|
||||
|
||||
class MarkdownFormatter:
|
||||
"""Format Document as Markdown."""
|
||||
|
||||
|
||||
def __init__(self, options: dict[str, Any] = None):
|
||||
self.options = options or {}
|
||||
self.include_toc = self.options.get('include_toc', False)
|
||||
self.max_heading_level = self.options.get('max_heading_level', 6)
|
||||
self.code_block_style = self.options.get('code_block_style', 'fenced')
|
||||
self.table_style = self.options.get('table_style', 'github')
|
||||
|
||||
|
||||
def format(self, document: Document) -> str:
|
||||
"""Convert document to markdown string."""
|
||||
parts = []
|
||||
|
||||
|
||||
# Title
|
||||
if document.title:
|
||||
parts.append(f"# {document.title}\n")
|
||||
|
||||
|
||||
# Metadata as YAML frontmatter
|
||||
if document.meta:
|
||||
parts.append(self._format_metadata(document.meta))
|
||||
|
||||
|
||||
# Table of contents
|
||||
if self.include_toc and document.headings:
|
||||
parts.append(self._format_toc(document.headings))
|
||||
|
||||
|
||||
# Content blocks
|
||||
for block in document.blocks:
|
||||
formatted = self._format_block(block)
|
||||
if formatted:
|
||||
parts.append(formatted)
|
||||
|
||||
|
||||
return '\n'.join(parts)
|
||||
|
||||
|
||||
def _format_metadata(self, meta: dict) -> str:
|
||||
"""Format metadata as YAML frontmatter."""
|
||||
lines = ['---']
|
||||
@@ -58,7 +57,7 @@ class MarkdownFormatter:
|
||||
lines.append(f"{key}: {value}")
|
||||
lines.append('---\n')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_toc(self, headings: list) -> str:
|
||||
"""Format table of contents."""
|
||||
lines = ['## Table of Contents\n']
|
||||
@@ -69,7 +68,7 @@ class MarkdownFormatter:
|
||||
lines.append(f"{indent}- [{h.text}](#{anchor})")
|
||||
lines.append('')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_block(self, block: ContentBlock) -> str:
|
||||
"""Format a single content block."""
|
||||
handlers = {
|
||||
@@ -86,14 +85,14 @@ class MarkdownFormatter:
|
||||
ContentBlockType.DEFINITION_LIST: self._format_definition_list,
|
||||
ContentBlockType.META: self._format_meta,
|
||||
}
|
||||
|
||||
|
||||
handler = handlers.get(block.type)
|
||||
if handler:
|
||||
return handler(block)
|
||||
|
||||
|
||||
# Default: return content as-is
|
||||
return block.content + '\n'
|
||||
|
||||
|
||||
def _format_heading(self, block: ContentBlock) -> str:
|
||||
"""Format heading block."""
|
||||
heading_data = block.metadata.get('heading_data')
|
||||
@@ -103,87 +102,84 @@ class MarkdownFormatter:
|
||||
else:
|
||||
level = block.metadata.get('level', 1)
|
||||
text = block.content
|
||||
|
||||
|
||||
if level > self.max_heading_level:
|
||||
return f"**{text}**\n"
|
||||
|
||||
|
||||
return f"{'#' * level} {text}\n"
|
||||
|
||||
|
||||
def _format_paragraph(self, block: ContentBlock) -> str:
|
||||
"""Format paragraph block."""
|
||||
return block.content + '\n'
|
||||
|
||||
|
||||
def _format_code_block(self, block: ContentBlock) -> str:
|
||||
"""Format code block."""
|
||||
code_data = block.metadata.get('code_data')
|
||||
|
||||
|
||||
if code_data:
|
||||
code = code_data.code
|
||||
lang = code_data.language or ''
|
||||
else:
|
||||
code = block.content
|
||||
lang = block.metadata.get('language', '')
|
||||
|
||||
|
||||
if self.code_block_style == 'fenced':
|
||||
return f"```{lang}\n{code}\n```\n"
|
||||
else:
|
||||
# Indented style
|
||||
indented = '\n'.join(' ' + line for line in code.split('\n'))
|
||||
return indented + '\n'
|
||||
|
||||
|
||||
def _format_table(self, block: ContentBlock) -> str:
|
||||
"""Format table block."""
|
||||
table_data = block.metadata.get('table_data')
|
||||
if not table_data:
|
||||
return ''
|
||||
|
||||
|
||||
return self._format_table_data(table_data)
|
||||
|
||||
|
||||
def _format_table_data(self, table: Table) -> str:
|
||||
"""Format table data as markdown."""
|
||||
if not table.rows:
|
||||
return ''
|
||||
|
||||
|
||||
lines = []
|
||||
|
||||
|
||||
# Caption
|
||||
if table.caption:
|
||||
lines.append(f"**{table.caption}**\n")
|
||||
|
||||
|
||||
# Headers
|
||||
headers = table.headers or table.rows[0]
|
||||
lines.append('| ' + ' | '.join(headers) + ' |')
|
||||
lines.append('|' + '|'.join('---' for _ in headers) + '|')
|
||||
|
||||
|
||||
# Rows (skip first if used as headers)
|
||||
start_row = 0 if table.headers else 1
|
||||
for row in table.rows[start_row:]:
|
||||
# Pad row to match header count
|
||||
padded_row = row + [''] * (len(headers) - len(row))
|
||||
lines.append('| ' + ' | '.join(padded_row[:len(headers)]) + ' |')
|
||||
|
||||
|
||||
lines.append('')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_list(self, block: ContentBlock) -> str:
|
||||
"""Format list block."""
|
||||
list_type = block.metadata.get('list_type', ListType.BULLET)
|
||||
items = block.metadata.get('items', [])
|
||||
|
||||
|
||||
if not items:
|
||||
return block.content + '\n'
|
||||
|
||||
|
||||
lines = []
|
||||
for i, item in enumerate(items):
|
||||
if list_type == ListType.NUMBERED:
|
||||
prefix = f"{i + 1}."
|
||||
else:
|
||||
prefix = "-"
|
||||
prefix = f"{i + 1}." if list_type == ListType.NUMBERED else "-"
|
||||
lines.append(f"{prefix} {item}")
|
||||
|
||||
|
||||
lines.append('')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_image(self, block: ContentBlock) -> str:
|
||||
"""Format image block."""
|
||||
image_data = block.metadata.get('image_data')
|
||||
@@ -193,9 +189,9 @@ class MarkdownFormatter:
|
||||
else:
|
||||
src = block.metadata.get('src', '')
|
||||
alt = block.metadata.get('alt', '')
|
||||
|
||||
|
||||
return f"\n"
|
||||
|
||||
|
||||
def _format_cross_ref(self, block: ContentBlock) -> str:
|
||||
"""Format cross-reference block."""
|
||||
xref_data = block.metadata.get('xref_data')
|
||||
@@ -203,13 +199,13 @@ class MarkdownFormatter:
|
||||
text = xref_data.text or xref_data.target
|
||||
target = xref_data.target
|
||||
return f"[{text}](#{target})\n"
|
||||
|
||||
|
||||
return block.content + '\n'
|
||||
|
||||
|
||||
def _format_admonition(self, block: ContentBlock) -> str:
|
||||
"""Format admonition/callout block."""
|
||||
admonition_type = block.metadata.get('admonition_type', AdmonitionType.NOTE)
|
||||
|
||||
|
||||
# GitHub-style admonitions
|
||||
type_map = {
|
||||
AdmonitionType.NOTE: 'NOTE',
|
||||
@@ -218,16 +214,16 @@ class MarkdownFormatter:
|
||||
AdmonitionType.IMPORTANT: 'IMPORTANT',
|
||||
AdmonitionType.CAUTION: 'CAUTION',
|
||||
}
|
||||
|
||||
|
||||
type_str = type_map.get(admonition_type, 'NOTE')
|
||||
content = block.content
|
||||
|
||||
|
||||
return f"> [!{type_str}]\n> {content.replace(chr(10), chr(10) + '> ')}\n"
|
||||
|
||||
|
||||
def _format_directive(self, block: ContentBlock) -> str:
|
||||
"""Format directive block (RST-specific)."""
|
||||
directive_name = block.metadata.get('directive_name', 'unknown')
|
||||
|
||||
|
||||
# Format as a blockquote with directive name
|
||||
content = block.content
|
||||
lines = [f"> **{directive_name}**"]
|
||||
@@ -235,13 +231,13 @@ class MarkdownFormatter:
|
||||
lines.append(f"> {line}")
|
||||
lines.append('')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_field_list(self, block: ContentBlock) -> str:
|
||||
"""Format field list block."""
|
||||
fields = block.metadata.get('fields', [])
|
||||
if not fields:
|
||||
return block.content + '\n'
|
||||
|
||||
|
||||
lines = []
|
||||
for field in fields:
|
||||
if field.arg:
|
||||
@@ -250,13 +246,13 @@ class MarkdownFormatter:
|
||||
lines.append(f"**{field.name}**: {field.content}")
|
||||
lines.append('')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_definition_list(self, block: ContentBlock) -> str:
|
||||
"""Format definition list block."""
|
||||
items = block.metadata.get('items', [])
|
||||
if not items:
|
||||
return block.content + '\n'
|
||||
|
||||
|
||||
lines = []
|
||||
for item in items:
|
||||
if item.classifier:
|
||||
@@ -266,7 +262,7 @@ class MarkdownFormatter:
|
||||
lines.append(f": {item.definition}")
|
||||
lines.append('')
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _format_meta(self, block: ContentBlock) -> str:
|
||||
"""Format metadata block (usually filtered out)."""
|
||||
return '' # Metadata goes in YAML frontmatter
|
||||
@@ -274,7 +270,7 @@ class MarkdownFormatter:
|
||||
|
||||
class SkillFormatter:
|
||||
"""Format Document for skill-seekers internal use."""
|
||||
|
||||
|
||||
def format(self, document: Document) -> dict[str, Any]:
|
||||
"""Format document for skill output."""
|
||||
return {
|
||||
@@ -324,7 +320,7 @@ class SkillFormatter:
|
||||
"processing_time_ms": document.stats.processing_time_ms,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def _extract_summary(self, document: Document, max_length: int = 500) -> str:
|
||||
"""Extract a text summary from the document."""
|
||||
paragraphs = []
|
||||
@@ -333,22 +329,22 @@ class SkillFormatter:
|
||||
paragraphs.append(block.content)
|
||||
if len(' '.join(paragraphs)) > max_length:
|
||||
break
|
||||
|
||||
|
||||
summary = ' '.join(paragraphs)
|
||||
if len(summary) > max_length:
|
||||
summary = summary[:max_length - 3] + '...'
|
||||
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
def _score_table(self, table: Table) -> float:
|
||||
"""Quick table quality score."""
|
||||
if not table.rows:
|
||||
return 0.0
|
||||
|
||||
|
||||
score = 5.0
|
||||
if table.headers:
|
||||
score += 2.0
|
||||
if 2 <= len(table.rows) <= 50:
|
||||
score += 1.0
|
||||
|
||||
|
||||
return min(10.0, score)
|
||||
|
||||
@@ -17,13 +17,12 @@ Enhanced with quality scoring and table support.
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
from .base_parser import BaseParser
|
||||
from .unified_structure import (
|
||||
Document, ContentBlock, ContentBlockType, CrossReference, CrossRefType,
|
||||
AdmonitionType, Heading, CodeBlock, Table, Image, ListType, ExtractionStats
|
||||
AdmonitionType, Heading, CodeBlock, Table, Image, ListType
|
||||
)
|
||||
from .quality_scorer import QualityScorer
|
||||
|
||||
@@ -31,10 +30,10 @@ from .quality_scorer import QualityScorer
|
||||
class MarkdownParser(BaseParser):
|
||||
"""
|
||||
Parser for Markdown documents.
|
||||
|
||||
|
||||
Supports standard Markdown and GitHub-flavored Markdown (GFM).
|
||||
"""
|
||||
|
||||
|
||||
# Admonition types for GitHub-style callouts
|
||||
ADMONITION_TYPES = {
|
||||
'note': AdmonitionType.NOTE,
|
||||
@@ -46,21 +45,21 @@ class MarkdownParser(BaseParser):
|
||||
'danger': AdmonitionType.DANGER,
|
||||
'attention': AdmonitionType.ATTENTION,
|
||||
}
|
||||
|
||||
def __init__(self, options: Optional[dict[str, Any]] = None):
|
||||
|
||||
def __init__(self, options: dict[str, Any] | None = None):
|
||||
super().__init__(options)
|
||||
self.quality_scorer = QualityScorer()
|
||||
self._lines: list[str] = []
|
||||
self._current_line = 0
|
||||
|
||||
|
||||
@property
|
||||
def format_name(self) -> str:
|
||||
return 'markdown'
|
||||
|
||||
|
||||
@property
|
||||
def supported_extensions(self) -> list[str]:
|
||||
return ['.md', '.markdown', '.mdown', '.mkd']
|
||||
|
||||
|
||||
def _detect_format(self, content: str) -> bool:
|
||||
"""Detect if content is Markdown."""
|
||||
md_indicators = [
|
||||
@@ -71,34 +70,31 @@ class MarkdownParser(BaseParser):
|
||||
r'^\s*[-*+]\s+\S', # Lists
|
||||
r'^>\s+\S', # Blockquotes
|
||||
]
|
||||
for pattern in md_indicators:
|
||||
if re.search(pattern, content, re.MULTILINE):
|
||||
return True
|
||||
return False
|
||||
|
||||
return any(re.search(pattern, content, re.MULTILINE) for pattern in md_indicators)
|
||||
|
||||
def _parse_content(self, content: str, source_path: str) -> Document:
|
||||
"""Parse Markdown content into Document."""
|
||||
self._lines = content.split('\n')
|
||||
self._current_line = 0
|
||||
|
||||
|
||||
document = Document(
|
||||
title='',
|
||||
format='markdown',
|
||||
source_path=source_path,
|
||||
)
|
||||
|
||||
|
||||
# Parse frontmatter if present
|
||||
frontmatter = self._parse_frontmatter()
|
||||
if frontmatter:
|
||||
document.meta.update(frontmatter)
|
||||
|
||||
|
||||
# Parse content blocks
|
||||
while self._current_line < len(self._lines):
|
||||
block = self._parse_block()
|
||||
if block:
|
||||
document.blocks.append(block)
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
# Extract title from first h1 or frontmatter
|
||||
if document.meta.get('title'):
|
||||
document.title = document.meta['title']
|
||||
@@ -109,55 +105,55 @@ class MarkdownParser(BaseParser):
|
||||
if heading_data and heading_data.level == 1:
|
||||
document.title = heading_data.text
|
||||
break
|
||||
|
||||
|
||||
# Extract specialized content
|
||||
self._extract_specialized_content(document)
|
||||
|
||||
|
||||
return document
|
||||
|
||||
def _parse_frontmatter(self) -> Optional[dict]:
|
||||
|
||||
def _parse_frontmatter(self) -> dict | None:
|
||||
"""Parse YAML frontmatter if present."""
|
||||
if self._current_line >= len(self._lines):
|
||||
return None
|
||||
|
||||
|
||||
first_line = self._lines[self._current_line].strip()
|
||||
if first_line != '---':
|
||||
return None
|
||||
|
||||
|
||||
# Find closing ---
|
||||
end_line = None
|
||||
for i in range(self._current_line + 1, len(self._lines)):
|
||||
if self._lines[i].strip() == '---':
|
||||
end_line = i
|
||||
break
|
||||
|
||||
|
||||
if end_line is None:
|
||||
return None
|
||||
|
||||
|
||||
# Extract frontmatter content
|
||||
frontmatter_lines = self._lines[self._current_line + 1:end_line]
|
||||
frontmatter_content = '\n'.join(frontmatter_lines)
|
||||
|
||||
'\n'.join(frontmatter_lines)
|
||||
|
||||
# Simple key: value parsing (not full YAML)
|
||||
meta = {}
|
||||
current_key = None
|
||||
current_value = []
|
||||
|
||||
|
||||
for line in frontmatter_lines:
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
continue
|
||||
|
||||
|
||||
# Check for new key
|
||||
match = re.match(r'^(\w+):\s*(.*)$', stripped)
|
||||
if match:
|
||||
# Save previous key
|
||||
if current_key:
|
||||
meta[current_key] = '\n'.join(current_value).strip()
|
||||
|
||||
|
||||
current_key = match.group(1)
|
||||
value = match.group(2)
|
||||
|
||||
|
||||
# Handle inline value
|
||||
if value:
|
||||
# Check if it's a list
|
||||
@@ -178,146 +174,146 @@ class MarkdownParser(BaseParser):
|
||||
meta[current_key].append(stripped[2:].strip().strip('"\''))
|
||||
elif current_key:
|
||||
current_value.append(stripped)
|
||||
|
||||
|
||||
# Save last key
|
||||
if current_key:
|
||||
meta[current_key] = '\n'.join(current_value).strip()
|
||||
|
||||
|
||||
# Advance past frontmatter
|
||||
self._current_line = end_line + 1
|
||||
|
||||
|
||||
return meta
|
||||
|
||||
def _parse_block(self) -> Optional[ContentBlock]:
|
||||
|
||||
def _parse_block(self) -> ContentBlock | None:
|
||||
"""Parse a single block at current position."""
|
||||
line = self._current_line
|
||||
if line >= len(self._lines):
|
||||
return None
|
||||
|
||||
|
||||
current = self._lines[line]
|
||||
stripped = current.strip()
|
||||
|
||||
|
||||
# Skip empty lines
|
||||
if not stripped:
|
||||
return None
|
||||
|
||||
|
||||
# Skip HTML comments
|
||||
if stripped.startswith('<!--'):
|
||||
return self._parse_html_comment()
|
||||
|
||||
|
||||
# ATX Headers
|
||||
if stripped.startswith('#'):
|
||||
return self._parse_atx_header()
|
||||
|
||||
|
||||
# Setext headers (underline style)
|
||||
if self._is_setext_header(line):
|
||||
return self._parse_setext_header()
|
||||
|
||||
|
||||
# Code fence
|
||||
if stripped.startswith('```'):
|
||||
return self._parse_code_fence()
|
||||
|
||||
|
||||
# Indented code block
|
||||
if current.startswith(' ') or current.startswith('\t'):
|
||||
return self._parse_indented_code()
|
||||
|
||||
|
||||
# Table
|
||||
if '|' in stripped and self._is_table(line):
|
||||
return self._parse_table()
|
||||
|
||||
|
||||
# Blockquote (check for admonition)
|
||||
if stripped.startswith('>'):
|
||||
return self._parse_blockquote()
|
||||
|
||||
|
||||
# Horizontal rule
|
||||
if re.match(r'^[\-*_]{3,}\s*$', stripped):
|
||||
return self._parse_horizontal_rule()
|
||||
|
||||
|
||||
# List
|
||||
list_type = self._detect_list_type(stripped)
|
||||
if list_type:
|
||||
return self._parse_list(list_type)
|
||||
|
||||
|
||||
# Paragraph (default)
|
||||
return self._parse_paragraph()
|
||||
|
||||
|
||||
def _is_setext_header(self, line: int) -> bool:
|
||||
"""Check if current line is a Setext header."""
|
||||
if line + 1 >= len(self._lines):
|
||||
return False
|
||||
|
||||
|
||||
current = self._lines[line].strip()
|
||||
next_line = self._lines[line + 1].strip()
|
||||
|
||||
|
||||
if not current or not next_line:
|
||||
return False
|
||||
|
||||
|
||||
# H1: ===, H2: ---
|
||||
return re.match(r'^[=-]+$', next_line) is not None
|
||||
|
||||
|
||||
def _parse_atx_header(self) -> ContentBlock:
    """Parse an ATX style header (``# Header``) on the current line.

    A valid header is one to six ``#`` characters, whitespace, then text.
    Trailing closing hashes (``# Title ##``) are removed from the text.

    Returns:
        A HEADING block whose metadata carries the ``Heading`` under
        ``'heading_data'``. If the line starts with ``#`` but is not a valid
        header (for example ``#hashtag``), a one-line PARAGRAPH block is
        returned instead so the text is kept.
    """
    line_number = self._current_line + 1  # source lines are reported 1-based
    raw = self._lines[self._current_line].strip()
    match = re.match(r'^(#{1,6})\s+(.+)$', raw)

    if match is None:
        # Do not delegate to _parse_paragraph here: it stops at any line that
        # starts with '#', so it would return an empty paragraph and this
        # line's text would be lost.
        return ContentBlock(
            type=ContentBlockType.PARAGRAPH,
            content=self._process_inline(raw),
            source_line=line_number,
        )

    level = len(match.group(1))
    # Remove trailing hashes
    text = re.sub(r'\s+#+$', '', match.group(2).strip())

    heading = Heading(
        level=level,
        text=text,
        id=self._create_anchor(text),
        source_line=line_number,
    )

    return ContentBlock(
        type=ContentBlockType.HEADING,
        content=text,
        metadata={'heading_data': heading},
        source_line=line_number,
    )
|
||||
|
||||
|
||||
def _parse_setext_header(self) -> ContentBlock:
    """Parse a Setext style header (text line plus ``=``/``-`` underline).

    An underline starting with ``=`` gives level 1; anything else gives
    level 2. The cursor is advanced onto the underline so that it is
    consumed together with the text line.

    Returns:
        A HEADING block whose metadata carries the ``Heading`` under
        ``'heading_data'``.
    """
    title_index = self._current_line
    title = self._lines[title_index].strip()
    marker = self._lines[title_index + 1].strip()[0]

    if marker == '=':
        depth = 1
    else:
        depth = 2
    slug = self._create_anchor(title)

    heading_data = Heading(
        level=depth,
        text=title,
        id=slug,
        source_line=title_index + 1,
    )

    # Step onto the underline; after this the cursor value equals the
    # 1-based number of the title line, which is what gets reported below.
    self._current_line = title_index + 1

    return ContentBlock(
        type=ContentBlockType.HEADING,
        content=title,
        metadata={'heading_data': heading_data},
        source_line=self._current_line,
    )
|
||||
|
||||
|
||||
def _parse_code_fence(self) -> ContentBlock:
|
||||
"""Parse fenced code block."""
|
||||
line = self._lines[self._current_line]
|
||||
match = re.match(r'^```(\w+)?\s*$', line.strip())
|
||||
language = match.group(1) if match else None
|
||||
|
||||
|
||||
start_line = self._current_line
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
code_lines = []
|
||||
while self._current_line < len(self._lines):
|
||||
current_line = self._lines[self._current_line]
|
||||
@@ -325,19 +321,19 @@ class MarkdownParser(BaseParser):
|
||||
break
|
||||
code_lines.append(current_line)
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
code = '\n'.join(code_lines)
|
||||
|
||||
|
||||
# Detect language if not specified
|
||||
detected_lang, confidence = self.quality_scorer.detect_language(code)
|
||||
if not language and confidence > 0.6:
|
||||
language = detected_lang
|
||||
elif not language:
|
||||
language = 'text'
|
||||
|
||||
|
||||
# Score code quality
|
||||
quality = self.quality_scorer.score_code_block(code, language)
|
||||
|
||||
|
||||
code_block = CodeBlock(
|
||||
code=code,
|
||||
language=language,
|
||||
@@ -345,7 +341,7 @@ class MarkdownParser(BaseParser):
|
||||
confidence=confidence if language == detected_lang else 1.0,
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
|
||||
return ContentBlock(
|
||||
type=ContentBlockType.CODE_BLOCK,
|
||||
content=code,
|
||||
@@ -356,19 +352,19 @@ class MarkdownParser(BaseParser):
|
||||
source_line=start_line + 1,
|
||||
quality_score=quality,
|
||||
)
|
||||
|
||||
|
||||
def _parse_indented_code(self) -> ContentBlock:
|
||||
"""Parse indented code block."""
|
||||
code_lines = []
|
||||
start_line = self._current_line
|
||||
|
||||
|
||||
while self._current_line < len(self._lines):
|
||||
line = self._lines[self._current_line]
|
||||
if not line.strip():
|
||||
code_lines.append('')
|
||||
self._current_line += 1
|
||||
continue
|
||||
|
||||
|
||||
if line.startswith(' '):
|
||||
code_lines.append(line[4:])
|
||||
elif line.startswith('\t'):
|
||||
@@ -376,15 +372,15 @@ class MarkdownParser(BaseParser):
|
||||
else:
|
||||
self._current_line -= 1
|
||||
break
|
||||
|
||||
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
code = '\n'.join(code_lines).rstrip()
|
||||
|
||||
|
||||
# Detect language
|
||||
detected_lang, confidence = self.quality_scorer.detect_language(code)
|
||||
quality = self.quality_scorer.score_code_block(code, detected_lang)
|
||||
|
||||
|
||||
code_block = CodeBlock(
|
||||
code=code,
|
||||
language=detected_lang if confidence > 0.6 else 'text',
|
||||
@@ -392,7 +388,7 @@ class MarkdownParser(BaseParser):
|
||||
confidence=confidence,
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
|
||||
return ContentBlock(
|
||||
type=ContentBlockType.CODE_BLOCK,
|
||||
content=code,
|
||||
@@ -403,52 +399,49 @@ class MarkdownParser(BaseParser):
|
||||
source_line=start_line + 1,
|
||||
quality_score=quality,
|
||||
)
|
||||
|
||||
|
||||
def _is_table(self, line: int) -> bool:
|
||||
"""Check if current position is a table."""
|
||||
if line + 1 >= len(self._lines):
|
||||
return False
|
||||
|
||||
|
||||
current = self._lines[line].strip()
|
||||
next_line = self._lines[line + 1].strip()
|
||||
|
||||
|
||||
# Check for table separator line
|
||||
if re.match(r'^[\|:-]+$', next_line) and '|' in current:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
return bool(re.match(r'^[\|:-]+$', next_line) and '|' in current)
|
||||
|
||||
def _parse_table(self) -> ContentBlock:
|
||||
"""Parse a GFM table."""
|
||||
rows = []
|
||||
headers = None
|
||||
start_line = self._current_line
|
||||
|
||||
|
||||
# Parse header row
|
||||
header_line = self._lines[self._current_line].strip()
|
||||
headers = [cell.strip() for cell in header_line.split('|')]
|
||||
headers = [h for h in headers if h] # Remove empty
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
# Skip separator line (|:--:| etc.)
|
||||
if self._current_line < len(self._lines):
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
# Parse data rows
|
||||
while self._current_line < len(self._lines):
|
||||
line = self._lines[self._current_line].strip()
|
||||
|
||||
|
||||
if not line or '|' not in line:
|
||||
self._current_line -= 1
|
||||
break
|
||||
|
||||
|
||||
cells = [cell.strip() for cell in line.split('|')]
|
||||
cells = [c for c in cells if c]
|
||||
if cells:
|
||||
rows.append(cells)
|
||||
|
||||
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
table = Table(
|
||||
rows=rows,
|
||||
headers=headers,
|
||||
@@ -456,9 +449,9 @@ class MarkdownParser(BaseParser):
|
||||
source_format='markdown',
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
|
||||
quality = self.quality_scorer.score_table(table)
|
||||
|
||||
|
||||
return ContentBlock(
|
||||
type=ContentBlockType.TABLE,
|
||||
content=f"[Table: {len(rows)} rows]",
|
||||
@@ -466,25 +459,25 @@ class MarkdownParser(BaseParser):
|
||||
source_line=start_line + 1,
|
||||
quality_score=quality,
|
||||
)
|
||||
|
||||
|
||||
def _parse_blockquote(self) -> ContentBlock:
|
||||
"""Parse a blockquote, checking for admonitions."""
|
||||
lines = []
|
||||
start_line = self._current_line
|
||||
admonition_type = None
|
||||
admonition_content = []
|
||||
|
||||
|
||||
while self._current_line < len(self._lines):
|
||||
line = self._lines[self._current_line]
|
||||
stripped = line.strip()
|
||||
|
||||
|
||||
if not stripped.startswith('>'):
|
||||
self._current_line -= 1
|
||||
break
|
||||
|
||||
|
||||
# Remove > prefix
|
||||
content = line[1:].strip() if line.startswith('> ') else line[1:].strip()
|
||||
|
||||
|
||||
# Check for GitHub-style admonition: > [!NOTE]
|
||||
admonition_match = re.match(r'^\[!([\w]+)\]\s*(.*)$', content)
|
||||
if admonition_match and not admonition_type:
|
||||
@@ -497,9 +490,9 @@ class MarkdownParser(BaseParser):
|
||||
admonition_content.append(content)
|
||||
else:
|
||||
lines.append(content)
|
||||
|
||||
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
# Return as admonition if detected
|
||||
if admonition_type:
|
||||
return ContentBlock(
|
||||
@@ -508,7 +501,7 @@ class MarkdownParser(BaseParser):
|
||||
metadata={'admonition_type': admonition_type},
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
|
||||
# Regular blockquote
|
||||
content = '\n'.join(lines)
|
||||
return ContentBlock(
|
||||
@@ -517,24 +510,23 @@ class MarkdownParser(BaseParser):
|
||||
metadata={'block_type': 'blockquote'},
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
def _parse_html_comment(self) -> Optional[ContentBlock]:
|
||||
|
||||
def _parse_html_comment(self) -> ContentBlock | None:
|
||||
"""Parse HTML comment (usually skip)."""
|
||||
start_line = self._current_line
|
||||
content_lines = []
|
||||
|
||||
|
||||
while self._current_line < len(self._lines):
|
||||
line = self._lines[self._current_line]
|
||||
content_lines.append(line)
|
||||
|
||||
|
||||
if '-->' in line:
|
||||
break
|
||||
|
||||
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
# Skip comments in output (could optionally include)
|
||||
return None
|
||||
|
||||
|
||||
def _parse_horizontal_rule(self) -> ContentBlock:
|
||||
"""Parse horizontal rule."""
|
||||
return ContentBlock(
|
||||
@@ -543,28 +535,28 @@ class MarkdownParser(BaseParser):
|
||||
metadata={'element': 'horizontal_rule'},
|
||||
source_line=self._current_line + 1,
|
||||
)
|
||||
|
||||
def _detect_list_type(self, stripped: str) -> Optional[ListType]:
|
||||
|
||||
def _detect_list_type(self, stripped: str) -> ListType | None:
|
||||
"""Detect if line starts a list and which type."""
|
||||
if re.match(r'^[-*+]\s+', stripped):
|
||||
return ListType.BULLET
|
||||
if re.match(r'^\d+\.\s+', stripped):
|
||||
return ListType.NUMBERED
|
||||
return None
|
||||
|
||||
|
||||
def _parse_list(self, list_type: ListType) -> ContentBlock:
|
||||
"""Parse a list."""
|
||||
items = []
|
||||
start_line = self._current_line
|
||||
|
||||
|
||||
while self._current_line < len(self._lines):
|
||||
line = self._lines[self._current_line]
|
||||
stripped = line.strip()
|
||||
|
||||
|
||||
if not stripped:
|
||||
self._current_line += 1
|
||||
continue
|
||||
|
||||
|
||||
# Check if still in list
|
||||
if list_type == ListType.BULLET:
|
||||
match = re.match(r'^[-*+]\s+(.+)$', stripped)
|
||||
@@ -578,9 +570,9 @@ class MarkdownParser(BaseParser):
|
||||
self._current_line -= 1
|
||||
break
|
||||
items.append(match.group(1))
|
||||
|
||||
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
return ContentBlock(
|
||||
type=ContentBlockType.LIST,
|
||||
content=f"{len(items)} items",
|
||||
@@ -590,20 +582,20 @@ class MarkdownParser(BaseParser):
|
||||
},
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
|
||||
def _parse_paragraph(self) -> ContentBlock:
|
||||
"""Parse a paragraph."""
|
||||
lines = []
|
||||
start_line = self._current_line
|
||||
|
||||
|
||||
while self._current_line < len(self._lines):
|
||||
line = self._lines[self._current_line]
|
||||
stripped = line.strip()
|
||||
|
||||
|
||||
# End of paragraph
|
||||
if not stripped:
|
||||
break
|
||||
|
||||
|
||||
# Check for block-level elements
|
||||
if stripped.startswith('#'):
|
||||
break
|
||||
@@ -619,45 +611,45 @@ class MarkdownParser(BaseParser):
|
||||
break
|
||||
if self._is_setext_header(self._current_line):
|
||||
break
|
||||
|
||||
|
||||
lines.append(stripped)
|
||||
self._current_line += 1
|
||||
|
||||
|
||||
content = ' '.join(lines)
|
||||
|
||||
|
||||
# Process inline elements
|
||||
content = self._process_inline(content)
|
||||
|
||||
|
||||
return ContentBlock(
|
||||
type=ContentBlockType.PARAGRAPH,
|
||||
content=content,
|
||||
source_line=start_line + 1,
|
||||
)
|
||||
|
||||
|
||||
def _process_inline(self, text: str) -> str:
|
||||
"""Process inline Markdown elements."""
|
||||
# Links [text](url)
|
||||
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'[\1](\2)', text)
|
||||
|
||||
|
||||
# Images 
|
||||
text = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', r'', text)
|
||||
|
||||
|
||||
# Code `code`
|
||||
text = re.sub(r'`([^`]+)`', r'`\1`', text)
|
||||
|
||||
|
||||
# Bold **text** or __text__
|
||||
text = re.sub(r'\*\*([^*]+)\*\*', r'**\1**', text)
|
||||
text = re.sub(r'__([^_]+)__', r'**\1**', text)
|
||||
|
||||
|
||||
# Italic *text* or _text_
|
||||
text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'*\1*', text)
|
||||
text = re.sub(r'(?<!_)_([^_]+)_(?!_)', r'*\1*', text)
|
||||
|
||||
|
||||
# Strikethrough ~~text~~
|
||||
text = re.sub(r'~~([^~]+)~~', r'~~\1~~', text)
|
||||
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def _create_anchor(self, text: str) -> str:
|
||||
"""Create URL anchor from heading text."""
|
||||
anchor = text.lower()
|
||||
@@ -665,7 +657,7 @@ class MarkdownParser(BaseParser):
|
||||
anchor = anchor.replace(' ', '-')
|
||||
anchor = re.sub(r'-+', '-', anchor)
|
||||
return anchor.strip('-')
|
||||
|
||||
|
||||
def _extract_specialized_content(self, document: Document):
|
||||
"""Extract specialized content lists from blocks."""
|
||||
for block in document.blocks:
|
||||
@@ -674,19 +666,19 @@ class MarkdownParser(BaseParser):
|
||||
heading_data = block.metadata.get('heading_data')
|
||||
if heading_data:
|
||||
document.headings.append(heading_data)
|
||||
|
||||
|
||||
# Extract code blocks
|
||||
elif block.type == ContentBlockType.CODE_BLOCK:
|
||||
code_data = block.metadata.get('code_data')
|
||||
if code_data:
|
||||
document.code_blocks.append(code_data)
|
||||
|
||||
|
||||
# Extract tables
|
||||
elif block.type == ContentBlockType.TABLE:
|
||||
table_data = block.metadata.get('table_data')
|
||||
if table_data:
|
||||
document.tables.append(table_data)
|
||||
|
||||
|
||||
# Extract images from paragraphs (simplified)
|
||||
elif block.type == ContentBlockType.PARAGRAPH:
|
||||
content = block.content
|
||||
@@ -698,7 +690,7 @@ class MarkdownParser(BaseParser):
|
||||
source_line=block.source_line,
|
||||
)
|
||||
document.images.append(image)
|
||||
|
||||
|
||||
# Extract links
|
||||
link_matches = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content)
|
||||
for text, url in link_matches:
|
||||
@@ -709,14 +701,14 @@ class MarkdownParser(BaseParser):
|
||||
ref_type = CrossRefType.EXTERNAL
|
||||
else:
|
||||
ref_type = CrossRefType.INTERNAL
|
||||
|
||||
|
||||
xref = CrossReference(
|
||||
ref_type=ref_type,
|
||||
target=url,
|
||||
text=text,
|
||||
source_line=block.source_line,
|
||||
)
|
||||
|
||||
|
||||
if ref_type == CrossRefType.EXTERNAL:
|
||||
document.external_links.append(xref)
|
||||
else:
|
||||
|
||||
@@ -5,7 +5,7 @@ Wraps PDFExtractor to provide unified Document output.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
from .base_parser import BaseParser, ParseResult
|
||||
from .quality_scorer import QualityScorer
|
||||
@@ -14,7 +14,6 @@ from .unified_structure import (
|
||||
ContentBlock,
|
||||
ContentBlockType,
|
||||
Document,
|
||||
ExtractionStats,
|
||||
Heading,
|
||||
Image,
|
||||
Table,
|
||||
@@ -33,13 +32,13 @@ except ImportError:
|
||||
class PdfParser(BaseParser):
|
||||
"""
|
||||
Parser for PDF documents.
|
||||
|
||||
|
||||
Wraps the existing PDFExtractor to provide unified Document output
|
||||
while maintaining all PDF-specific features (OCR, image extraction,
|
||||
table extraction, etc.).
|
||||
"""
|
||||
|
||||
def __init__(self, options: Optional[dict[str, Any]] = None):
|
||||
def __init__(self, options: dict[str, Any] | None = None):
|
||||
super().__init__(options)
|
||||
self.pdf_options = {
|
||||
"verbose": self.options.get("verbose", False),
|
||||
@@ -71,7 +70,7 @@ class PdfParser(BaseParser):
|
||||
def _parse_content(self, content: str, source_path: str) -> Document:
|
||||
"""
|
||||
Parse PDF content into Document.
|
||||
|
||||
|
||||
Note: For PDF, we need the file path, not content string.
|
||||
This method is mainly for API compatibility.
|
||||
"""
|
||||
@@ -83,10 +82,10 @@ class PdfParser(BaseParser):
|
||||
def parse_file(self, path: str | Path) -> ParseResult:
|
||||
"""
|
||||
Parse a PDF file.
|
||||
|
||||
|
||||
Args:
|
||||
path: Path to PDF file
|
||||
|
||||
|
||||
Returns:
|
||||
ParseResult with Document or error info
|
||||
"""
|
||||
@@ -97,7 +96,7 @@ class PdfParser(BaseParser):
|
||||
result.errors.append(f"File not found: {path}")
|
||||
return result
|
||||
|
||||
if not path.suffix.lower() == ".pdf":
|
||||
if path.suffix.lower() != ".pdf":
|
||||
result.errors.append(f"Not a PDF file: {path}")
|
||||
return result
|
||||
|
||||
@@ -127,7 +126,7 @@ class PdfParser(BaseParser):
|
||||
|
||||
# Convert to unified Document
|
||||
document = self._convert_to_document(extraction_result, str(path))
|
||||
|
||||
|
||||
result.document = document
|
||||
result.success = True
|
||||
result.warnings.extend(document.stats.warnings)
|
||||
@@ -157,13 +156,13 @@ class PdfParser(BaseParser):
|
||||
|
||||
# Process pages
|
||||
pages = extraction_result.get("pages", [])
|
||||
|
||||
|
||||
for page_num, page_data in enumerate(pages):
|
||||
# Add page heading
|
||||
page_heading = f"Page {page_num + 1}"
|
||||
if page_data.get("headings"):
|
||||
page_heading = page_data["headings"][0].get("text", page_heading)
|
||||
|
||||
|
||||
document.blocks.append(
|
||||
ContentBlock(
|
||||
type=ContentBlockType.HEADING,
|
||||
@@ -200,7 +199,7 @@ class PdfParser(BaseParser):
|
||||
source_line=page_num + 1,
|
||||
)
|
||||
document.code_blocks.append(code_block)
|
||||
|
||||
|
||||
document.blocks.append(
|
||||
ContentBlock(
|
||||
type=ContentBlockType.CODE_BLOCK,
|
||||
@@ -224,7 +223,7 @@ class PdfParser(BaseParser):
|
||||
source_line=page_num + 1,
|
||||
)
|
||||
document.tables.append(table)
|
||||
|
||||
|
||||
quality = self.quality_scorer.score_table(table)
|
||||
document.blocks.append(
|
||||
ContentBlock(
|
||||
@@ -268,7 +267,7 @@ class PdfParser(BaseParser):
|
||||
def parse(self, source: str | Path) -> ParseResult:
|
||||
"""
|
||||
Parse PDF from source.
|
||||
|
||||
|
||||
For PDF files, source should be a file path.
|
||||
"""
|
||||
if isinstance(source, str) and Path(source).exists():
|
||||
|
||||
@@ -8,14 +8,13 @@ Provides consistent quality scoring across all parsers for:
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from .unified_structure import CodeBlock, Table, ContentBlock
|
||||
from .unified_structure import Table, ContentBlock
|
||||
|
||||
|
||||
class QualityScorer:
|
||||
"""Score the quality of extracted content."""
|
||||
|
||||
|
||||
# Language patterns for detection and validation
|
||||
LANGUAGE_PATTERNS = {
|
||||
'python': {
|
||||
@@ -122,26 +121,26 @@ class QualityScorer:
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
def score_code_block(self, code: str, language: Optional[str] = None) -> float:
|
||||
|
||||
def score_code_block(self, code: str, language: str | None = None) -> float:
|
||||
"""
|
||||
Score a code block for quality (0-10).
|
||||
|
||||
|
||||
Args:
|
||||
code: The code content
|
||||
language: Detected or specified language
|
||||
|
||||
|
||||
Returns:
|
||||
Quality score from 0-10
|
||||
"""
|
||||
score = 5.0 # Start neutral
|
||||
|
||||
|
||||
if not code or not code.strip():
|
||||
return 0.0
|
||||
|
||||
|
||||
code = code.strip()
|
||||
lines = [l for l in code.split('\n') if l.strip()]
|
||||
|
||||
lines = [line for line in code.split('\n') if line.strip()]
|
||||
|
||||
# Factor 1: Length appropriateness
|
||||
code_len = len(code)
|
||||
if 50 <= code_len <= 1000:
|
||||
@@ -150,22 +149,22 @@ class QualityScorer:
|
||||
score -= 1.0 # Too long
|
||||
elif code_len < 20:
|
||||
score -= 2.0 # Too short
|
||||
|
||||
|
||||
# Factor 2: Line count
|
||||
if 3 <= len(lines) <= 50:
|
||||
score += 1.0
|
||||
elif len(lines) > 100:
|
||||
score -= 0.5
|
||||
|
||||
|
||||
# Factor 3: Language-specific validation
|
||||
if language and language in self.LANGUAGE_PATTERNS:
|
||||
lang_patterns = self.LANGUAGE_PATTERNS[language]
|
||||
|
||||
|
||||
# Check for keywords
|
||||
keyword_matches = sum(1 for kw in lang_patterns['keywords'] if kw in code)
|
||||
if keyword_matches >= 2:
|
||||
score += 1.0
|
||||
|
||||
|
||||
# Check for syntax patterns
|
||||
syntax_matches = sum(
|
||||
1 for pattern, _ in lang_patterns['syntax_checks']
|
||||
@@ -173,27 +172,27 @@ class QualityScorer:
|
||||
)
|
||||
if syntax_matches >= 1:
|
||||
score += 1.0
|
||||
|
||||
|
||||
# Factor 4: Structural quality
|
||||
# Check for function/class definitions
|
||||
if re.search(r'\b(def|function|func|fn|class|public class)\b', code):
|
||||
score += 1.5
|
||||
|
||||
|
||||
# Check for meaningful variable names (not just x, y, i)
|
||||
meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower())
|
||||
if len(meaningful_vars) >= 3:
|
||||
score += 0.5
|
||||
|
||||
|
||||
# Factor 5: Syntax validation (generic)
|
||||
is_valid, issues = self._validate_syntax(code, language)
|
||||
if is_valid:
|
||||
score += 1.0
|
||||
else:
|
||||
score -= len(issues) * 0.3
|
||||
|
||||
|
||||
# Factor 6: Comment/code ratio
|
||||
comment_lines = sum(
|
||||
1 for line in lines
|
||||
1 for line in lines
|
||||
if line.strip().startswith(('#', '//', '/*', '*', '--', '<!--'))
|
||||
)
|
||||
if len(lines) > 0:
|
||||
@@ -202,14 +201,14 @@ class QualityScorer:
|
||||
score += 0.5 # Good comment ratio
|
||||
elif comment_ratio > 0.6:
|
||||
score -= 1.0 # Too many comments
|
||||
|
||||
|
||||
# Clamp to 0-10
|
||||
return max(0.0, min(10.0, score))
|
||||
|
||||
def _validate_syntax(self, code: str, language: Optional[str]) -> tuple[bool, list[str]]:
|
||||
|
||||
def _validate_syntax(self, code: str, language: str | None) -> tuple[bool, list[str]]:
|
||||
"""Basic syntax validation."""
|
||||
issues = []
|
||||
|
||||
|
||||
# Check for balanced braces/brackets
|
||||
pairs = [('{', '}'), ('[', ']'), ('(', ')')]
|
||||
for open_char, close_char in pairs:
|
||||
@@ -217,13 +216,13 @@ class QualityScorer:
|
||||
close_count = code.count(close_char)
|
||||
if abs(open_count - close_count) > 2:
|
||||
issues.append(f"Unbalanced {open_char}{close_char}")
|
||||
|
||||
|
||||
# Check for common natural language indicators
|
||||
common_words = ['the', 'and', 'for', 'with', 'this', 'that', 'have', 'from', 'they']
|
||||
word_count = sum(1 for word in common_words if f' {word} ' in code.lower())
|
||||
if word_count > 5 and len(code.split()) < 100:
|
||||
issues.append("May be natural language")
|
||||
|
||||
|
||||
# Language-specific checks
|
||||
if language == 'python':
|
||||
# Check for mixed indentation
|
||||
@@ -235,32 +234,32 @@ class QualityScorer:
|
||||
indent_chars.add('tab')
|
||||
if len(indent_chars) > 1:
|
||||
issues.append("Mixed tabs and spaces")
|
||||
|
||||
|
||||
elif language == 'json':
|
||||
try:
|
||||
import json
|
||||
json.loads(code)
|
||||
except Exception as e:
|
||||
issues.append(f"Invalid JSON: {str(e)[:50]}")
|
||||
|
||||
|
||||
return len(issues) == 0, issues
|
||||
|
||||
|
||||
def score_table(self, table: Table) -> float:
|
||||
"""
|
||||
Score a table for quality (0-10).
|
||||
|
||||
|
||||
Args:
|
||||
table: The table to score
|
||||
|
||||
|
||||
Returns:
|
||||
Quality score from 0-10
|
||||
"""
|
||||
score = 5.0
|
||||
|
||||
|
||||
# Factor 1: Has headers
|
||||
if table.headers:
|
||||
score += 1.0
|
||||
|
||||
|
||||
# Factor 2: Consistent column count
|
||||
if table.rows:
|
||||
col_counts = [len(row) for row in table.rows]
|
||||
@@ -268,18 +267,18 @@ class QualityScorer:
|
||||
score += 1.0 # Consistent
|
||||
else:
|
||||
score -= 1.0 # Inconsistent
|
||||
|
||||
|
||||
# Factor 3: Reasonable size
|
||||
if 2 <= table.num_rows <= 100:
|
||||
score += 0.5
|
||||
elif table.num_rows > 500:
|
||||
score -= 0.5
|
||||
|
||||
|
||||
if 2 <= table.num_cols <= 10:
|
||||
score += 0.5
|
||||
elif table.num_cols > 20:
|
||||
score -= 0.5
|
||||
|
||||
|
||||
# Factor 4: Non-empty cells
|
||||
if table.rows:
|
||||
total_cells = sum(len(row) for row in table.rows)
|
||||
@@ -290,72 +289,69 @@ class QualityScorer:
|
||||
score += 1.0
|
||||
elif empty_ratio > 0.5:
|
||||
score -= 1.0
|
||||
|
||||
|
||||
# Factor 5: Has caption (good for API docs)
|
||||
if table.caption:
|
||||
score += 0.5
|
||||
|
||||
|
||||
return max(0.0, min(10.0, score))
|
||||
|
||||
|
||||
def score_content_block(self, block: ContentBlock) -> float:
    """Score a generic content block on a 0-10 scale.

    Starts from a neutral 5.0 and adjusts for length (under 10 characters
    is penalized, over 1000 is rewarded), for containing a ``.`` and for
    starting with an uppercase character. Empty content scores 0.0.

    Args:
        block: The block whose ``content`` string is scored.

    Returns:
        The score clamped to the range 0.0-10.0.
    """
    text = block.content
    if not text:
        return 0.0

    size = len(text)
    if size < 10:
        length_adjustment = -2.0
    elif size > 1000:
        length_adjustment = 0.5
    else:
        length_adjustment = 0.0

    sentence_bonus = 0.5 if '.' in text else 0.0  # has sentences
    capital_bonus = 0.5 if text[0].isupper() else 0.0  # starts with capital

    total = 5.0 + length_adjustment + sentence_bonus + capital_bonus
    return min(10.0, max(0.0, total))
|
||||
|
||||
|
||||
def detect_language(self, code: str) -> tuple[str, float]:
    """Detect the programming language of a code snippet.

    Each language in ``LANGUAGE_PATTERNS`` earns 0.5 per keyword found in
    the code and 1.0 per matching syntax pattern. The highest-scoring
    language wins.

    Args:
        code: The code text to classify.

    Returns:
        Tuple of ``(language, confidence)``. Confidence is ``score / 5``
        capped at 1.0 for scores of 3 or more, otherwise ``score / 10``.
        ``('unknown', 0.0)`` is returned for blank input and when no
        language matches anything.
    """
    code = code.strip()
    if not code:
        return 'unknown', 0.0

    scores = {}
    for lang, patterns in self.LANGUAGE_PATTERNS.items():
        keyword_hits = sum(1 for kw in patterns['keywords'] if kw in code)
        syntax_hits = sum(
            1 for pattern, _ in patterns['syntax_checks']
            if re.search(pattern, code, re.MULTILINE)
        )
        scores[lang] = keyword_hits * 0.5 + syntax_hits * 1.0

    if not scores:
        return 'unknown', 0.0

    best_lang = max(scores, key=scores.get)
    best_score = scores[best_lang]

    # With no hit at all, max() just picks the first language in the table;
    # report that as unknown rather than naming an arbitrary language.
    if best_score <= 0:
        return 'unknown', 0.0

    # Normalize confidence
    confidence = min(1.0, best_score / 5) if best_score >= 3 else best_score / 10

    return best_lang, confidence
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,8 +7,8 @@ with a consistent structure.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
from enum import Enum, auto
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ContentBlockType(Enum):
|
||||
@@ -76,20 +76,20 @@ class Heading:
|
||||
"""A document heading/section title."""
|
||||
level: int # 1-6 for h1-h6, or 1+ for RST underline levels
|
||||
text: str
|
||||
id: Optional[str] = None # Anchor ID
|
||||
source_line: Optional[int] = None
|
||||
id: str | None = None # Anchor ID
|
||||
source_line: int | None = None
|
||||
|
||||
|
||||
@dataclass
class CodeBlock:
    """A code block with metadata."""

    code: str
    language: str | None = None
    quality_score: float | None = None  # 0-10
    confidence: float | None = None  # Language detection confidence
    is_valid: bool | None = None  # Syntax validation result
    validation_issues: list[str] = field(default_factory=list)
    source_line: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@@ -97,11 +97,11 @@ class CodeBlock:
|
||||
class Table:
|
||||
"""A table with rows and cells."""
|
||||
rows: list[list[str]] # 2D array of cell content
|
||||
headers: Optional[list[str]] = None
|
||||
caption: Optional[str] = None
|
||||
col_widths: Optional[list[int]] = None
|
||||
headers: list[str] | None = None
|
||||
caption: str | None = None
|
||||
col_widths: list[int] | None = None
|
||||
source_format: str = "unknown" # 'simple', 'grid', 'list-table', 'markdown', 'pdf'
|
||||
source_line: Optional[int] = None
|
||||
source_line: int | None = None
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
@property
|
||||
@@ -120,8 +120,8 @@ class CrossReference:
|
||||
"""A cross-reference link."""
|
||||
ref_type: CrossRefType
|
||||
target: str # Target ID, URL, or path
|
||||
text: Optional[str] = None # Display text (if different from target)
|
||||
source_line: Optional[int] = None
|
||||
text: str | None = None # Display text (if different from target)
|
||||
source_line: int | None = None
|
||||
resolved: bool = False # Whether target was resolved
|
||||
|
||||
|
||||
@@ -129,9 +129,9 @@ class CrossReference:
|
||||
class Field:
|
||||
"""A field in a field list (RST :param:, :returns:, etc.)."""
|
||||
name: str # Field name (e.g., 'param', 'returns', 'type')
|
||||
arg: Optional[str] = None # Field argument (e.g., parameter name)
|
||||
arg: str | None = None # Field argument (e.g., parameter name)
|
||||
content: str = "" # Field content
|
||||
source_line: Optional[int] = None
|
||||
source_line: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -139,19 +139,19 @@ class DefinitionItem:
|
||||
"""A definition list item (term + definition)."""
|
||||
term: str
|
||||
definition: str
|
||||
classifier: Optional[str] = None # RST classifier (term : classifier)
|
||||
source_line: Optional[int] = None
|
||||
classifier: str | None = None # RST classifier (term : classifier)
|
||||
source_line: int | None = None
|
||||
|
||||
|
||||
@dataclass
class Image:
    """An image reference or embedded image."""

    source: str  # URL, path, or base64 data
    alt_text: str | None = None
    width: int | None = None
    height: int | None = None
    is_embedded: bool = False  # True if data is embedded
    source_line: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -160,8 +160,8 @@ class ContentBlock:
|
||||
type: ContentBlockType
|
||||
content: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
source_line: Optional[int] = None
|
||||
quality_score: Optional[float] = None # 0-10
|
||||
source_line: int | None = None
|
||||
quality_score: float | None = None # 0-10
|
||||
|
||||
# Type-specific data (stored in metadata for flexibility)
|
||||
# For CODE_BLOCK: 'code_data' -> CodeBlock
|
||||
@@ -183,71 +183,71 @@ class ExtractionStats:
|
||||
cross_references: int = 0
|
||||
images: int = 0
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
processing_time_ms: Optional[float] = None
|
||||
processing_time_ms: float | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class Document:
|
||||
"""
|
||||
Unified document structure - output of ALL parsers.
|
||||
|
||||
|
||||
This class provides a standardized representation of document content
|
||||
regardless of the source format (RST, Markdown, PDF, HTML).
|
||||
"""
|
||||
title: str = ""
|
||||
format: str = "" # 'markdown', 'rst', 'pdf', 'html', 'unknown'
|
||||
source_path: str = ""
|
||||
|
||||
|
||||
# Core content as blocks
|
||||
blocks: list[ContentBlock] = field(default_factory=list)
|
||||
|
||||
|
||||
# Navigation/Structure (derived from blocks for convenience)
|
||||
headings: list[Heading] = field(default_factory=list)
|
||||
sections: list[dict] = field(default_factory=list) # Hierarchical structure
|
||||
|
||||
|
||||
# References
|
||||
internal_links: list[CrossReference] = field(default_factory=list)
|
||||
external_links: list[CrossReference] = field(default_factory=list)
|
||||
|
||||
|
||||
# Specialized content (also in blocks, but extracted for easy access)
|
||||
code_blocks: list[CodeBlock] = field(default_factory=list)
|
||||
tables: list[Table] = field(default_factory=list)
|
||||
images: list[Image] = field(default_factory=list)
|
||||
|
||||
|
||||
# RST-specific (may be empty for other formats)
|
||||
field_lists: list[list[Field]] = field(default_factory=list)
|
||||
definition_lists: list[list[DefinitionItem]] = field(default_factory=list)
|
||||
substitutions: dict[str, str] = field(default_factory=dict)
|
||||
toc_trees: list[list[str]] = field(default_factory=list)
|
||||
|
||||
|
||||
# Metadata
|
||||
meta: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
# Extraction info
|
||||
stats: ExtractionStats = field(default_factory=ExtractionStats)
|
||||
|
||||
def to_markdown(self, options: Optional[dict] = None) -> str:
|
||||
|
||||
def to_markdown(self, options: dict | None = None) -> str:
|
||||
"""
|
||||
Convert unified structure to markdown output.
|
||||
|
||||
|
||||
Args:
|
||||
options: Optional formatting options
|
||||
- include_toc: bool = False
|
||||
- max_heading_level: int = 6
|
||||
- code_block_style: str = 'fenced' # or 'indented'
|
||||
- table_style: str = 'github' # or 'simple'
|
||||
|
||||
|
||||
Returns:
|
||||
Markdown-formatted string
|
||||
"""
|
||||
from .formatters import MarkdownFormatter
|
||||
formatter = MarkdownFormatter(options or {})
|
||||
return formatter.format(self)
|
||||
|
||||
|
||||
def to_skill_format(self) -> dict[str, Any]:
|
||||
"""
|
||||
Convert to skill-seekers internal format.
|
||||
|
||||
|
||||
Returns:
|
||||
Dictionary compatible with existing skill-seekers pipelines
|
||||
"""
|
||||
@@ -292,7 +292,7 @@ class Document:
|
||||
"headings": self.stats.headings,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def _extract_content_text(self) -> str:
|
||||
"""Extract plain text content from paragraphs."""
|
||||
paragraphs = []
|
||||
@@ -300,21 +300,21 @@ class Document:
|
||||
if block.type == ContentBlockType.PARAGRAPH:
|
||||
paragraphs.append(block.content)
|
||||
return "\n\n".join(paragraphs)
|
||||
|
||||
|
||||
def get_section_content(self, heading_text: str) -> list[ContentBlock]:
|
||||
"""
|
||||
Get all content blocks under a specific section heading.
|
||||
|
||||
|
||||
Args:
|
||||
heading_text: The section heading to find
|
||||
|
||||
|
||||
Returns:
|
||||
List of ContentBlock objects in that section
|
||||
"""
|
||||
result = []
|
||||
in_section = False
|
||||
section_level = None
|
||||
|
||||
|
||||
for block in self.blocks:
|
||||
if block.type == ContentBlockType.HEADING:
|
||||
heading_data = block.metadata.get('heading_data')
|
||||
@@ -325,29 +325,29 @@ class Document:
|
||||
elif in_section and heading_data.level <= section_level:
|
||||
# New section at same or higher level
|
||||
break
|
||||
|
||||
|
||||
if in_section:
|
||||
result.append(block)
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def find_blocks_by_type(self, block_type: ContentBlockType) -> list[ContentBlock]:
|
||||
"""Find all blocks of a specific type."""
|
||||
return [b for b in self.blocks if b.type == block_type]
|
||||
|
||||
|
||||
def find_code_by_language(self, language: str) -> list[CodeBlock]:
|
||||
"""Find all code blocks in a specific language."""
|
||||
return [cb for cb in self.code_blocks if cb.language == language]
|
||||
|
||||
|
||||
def find_tables_by_caption(self, pattern: str) -> list[Table]:
|
||||
"""Find tables with captions matching a pattern."""
|
||||
import re
|
||||
return [t for t in self.tables if t.caption and re.search(pattern, t.caption, re.I)]
|
||||
|
||||
|
||||
def get_api_summary(self) -> dict[str, Any]:
|
||||
"""
|
||||
Extract API summary if this is API documentation.
|
||||
|
||||
|
||||
Returns:
|
||||
Dictionary with 'properties', 'methods', 'signals', etc.
|
||||
"""
|
||||
@@ -355,7 +355,7 @@ class Document:
|
||||
properties_table = None
|
||||
methods_table = None
|
||||
signals_table = None
|
||||
|
||||
|
||||
for table in self.tables:
|
||||
if table.caption:
|
||||
cap_lower = table.caption.lower()
|
||||
@@ -365,21 +365,21 @@ class Document:
|
||||
methods_table = table
|
||||
elif 'signal' in cap_lower:
|
||||
signals_table = table
|
||||
|
||||
|
||||
return {
|
||||
"properties": self._parse_api_table(properties_table) if properties_table else [],
|
||||
"methods": self._parse_api_table(methods_table) if methods_table else [],
|
||||
"signals": self._parse_api_table(signals_table) if signals_table else [],
|
||||
}
|
||||
|
||||
def _parse_api_table(self, table: Optional[Table]) -> list[dict]:
|
||||
|
||||
def _parse_api_table(self, table: Table | None) -> list[dict]:
|
||||
"""Parse an API table into structured data."""
|
||||
if not table or not table.rows:
|
||||
return []
|
||||
|
||||
|
||||
results = []
|
||||
headers = table.headers or []
|
||||
|
||||
|
||||
for row in table.rows:
|
||||
if len(row) >= 2:
|
||||
item = {"name": row[0]}
|
||||
@@ -387,25 +387,25 @@ class Document:
|
||||
if i < len(row):
|
||||
item[header.lower().replace(' ', '_')] = row[i]
|
||||
results.append(item)
|
||||
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def merge_documents(docs: list[Document]) -> Document:
|
||||
"""
|
||||
Merge multiple documents into one.
|
||||
|
||||
|
||||
Useful for combining multiple source files into a single skill.
|
||||
"""
|
||||
if not docs:
|
||||
return Document()
|
||||
|
||||
|
||||
merged = Document(
|
||||
title=docs[0].title,
|
||||
format=docs[0].format,
|
||||
source_path="merged",
|
||||
)
|
||||
|
||||
|
||||
for doc in docs:
|
||||
merged.blocks.extend(doc.blocks)
|
||||
merged.headings.extend(doc.headings)
|
||||
@@ -418,12 +418,12 @@ def merge_documents(docs: list[Document]) -> Document:
|
||||
merged.definition_lists.extend(doc.definition_lists)
|
||||
merged.toc_trees.extend(doc.toc_trees)
|
||||
merged.meta.update(doc.meta)
|
||||
|
||||
|
||||
# Merge stats
|
||||
merged.stats.total_blocks = sum(d.stats.total_blocks for d in docs)
|
||||
merged.stats.code_blocks = sum(d.stats.code_blocks for d in docs)
|
||||
merged.stats.tables = sum(d.stats.tables for d in docs)
|
||||
merged.stats.headings = sum(d.stats.headings for d in docs)
|
||||
merged.stats.cross_references = sum(d.stats.cross_references for d in docs)
|
||||
|
||||
|
||||
return merged
|
||||
|
||||
@@ -707,14 +707,14 @@ def main():
|
||||
# Note: Runs independently of workflow system (they complement each other)
|
||||
if getattr(args, "enhance_level", 0) > 0:
|
||||
# Traditional AI enhancement (API or LOCAL mode)
|
||||
logger.info("\n" + "=" * 80)
|
||||
logger.info("🤖 Traditional AI Enhancement")
|
||||
logger.info("=" * 80)
|
||||
print("\n" + "=" * 80)
|
||||
print("🤖 Traditional AI Enhancement")
|
||||
print("=" * 80)
|
||||
if workflow_executed:
|
||||
logger.info(f" Running after workflow: {workflow_name}")
|
||||
logger.info(" (Workflow provides specialized analysis, enhancement provides general improvements)")
|
||||
logger.info(" (Use --enhance-workflow for more control)")
|
||||
logger.info("")
|
||||
print(f" Running after workflow: {workflow_name}")
|
||||
print(" (Workflow provides specialized analysis, enhancement provides general improvements)")
|
||||
print(" (Use --enhance-workflow for more control)")
|
||||
print("")
|
||||
# Note: PDF scraper uses enhance_level instead of enhance/enhance_local
|
||||
# This is consistent with the new unified enhancement system
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ import tempfile
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
from typing import Literal
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -166,10 +166,7 @@ class UnifiedEnhancer:
|
||||
return items
|
||||
|
||||
# Get appropriate prompt
|
||||
if custom_prompt:
|
||||
prompt_template = custom_prompt
|
||||
else:
|
||||
prompt_template = self._get_default_prompt(enhancement_type)
|
||||
prompt_template = custom_prompt or self._get_default_prompt(enhancement_type)
|
||||
|
||||
# Batch processing
|
||||
batch_size = (
|
||||
|
||||
@@ -571,7 +571,7 @@ class UnifiedScraper:
|
||||
if file_patterns:
|
||||
logger.info(f" File patterns: {', '.join(file_patterns)}")
|
||||
|
||||
results = analyze_codebase(
|
||||
analyze_codebase(
|
||||
directory=Path(local_path),
|
||||
output_dir=temp_output,
|
||||
depth=analysis_depth,
|
||||
|
||||
@@ -91,7 +91,7 @@ def _validate_yaml(text: str) -> dict:
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def list_workflows_tool(args: dict) -> list:
|
||||
def list_workflows_tool(_args: dict) -> list:
|
||||
"""Return all workflows with name, description, and source."""
|
||||
result: list[dict[str, str]] = []
|
||||
|
||||
|
||||
@@ -122,28 +122,28 @@ class TestCreateCommandArgvForwarding:
|
||||
|
||||
def _make_args(self, **kwargs):
|
||||
import argparse
|
||||
defaults = dict(
|
||||
enhance_workflow=None,
|
||||
enhance_stage=None,
|
||||
var=None,
|
||||
workflow_dry_run=False,
|
||||
enhance_level=0,
|
||||
output=None,
|
||||
name=None,
|
||||
description=None,
|
||||
config=None,
|
||||
api_key=None,
|
||||
dry_run=False,
|
||||
verbose=False,
|
||||
quiet=False,
|
||||
chunk_for_rag=False,
|
||||
chunk_size=512,
|
||||
chunk_overlap=50,
|
||||
preset=None,
|
||||
no_preserve_code_blocks=False,
|
||||
no_preserve_paragraphs=False,
|
||||
interactive_enhancement=False,
|
||||
)
|
||||
defaults = {
|
||||
"enhance_workflow": None,
|
||||
"enhance_stage": None,
|
||||
"var": None,
|
||||
"workflow_dry_run": False,
|
||||
"enhance_level": 0,
|
||||
"output": None,
|
||||
"name": None,
|
||||
"description": None,
|
||||
"config": None,
|
||||
"api_key": None,
|
||||
"dry_run": False,
|
||||
"verbose": False,
|
||||
"quiet": False,
|
||||
"chunk_for_rag": False,
|
||||
"chunk_size": 512,
|
||||
"chunk_overlap": 50,
|
||||
"preset": None,
|
||||
"no_preserve_code_blocks": False,
|
||||
"no_preserve_paragraphs": False,
|
||||
"interactive_enhancement": False,
|
||||
}
|
||||
defaults.update(kwargs)
|
||||
return argparse.Namespace(**defaults)
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ Basic usage:
|
||||
.. code-block:: gdscript
|
||||
|
||||
extends Node
|
||||
|
||||
|
||||
func _ready():
|
||||
print("Hello, World!")
|
||||
position = Vector2(100, 100)
|
||||
@@ -414,7 +414,7 @@ def calculate_average(numbers):
|
||||
|
||||
def test_good_table_score(self):
|
||||
"""Test quality score for good table."""
|
||||
from skill_seekers.cli.parsers.extractors import QualityScorer, Table
|
||||
from skill_seekers.cli.parsers.extractors import QualityScorer
|
||||
|
||||
scorer = QualityScorer()
|
||||
good_table = Table(
|
||||
|
||||
@@ -12,8 +12,7 @@ Covers:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from unittest.mock import MagicMock, patch, call
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -186,7 +185,7 @@ class TestRunWorkflowsMultiple:
|
||||
m.workflow.description = "desc"
|
||||
m.workflow.stages = []
|
||||
# Track call order
|
||||
m.run.side_effect = lambda *a, _n=wf_name, **kw: run_order.append(_n)
|
||||
m.run.side_effect = lambda *_a, _n=wf_name, **_kw: run_order.append(_n)
|
||||
engines.append(m)
|
||||
|
||||
with patch(
|
||||
@@ -208,7 +207,7 @@ class TestRunWorkflowsMultiple:
|
||||
good_engine.workflow.description = "desc"
|
||||
good_engine.workflow.stages = []
|
||||
|
||||
def side_effect(name, **kwargs):
|
||||
def side_effect(name, **_kwargs):
|
||||
if name == "bad-workflow":
|
||||
raise FileNotFoundError("not found")
|
||||
return good_engine
|
||||
@@ -341,9 +340,8 @@ class TestRunWorkflowsDryRun:
|
||||
with patch(
|
||||
"skill_seekers.cli.enhancement_workflow.WorkflowEngine",
|
||||
return_value=mock_engine,
|
||||
):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
run_workflows(args)
|
||||
), pytest.raises(SystemExit) as exc:
|
||||
run_workflows(args)
|
||||
|
||||
assert exc.value.code == 0
|
||||
mock_engine.preview.assert_called_once()
|
||||
@@ -366,9 +364,8 @@ class TestRunWorkflowsDryRun:
|
||||
with patch(
|
||||
"skill_seekers.cli.enhancement_workflow.WorkflowEngine",
|
||||
side_effect=engines,
|
||||
):
|
||||
with pytest.raises(SystemExit):
|
||||
run_workflows(args)
|
||||
), pytest.raises(SystemExit):
|
||||
run_workflows(args)
|
||||
|
||||
for engine in engines:
|
||||
engine.preview.assert_called_once()
|
||||
|
||||
@@ -9,7 +9,6 @@ Covers:
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@@ -290,7 +289,7 @@ class TestDeleteWorkflowTool:
|
||||
wf.write_text(MINIMAL_YAML, encoding="utf-8")
|
||||
|
||||
with _mock_bundled_names([]):
|
||||
result = delete_workflow_tool({"name": "my-wf"})
|
||||
delete_workflow_tool({"name": "my-wf"})
|
||||
|
||||
assert not wf.exists()
|
||||
|
||||
|
||||
@@ -10,11 +10,9 @@ Covers:
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
# Import the MODULE object (not just individual symbols) so we can patch it
|
||||
# directly via patch.object(). This survives any sys.modules manipulation by
|
||||
@@ -168,9 +166,8 @@ class TestCmdCopy:
|
||||
assert dest.read_text(encoding="utf-8") == MINIMAL_YAML
|
||||
|
||||
def test_copy_nonexistent(self, capsys, tmp_user_dir):
|
||||
with _mock_bundled_text({}):
|
||||
with _mock_bundled([]):
|
||||
rc = cmd_copy(["ghost-workflow"])
|
||||
with _mock_bundled_text({}), _mock_bundled([]):
|
||||
rc = cmd_copy(["ghost-workflow"])
|
||||
assert rc == 1
|
||||
assert "not found" in capsys.readouterr().err.lower()
|
||||
|
||||
@@ -403,9 +400,8 @@ class TestMain:
|
||||
from skill_seekers.cli.workflows_command import main
|
||||
|
||||
# tmp_user_dir is empty; mock bundled to return nothing
|
||||
with _mock_bundled([]):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
main(["list"])
|
||||
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
|
||||
main(["list"])
|
||||
assert exc.value.code == 0
|
||||
|
||||
def test_main_validate_success(self, capsys, sample_yaml_file):
|
||||
@@ -423,31 +419,27 @@ class TestMain:
|
||||
assert "name: test-workflow" in capsys.readouterr().out
|
||||
|
||||
def test_main_show_not_found_exits_1(self, capsys, tmp_user_dir):
|
||||
with patch.object(_wf_cmd, "_workflow_yaml_text", return_value=None):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["show", "ghost"])
|
||||
with patch.object(_wf_cmd, "_workflow_yaml_text", return_value=None), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["show", "ghost"])
|
||||
assert exc.value.code == 1
|
||||
|
||||
def test_main_copy_single(self, capsys, tmp_user_dir):
|
||||
with _mock_bundled_text({"default": MINIMAL_YAML}):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["copy", "default"])
|
||||
with _mock_bundled_text({"default": MINIMAL_YAML}), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["copy", "default"])
|
||||
assert exc.value.code == 0
|
||||
assert (tmp_user_dir / "default.yaml").exists()
|
||||
|
||||
def test_main_copy_multiple(self, capsys, tmp_user_dir):
|
||||
texts = {"default": MINIMAL_YAML, "minimal": MINIMAL_YAML}
|
||||
with _mock_bundled_text(texts):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["copy", "default", "minimal"])
|
||||
with _mock_bundled_text(texts), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["copy", "default", "minimal"])
|
||||
assert exc.value.code == 0
|
||||
assert (tmp_user_dir / "default.yaml").exists()
|
||||
assert (tmp_user_dir / "minimal.yaml").exists()
|
||||
|
||||
def test_main_copy_not_found_exits_1(self, capsys, tmp_user_dir):
|
||||
with _mock_bundled_text({}), _mock_bundled([]):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["copy", "ghost"])
|
||||
with _mock_bundled_text({}), _mock_bundled([]), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["copy", "ghost"])
|
||||
assert exc.value.code == 1
|
||||
|
||||
def test_main_add_single_file(self, capsys, tmp_user_dir, sample_yaml_file):
|
||||
@@ -484,32 +476,28 @@ class TestMain:
|
||||
|
||||
def test_main_remove_single(self, capsys, tmp_user_dir):
|
||||
(tmp_user_dir / "my-wf.yaml").write_text(MINIMAL_YAML, encoding="utf-8")
|
||||
with _mock_bundled([]):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "my-wf"])
|
||||
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "my-wf"])
|
||||
assert exc.value.code == 0
|
||||
assert not (tmp_user_dir / "my-wf.yaml").exists()
|
||||
|
||||
def test_main_remove_multiple(self, capsys, tmp_user_dir):
|
||||
(tmp_user_dir / "wf-a.yaml").write_text(MINIMAL_YAML, encoding="utf-8")
|
||||
(tmp_user_dir / "wf-b.yaml").write_text(MINIMAL_YAML, encoding="utf-8")
|
||||
with _mock_bundled([]):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "wf-a", "wf-b"])
|
||||
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "wf-a", "wf-b"])
|
||||
assert exc.value.code == 0
|
||||
assert not (tmp_user_dir / "wf-a.yaml").exists()
|
||||
assert not (tmp_user_dir / "wf-b.yaml").exists()
|
||||
|
||||
def test_main_remove_bundled_refused(self, capsys, tmp_user_dir):
|
||||
with _mock_bundled(["default"]):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "default"])
|
||||
with _mock_bundled(["default"]), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "default"])
|
||||
assert exc.value.code == 1
|
||||
|
||||
def test_main_remove_not_found_exits_1(self, capsys, tmp_user_dir):
|
||||
with _mock_bundled([]):
|
||||
with pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "ghost"])
|
||||
with _mock_bundled([]), pytest.raises(SystemExit) as exc:
|
||||
_wf_cmd.main(["remove", "ghost"])
|
||||
assert exc.value.code == 1
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user