fix: Resolve PDF processing (#267), How-To Guide (#242), Chinese README (#260) + code quality (#273)

Thanks @franklegolasyoung for the excellent work on the core fixes for issues #267, #242, and #260! 🙏

Your comprehensive approach to fixing PDF processing, expanding workflow detection, and improving the Chinese README documentation is much appreciated. I've added code quality fixes and comprehensive tests to ensure everything passes CI.

All 1266+ tests are now passing, and the issues are resolved! 🎉
This commit is contained in:
yusyus
2026-01-31 21:30:00 +03:00
committed by GitHub
parent f726a9abc5
commit 91bd2184e5
19 changed files with 622 additions and 174 deletions

View File

@@ -36,6 +36,7 @@ logger = logging.getLogger(__name__)
# Import config manager for settings
try:
from skill_seekers.cli.config_manager import get_config_manager
CONFIG_AVAILABLE = True
except ImportError:
CONFIG_AVAILABLE = False
@@ -107,7 +108,9 @@ class AIEnhancer:
logger.warning("⚠️ anthropic package not installed, falling back to LOCAL mode")
self.mode = "local"
except Exception as e:
logger.warning(f"⚠️ Failed to initialize API client: {e}, falling back to LOCAL mode")
logger.warning(
f"⚠️ Failed to initialize API client: {e}, falling back to LOCAL mode"
)
self.mode = "local"
if self.mode == "local" and self.enabled:
@@ -212,7 +215,8 @@ DO NOT include any explanation - just write the JSON file.
except json.JSONDecodeError:
# Try to find JSON in the response
import re
json_match = re.search(r'\[[\s\S]*\]|\{[\s\S]*\}', response_text)
json_match = re.search(r"\[[\s\S]*\]|\{[\s\S]*\}", response_text)
if json_match:
return json_match.group()
logger.warning("⚠️ Could not parse JSON from LOCAL response")

View File

@@ -377,11 +377,13 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
if header_match:
level = len(header_match.group(1))
text = header_match.group(2).strip()
structure["headers"].append({
"level": level,
"text": text,
"line": i + 1,
})
structure["headers"].append(
{
"level": level,
"text": text,
"line": i + 1,
}
)
# First h1 is the title
if level == 1 and structure["title"] is None:
structure["title"] = text
@@ -392,24 +394,30 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
language = match.group(1) or "text"
code = match.group(2).strip()
if len(code) > 0:
structure["code_blocks"].append({
"language": language,
"code": code[:500], # Truncate long code blocks
"full_length": len(code),
})
structure["code_blocks"].append(
{
"language": language,
"code": code[:500], # Truncate long code blocks
"full_length": len(code),
}
)
# Extract links
link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
for match in link_pattern.finditer(content):
structure["links"].append({
"text": match.group(1),
"url": match.group(2),
})
structure["links"].append(
{
"text": match.group(1),
"url": match.group(2),
}
)
return structure
def generate_markdown_summary(content: str, structure: dict[str, Any], max_length: int = 500) -> str:
def generate_markdown_summary(
content: str, structure: dict[str, Any], max_length: int = 500
) -> str:
"""
Generate a summary of markdown content.
@@ -522,12 +530,14 @@ def process_markdown_docs(
structure = extract_markdown_structure(content)
summary = generate_markdown_summary(content, structure)
doc_data.update({
"title": structure.get("title") or md_path.stem,
"structure": structure,
"summary": summary,
"content": content if depth == "full" else None,
})
doc_data.update(
{
"title": structure.get("title") or md_path.stem,
"structure": structure,
"summary": summary,
"content": content if depth == "full" else None,
}
)
processed_docs.append(doc_data)
# Track categories
@@ -563,6 +573,7 @@ def process_markdown_docs(
# Copy file to category folder
dest_path = category_dir / doc["filename"]
import shutil
shutil.copy2(src_path, dest_path)
except Exception as e:
logger.debug(f"Failed to copy {doc['path']}: {e}")
@@ -578,7 +589,9 @@ def process_markdown_docs(
with open(index_json, "w", encoding="utf-8") as f:
json.dump(index_data, f, indent=2, default=str)
logger.info(f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories")
logger.info(
f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories"
)
logger.info(f"📁 Saved to: {docs_output_dir}")
return index_data
@@ -612,18 +625,22 @@ def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
"""Enhance docs using Claude API."""
try:
import anthropic
client = anthropic.Anthropic(api_key=api_key)
# Batch documents for efficiency
batch_size = 10
for i in range(0, len(docs), batch_size):
batch = docs[i:i + batch_size]
batch = docs[i : i + batch_size]
# Create prompt for batch
docs_text = "\n\n".join([
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
for d in batch if d.get("summary")
])
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
for d in batch
if d.get("summary")
]
)
if not docs_text:
continue
@@ -642,12 +659,13 @@ Return JSON with format:
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=2000,
messages=[{"role": "user", "content": prompt}]
messages=[{"role": "user", "content": prompt}],
)
# Parse response and merge enhancements
try:
import re
json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -676,10 +694,12 @@ def _enhance_docs_local(docs: list[dict]) -> list[dict]:
if not docs_with_summary:
return docs
docs_text = "\n\n".join([
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
])
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
]
)
prompt = f"""Analyze these documentation files from a codebase and provide insights.
@@ -710,6 +730,7 @@ Output JSON only:
if result.returncode == 0 and result.stdout:
import re
json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -777,7 +798,9 @@ def analyze_codebase(
if enhance_level > 0:
level_names = {1: "SKILL.md only", 2: "SKILL.md+Architecture+Config", 3: "full"}
logger.info(f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})")
logger.info(
f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})"
)
# Resolve directory to absolute path to avoid relative_to() errors
directory = Path(directory).resolve()
@@ -1341,7 +1364,9 @@ Use this skill when you need to:
skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
refs_added = True
if extract_docs and (output_dir / "documentation").exists():
skill_content += "- **Documentation**: `references/documentation/` - Project documentation\n"
skill_content += (
"- **Documentation**: `references/documentation/` - Project documentation\n"
)
refs_added = True
if not refs_added:
@@ -1590,7 +1615,15 @@ def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any])
content += f"**Categories:** {len(categories)}\n\n"
# List documents by category (most important first)
priority_order = ["overview", "architecture", "guides", "workflows", "features", "api", "examples"]
priority_order = [
"overview",
"architecture",
"guides",
"workflows",
"features",
"api",
"examples",
]
# Sort categories by priority
sorted_categories = []
@@ -1637,6 +1670,7 @@ def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any])
if all_topics:
# Deduplicate and count
from collections import Counter
topic_counts = Counter(all_topics)
top_topics = [t for t, _ in topic_counts.most_common(10)]
content += f"**Key Topics:** {', '.join(top_topics)}\n\n"
@@ -1829,7 +1863,12 @@ Examples:
args = parser.parse_args()
# Handle presets (Phase 1 feature - NEW)
if hasattr(args, "quick") and args.quick and hasattr(args, "comprehensive") and args.comprehensive:
if (
hasattr(args, "quick")
and args.quick
and hasattr(args, "comprehensive")
and args.comprehensive
):
logger.error("❌ Cannot use --quick and --comprehensive together. Choose one.")
return 1

View File

@@ -167,9 +167,7 @@ class ConfigEnhancer:
for setting in cf.get("settings", [])[:5]: # First 5 settings per file
# Support both "type" (from config_extractor) and "value_type" (legacy)
value_type = setting.get("type", setting.get("value_type", "unknown"))
settings_summary.append(
f" - {setting['key']}: {setting['value']} ({value_type})"
)
settings_summary.append(f" - {setting['key']}: {setting['value']} ({value_type})")
# Support both "type" (from config_extractor) and "config_type" (legacy)
config_type = cf.get("type", cf.get("config_type", "unknown"))
@@ -306,7 +304,9 @@ Focus on actionable insights that help developers understand and improve their c
config_type = cf.get("type", cf.get("config_type", "unknown"))
settings_preview = []
for s in cf.get("settings", [])[:3]: # Show first 3 settings
settings_preview.append(f" - {s.get('key', 'unknown')}: {str(s.get('value', ''))[:50]}")
settings_preview.append(
f" - {s.get('key', 'unknown')}: {str(s.get('value', ''))[:50]}"
)
config_data.append(f"""
### {cf["relative_path"]} ({config_type})
@@ -431,9 +431,7 @@ DO NOT explain your work - just write the JSON file directly.
potential_files.append(json_file)
# Try to load the most recent JSON file with expected structure
for json_file in sorted(
potential_files, key=lambda f: f.stat().st_mtime, reverse=True
):
for json_file in sorted(potential_files, key=lambda f: f.stat().st_mtime, reverse=True):
try:
with open(json_file) as f:
data = json.load(f)

View File

@@ -8,7 +8,6 @@ when local config files are not found.
import json
import logging
from pathlib import Path
from typing import Optional
import httpx
@@ -22,7 +21,7 @@ _last_searched_paths = []
def fetch_config_from_api(
config_name: str, destination: str = "configs", timeout: float = 30.0
) -> Optional[Path]:
) -> Path | None:
"""
Fetch a config file from the SkillSeekersWeb.com API.
@@ -65,12 +64,10 @@ def fetch_config_from_api(
# Download the actual config file using download_url from API response
download_url = config_info.get("download_url")
if not download_url:
logger.error(
f"❌ Config '{config_name}' has no download_url. Contact support."
)
logger.error(f"❌ Config '{config_name}' has no download_url. Contact support.")
return None
logger.info(f"📥 Downloading config from API...")
logger.info("📥 Downloading config from API...")
download_response = client.get(download_url)
download_response.raise_for_status()
config_data = download_response.json()
@@ -84,9 +81,7 @@ def fetch_config_from_api(
json.dump(config_data, f, indent=2)
logger.info(f"✅ Config downloaded successfully: {config_file}")
logger.info(
f" Category: {config_info.get('category', 'uncategorized')}"
)
logger.info(f" Category: {config_info.get('category', 'uncategorized')}")
logger.info(f" Type: {config_info.get('type', 'unknown')}")
return config_file
@@ -102,7 +97,7 @@ def fetch_config_from_api(
return None
def list_available_configs(category: Optional[str] = None, timeout: float = 30.0) -> list[str]:
def list_available_configs(category: str | None = None, timeout: float = 30.0) -> list[str]:
"""
List all available configs from the API.
@@ -135,7 +130,7 @@ def list_available_configs(category: Optional[str] = None, timeout: float = 30.0
return []
def resolve_config_path(config_path: str, auto_fetch: bool = True) -> Optional[Path]:
def resolve_config_path(config_path: str, auto_fetch: bool = True) -> Path | None:
"""
Resolve config path with automatic API fallback.
@@ -196,7 +191,7 @@ def resolve_config_path(config_path: str, auto_fetch: bool = True) -> Optional[P
config_name = config_name[8:]
logger.info(
f"\n💡 Config not found locally, attempting to fetch from SkillSeekersWeb.com API..."
"\n💡 Config not found locally, attempting to fetch from SkillSeekersWeb.com API..."
)
fetched_path = fetch_config_from_api(config_name, destination="configs")
if fetched_path and fetched_path.exists():

View File

@@ -1834,7 +1834,9 @@ def load_config(config_path: str) -> dict[str, Any]:
except ValueError as e:
logger.error("❌ Configuration validation errors in %s:", config_path)
logger.error(" %s", str(e))
logger.error("\n Suggestion: Fix the above errors or check https://skillseekersweb.com/ for examples")
logger.error(
"\n Suggestion: Fix the above errors or check https://skillseekersweb.com/ for examples"
)
sys.exit(1)
return config

View File

@@ -869,10 +869,16 @@ class HowToGuideBuilder:
# Filter to workflow examples only
workflows = self._extract_workflow_examples(examples)
logger.info(f"Found {len(workflows)} workflow examples")
logger.info(f"Found {len(workflows)} workflow examples (from {len(examples)} total)")
if not workflows:
logger.warning("No workflow examples found!")
# Log categories for debugging
categories = {ex.get("category", "unknown") for ex in examples}
logger.warning(f"No workflow examples found! Categories in input: {categories}")
logger.info(
"Tip: Workflow detection requires keywords like 'workflow', 'integration', 'e2e' in test names,"
)
logger.info(" or tests with 4+ assignments and 3+ method calls")
return GuideCollection(
total_guides=0,
guides_by_complexity={},

View File

@@ -288,7 +288,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
analyze_parser.add_argument(
"--comprehensive",
action="store_true",
help="Comprehensive analysis (20-60 min, all features + AI)"
help="Comprehensive analysis (20-60 min, all features + AI)",
)
analyze_parser.add_argument(
"--depth",
@@ -300,22 +300,32 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
)
analyze_parser.add_argument("--file-patterns", help="Comma-separated file patterns")
analyze_parser.add_argument(
"--enhance", action="store_true", help="Enable AI enhancement (default level 1 = SKILL.md only)"
"--enhance",
action="store_true",
help="Enable AI enhancement (default level 1 = SKILL.md only)",
)
analyze_parser.add_argument(
"--enhance-level",
type=int,
choices=[0, 1, 2, 3],
default=None,
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full"
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full",
)
analyze_parser.add_argument("--skip-api-reference", action="store_true", help="Skip API docs")
analyze_parser.add_argument("--skip-dependency-graph", action="store_true", help="Skip dep graph")
analyze_parser.add_argument("--skip-patterns", action="store_true", help="Skip pattern detection")
analyze_parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
analyze_parser.add_argument(
"--skip-dependency-graph", action="store_true", help="Skip dep graph"
)
analyze_parser.add_argument(
"--skip-patterns", action="store_true", help="Skip pattern detection"
)
analyze_parser.add_argument(
"--skip-test-examples", action="store_true", help="Skip test examples"
)
analyze_parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
analyze_parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
analyze_parser.add_argument("--skip-docs", action="store_true", help="Skip project docs (README, docs/)")
analyze_parser.add_argument(
"--skip-docs", action="store_true", help="Skip project docs (README, docs/)"
)
analyze_parser.add_argument("--no-comments", action="store_true", help="Skip comments")
analyze_parser.add_argument("--verbose", action="store_true", help="Verbose logging")
@@ -559,13 +569,16 @@ def main(argv: list[str] | None = None) -> int:
# Handle preset flags (depth and features)
if args.quick:
# Quick = surface depth + skip advanced features + no AI
sys.argv.extend([
"--depth", "surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
])
sys.argv.extend(
[
"--depth",
"surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
]
)
elif args.comprehensive:
# Comprehensive = full depth + all features (AI level is separate)
sys.argv.extend(["--depth", "full"])
@@ -582,6 +595,7 @@ def main(argv: list[str] | None = None) -> int:
# Use default from config (default: 1)
try:
from skill_seekers.cli.config_manager import get_config_manager
config = get_config_manager()
enhance_level = config.get_default_enhance_level()
except Exception:

View File

@@ -792,8 +792,9 @@ class PDFExtractor:
# Use "text" format with layout info for PyMuPDF 1.24+
try:
markdown = page.get_text("markdown")
except (AssertionError, ValueError):
# Fallback to text format for older/newer PyMuPDF versions
except (AssertionError, ValueError, RuntimeError, TypeError, AttributeError):
# Fallback to text format for incompatible PyMuPDF versions
# Some versions don't support "markdown" format or have internal errors
markdown = page.get_text(
"text",
flags=fitz.TEXT_PRESERVE_WHITESPACE

View File

@@ -577,8 +577,36 @@ class PythonTestAnalyzer:
def _is_integration_test(self, func_node: ast.FunctionDef) -> bool:
"""Check if test looks like an integration test"""
test_name = func_node.name.lower()
integration_keywords = ["workflow", "integration", "end_to_end", "e2e", "full"]
return any(keyword in test_name for keyword in integration_keywords)
# Expanded keyword list for better workflow detection
integration_keywords = [
"workflow",
"integration",
"end_to_end",
"e2e",
"full",
"complete",
"scenario",
"flow",
"multi_step",
"multistep",
"process",
"chain",
"sequence",
"pipeline",
"lifecycle",
]
# Check test name for keywords
if any(keyword in test_name for keyword in integration_keywords):
return True
# Heuristic: tests with 4+ assignments and 3+ calls are likely workflows
assignments = sum(
1 for n in ast.walk(func_node) if isinstance(n, (ast.Assign, ast.AugAssign))
)
calls = sum(1 for n in ast.walk(func_node) if isinstance(n, ast.Call))
return assignments >= 4 and calls >= 3
def _extract_assertion_after(self, func_node: ast.FunctionDef, target_node: ast.AST) -> str:
"""Find assertion that follows the target node"""
@@ -771,7 +799,11 @@ class GenericTestAnalyzer:
# Find next method (setup or test)
next_pattern = patterns.get("setup", patterns["test_function"])
next_setup = re.search(next_pattern, code[setup_start:])
setup_end = setup_start + next_setup.start() if next_setup else min(setup_start + 500, len(code))
setup_end = (
setup_start + next_setup.start()
if next_setup
else min(setup_start + 500, len(code))
)
setup_body = code[setup_start:setup_end]
example = self._create_example(

View File

@@ -616,7 +616,8 @@ This skill combines knowledge from multiple sources:
if isinstance(github_data, dict):
github_data = github_data.get("data", {})
elif isinstance(github_data, list) and len(github_data) > 0:
github_data = github_data[0].get("data", {})
first_item = github_data[0]
github_data = first_item.get("data", {}) if isinstance(first_item, dict) else {}
else:
github_data = {}

View File

@@ -11,7 +11,7 @@ Tools are organized by functionality:
- source_tools: Config source management (fetch, submit, add/remove sources)
"""
__version__ = "2.7.2"
__version__ = "2.7.4"
from .config_tools import (
generate_config as generate_config_impl,