fix: Resolve PDF processing (#267), How-To Guide (#242), Chinese README (#260) + code quality (#273)

Thanks @franklegolasyoung for the excellent work on the core fixes for issues #267, #242, and #260! 🙏

Your comprehensive approach to fixing PDF processing, expanding workflow detection, and improving the Chinese README documentation is much appreciated. I've added code quality fixes and comprehensive tests to ensure everything passes CI.

All 1266+ tests are now passing, and the issues are resolved! 🎉
This commit is contained in:
yusyus
2026-01-31 21:30:00 +03:00
committed by GitHub
parent f726a9abc5
commit 91bd2184e5
19 changed files with 622 additions and 174 deletions

View File

@@ -36,6 +36,7 @@ logger = logging.getLogger(__name__)
# Import config manager for settings
try:
from skill_seekers.cli.config_manager import get_config_manager
CONFIG_AVAILABLE = True
except ImportError:
CONFIG_AVAILABLE = False
@@ -107,7 +108,9 @@ class AIEnhancer:
logger.warning("⚠️ anthropic package not installed, falling back to LOCAL mode")
self.mode = "local"
except Exception as e:
logger.warning(f"⚠️ Failed to initialize API client: {e}, falling back to LOCAL mode")
logger.warning(
f"⚠️ Failed to initialize API client: {e}, falling back to LOCAL mode"
)
self.mode = "local"
if self.mode == "local" and self.enabled:
@@ -212,7 +215,8 @@ DO NOT include any explanation - just write the JSON file.
except json.JSONDecodeError:
# Try to find JSON in the response
import re
json_match = re.search(r'\[[\s\S]*\]|\{[\s\S]*\}', response_text)
json_match = re.search(r"\[[\s\S]*\]|\{[\s\S]*\}", response_text)
if json_match:
return json_match.group()
logger.warning("⚠️ Could not parse JSON from LOCAL response")

View File

@@ -377,11 +377,13 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
if header_match:
level = len(header_match.group(1))
text = header_match.group(2).strip()
structure["headers"].append({
"level": level,
"text": text,
"line": i + 1,
})
structure["headers"].append(
{
"level": level,
"text": text,
"line": i + 1,
}
)
# First h1 is the title
if level == 1 and structure["title"] is None:
structure["title"] = text
@@ -392,24 +394,30 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
language = match.group(1) or "text"
code = match.group(2).strip()
if len(code) > 0:
structure["code_blocks"].append({
"language": language,
"code": code[:500], # Truncate long code blocks
"full_length": len(code),
})
structure["code_blocks"].append(
{
"language": language,
"code": code[:500], # Truncate long code blocks
"full_length": len(code),
}
)
# Extract links
link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
for match in link_pattern.finditer(content):
structure["links"].append({
"text": match.group(1),
"url": match.group(2),
})
structure["links"].append(
{
"text": match.group(1),
"url": match.group(2),
}
)
return structure
def generate_markdown_summary(content: str, structure: dict[str, Any], max_length: int = 500) -> str:
def generate_markdown_summary(
content: str, structure: dict[str, Any], max_length: int = 500
) -> str:
"""
Generate a summary of markdown content.
@@ -522,12 +530,14 @@ def process_markdown_docs(
structure = extract_markdown_structure(content)
summary = generate_markdown_summary(content, structure)
doc_data.update({
"title": structure.get("title") or md_path.stem,
"structure": structure,
"summary": summary,
"content": content if depth == "full" else None,
})
doc_data.update(
{
"title": structure.get("title") or md_path.stem,
"structure": structure,
"summary": summary,
"content": content if depth == "full" else None,
}
)
processed_docs.append(doc_data)
# Track categories
@@ -563,6 +573,7 @@ def process_markdown_docs(
# Copy file to category folder
dest_path = category_dir / doc["filename"]
import shutil
shutil.copy2(src_path, dest_path)
except Exception as e:
logger.debug(f"Failed to copy {doc['path']}: {e}")
@@ -578,7 +589,9 @@ def process_markdown_docs(
with open(index_json, "w", encoding="utf-8") as f:
json.dump(index_data, f, indent=2, default=str)
logger.info(f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories")
logger.info(
f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories"
)
logger.info(f"📁 Saved to: {docs_output_dir}")
return index_data
@@ -612,18 +625,22 @@ def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
"""Enhance docs using Claude API."""
try:
import anthropic
client = anthropic.Anthropic(api_key=api_key)
# Batch documents for efficiency
batch_size = 10
for i in range(0, len(docs), batch_size):
batch = docs[i:i + batch_size]
batch = docs[i : i + batch_size]
# Create prompt for batch
docs_text = "\n\n".join([
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
for d in batch if d.get("summary")
])
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
for d in batch
if d.get("summary")
]
)
if not docs_text:
continue
@@ -642,12 +659,13 @@ Return JSON with format:
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=2000,
messages=[{"role": "user", "content": prompt}]
messages=[{"role": "user", "content": prompt}],
)
# Parse response and merge enhancements
try:
import re
json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -676,10 +694,12 @@ def _enhance_docs_local(docs: list[dict]) -> list[dict]:
if not docs_with_summary:
return docs
docs_text = "\n\n".join([
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
])
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
]
)
prompt = f"""Analyze these documentation files from a codebase and provide insights.
@@ -710,6 +730,7 @@ Output JSON only:
if result.returncode == 0 and result.stdout:
import re
json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -777,7 +798,9 @@ def analyze_codebase(
if enhance_level > 0:
level_names = {1: "SKILL.md only", 2: "SKILL.md+Architecture+Config", 3: "full"}
logger.info(f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})")
logger.info(
f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})"
)
# Resolve directory to absolute path to avoid relative_to() errors
directory = Path(directory).resolve()
@@ -1341,7 +1364,9 @@ Use this skill when you need to:
skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
refs_added = True
if extract_docs and (output_dir / "documentation").exists():
skill_content += "- **Documentation**: `references/documentation/` - Project documentation\n"
skill_content += (
"- **Documentation**: `references/documentation/` - Project documentation\n"
)
refs_added = True
if not refs_added:
@@ -1590,7 +1615,15 @@ def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any])
content += f"**Categories:** {len(categories)}\n\n"
# List documents by category (most important first)
priority_order = ["overview", "architecture", "guides", "workflows", "features", "api", "examples"]
priority_order = [
"overview",
"architecture",
"guides",
"workflows",
"features",
"api",
"examples",
]
# Sort categories by priority
sorted_categories = []
@@ -1637,6 +1670,7 @@ def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any])
if all_topics:
# Deduplicate and count
from collections import Counter
topic_counts = Counter(all_topics)
top_topics = [t for t, _ in topic_counts.most_common(10)]
content += f"**Key Topics:** {', '.join(top_topics)}\n\n"
@@ -1829,7 +1863,12 @@ Examples:
args = parser.parse_args()
# Handle presets (Phase 1 feature - NEW)
if hasattr(args, "quick") and args.quick and hasattr(args, "comprehensive") and args.comprehensive:
if (
hasattr(args, "quick")
and args.quick
and hasattr(args, "comprehensive")
and args.comprehensive
):
logger.error("❌ Cannot use --quick and --comprehensive together. Choose one.")
return 1

View File

@@ -167,9 +167,7 @@ class ConfigEnhancer:
for setting in cf.get("settings", [])[:5]: # First 5 settings per file
# Support both "type" (from config_extractor) and "value_type" (legacy)
value_type = setting.get("type", setting.get("value_type", "unknown"))
settings_summary.append(
f" - {setting['key']}: {setting['value']} ({value_type})"
)
settings_summary.append(f" - {setting['key']}: {setting['value']} ({value_type})")
# Support both "type" (from config_extractor) and "config_type" (legacy)
config_type = cf.get("type", cf.get("config_type", "unknown"))
@@ -306,7 +304,9 @@ Focus on actionable insights that help developers understand and improve their c
config_type = cf.get("type", cf.get("config_type", "unknown"))
settings_preview = []
for s in cf.get("settings", [])[:3]: # Show first 3 settings
settings_preview.append(f" - {s.get('key', 'unknown')}: {str(s.get('value', ''))[:50]}")
settings_preview.append(
f" - {s.get('key', 'unknown')}: {str(s.get('value', ''))[:50]}"
)
config_data.append(f"""
### {cf["relative_path"]} ({config_type})
@@ -431,9 +431,7 @@ DO NOT explain your work - just write the JSON file directly.
potential_files.append(json_file)
# Try to load the most recent JSON file with expected structure
for json_file in sorted(
potential_files, key=lambda f: f.stat().st_mtime, reverse=True
):
for json_file in sorted(potential_files, key=lambda f: f.stat().st_mtime, reverse=True):
try:
with open(json_file) as f:
data = json.load(f)

View File

@@ -8,7 +8,6 @@ when local config files are not found.
import json
import logging
from pathlib import Path
from typing import Optional
import httpx
@@ -22,7 +21,7 @@ _last_searched_paths = []
def fetch_config_from_api(
config_name: str, destination: str = "configs", timeout: float = 30.0
) -> Optional[Path]:
) -> Path | None:
"""
Fetch a config file from the SkillSeekersWeb.com API.
@@ -65,12 +64,10 @@ def fetch_config_from_api(
# Download the actual config file using download_url from API response
download_url = config_info.get("download_url")
if not download_url:
logger.error(
f"❌ Config '{config_name}' has no download_url. Contact support."
)
logger.error(f"❌ Config '{config_name}' has no download_url. Contact support.")
return None
logger.info(f"📥 Downloading config from API...")
logger.info("📥 Downloading config from API...")
download_response = client.get(download_url)
download_response.raise_for_status()
config_data = download_response.json()
@@ -84,9 +81,7 @@ def fetch_config_from_api(
json.dump(config_data, f, indent=2)
logger.info(f"✅ Config downloaded successfully: {config_file}")
logger.info(
f" Category: {config_info.get('category', 'uncategorized')}"
)
logger.info(f" Category: {config_info.get('category', 'uncategorized')}")
logger.info(f" Type: {config_info.get('type', 'unknown')}")
return config_file
@@ -102,7 +97,7 @@ def fetch_config_from_api(
return None
def list_available_configs(category: Optional[str] = None, timeout: float = 30.0) -> list[str]:
def list_available_configs(category: str | None = None, timeout: float = 30.0) -> list[str]:
"""
List all available configs from the API.
@@ -135,7 +130,7 @@ def list_available_configs(category: Optional[str] = None, timeout: float = 30.0
return []
def resolve_config_path(config_path: str, auto_fetch: bool = True) -> Optional[Path]:
def resolve_config_path(config_path: str, auto_fetch: bool = True) -> Path | None:
"""
Resolve config path with automatic API fallback.
@@ -196,7 +191,7 @@ def resolve_config_path(config_path: str, auto_fetch: bool = True) -> Optional[P
config_name = config_name[8:]
logger.info(
f"\n💡 Config not found locally, attempting to fetch from SkillSeekersWeb.com API..."
"\n💡 Config not found locally, attempting to fetch from SkillSeekersWeb.com API..."
)
fetched_path = fetch_config_from_api(config_name, destination="configs")
if fetched_path and fetched_path.exists():

View File

@@ -1834,7 +1834,9 @@ def load_config(config_path: str) -> dict[str, Any]:
except ValueError as e:
logger.error("❌ Configuration validation errors in %s:", config_path)
logger.error(" %s", str(e))
logger.error("\n Suggestion: Fix the above errors or check https://skillseekersweb.com/ for examples")
logger.error(
"\n Suggestion: Fix the above errors or check https://skillseekersweb.com/ for examples"
)
sys.exit(1)
return config

View File

@@ -869,10 +869,16 @@ class HowToGuideBuilder:
# Filter to workflow examples only
workflows = self._extract_workflow_examples(examples)
logger.info(f"Found {len(workflows)} workflow examples")
logger.info(f"Found {len(workflows)} workflow examples (from {len(examples)} total)")
if not workflows:
logger.warning("No workflow examples found!")
# Log categories for debugging
categories = {ex.get("category", "unknown") for ex in examples}
logger.warning(f"No workflow examples found! Categories in input: {categories}")
logger.info(
"Tip: Workflow detection requires keywords like 'workflow', 'integration', 'e2e' in test names,"
)
logger.info(" or tests with 4+ assignments and 3+ method calls")
return GuideCollection(
total_guides=0,
guides_by_complexity={},

View File

@@ -288,7 +288,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
analyze_parser.add_argument(
"--comprehensive",
action="store_true",
help="Comprehensive analysis (20-60 min, all features + AI)"
help="Comprehensive analysis (20-60 min, all features + AI)",
)
analyze_parser.add_argument(
"--depth",
@@ -300,22 +300,32 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
)
analyze_parser.add_argument("--file-patterns", help="Comma-separated file patterns")
analyze_parser.add_argument(
"--enhance", action="store_true", help="Enable AI enhancement (default level 1 = SKILL.md only)"
"--enhance",
action="store_true",
help="Enable AI enhancement (default level 1 = SKILL.md only)",
)
analyze_parser.add_argument(
"--enhance-level",
type=int,
choices=[0, 1, 2, 3],
default=None,
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full"
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full",
)
analyze_parser.add_argument("--skip-api-reference", action="store_true", help="Skip API docs")
analyze_parser.add_argument("--skip-dependency-graph", action="store_true", help="Skip dep graph")
analyze_parser.add_argument("--skip-patterns", action="store_true", help="Skip pattern detection")
analyze_parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
analyze_parser.add_argument(
"--skip-dependency-graph", action="store_true", help="Skip dep graph"
)
analyze_parser.add_argument(
"--skip-patterns", action="store_true", help="Skip pattern detection"
)
analyze_parser.add_argument(
"--skip-test-examples", action="store_true", help="Skip test examples"
)
analyze_parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
analyze_parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
analyze_parser.add_argument("--skip-docs", action="store_true", help="Skip project docs (README, docs/)")
analyze_parser.add_argument(
"--skip-docs", action="store_true", help="Skip project docs (README, docs/)"
)
analyze_parser.add_argument("--no-comments", action="store_true", help="Skip comments")
analyze_parser.add_argument("--verbose", action="store_true", help="Verbose logging")
@@ -559,13 +569,16 @@ def main(argv: list[str] | None = None) -> int:
# Handle preset flags (depth and features)
if args.quick:
# Quick = surface depth + skip advanced features + no AI
sys.argv.extend([
"--depth", "surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
])
sys.argv.extend(
[
"--depth",
"surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
]
)
elif args.comprehensive:
# Comprehensive = full depth + all features (AI level is separate)
sys.argv.extend(["--depth", "full"])
@@ -582,6 +595,7 @@ def main(argv: list[str] | None = None) -> int:
# Use default from config (default: 1)
try:
from skill_seekers.cli.config_manager import get_config_manager
config = get_config_manager()
enhance_level = config.get_default_enhance_level()
except Exception:

View File

@@ -792,8 +792,9 @@ class PDFExtractor:
# Use "text" format with layout info for PyMuPDF 1.24+
try:
markdown = page.get_text("markdown")
except (AssertionError, ValueError):
# Fallback to text format for older/newer PyMuPDF versions
except (AssertionError, ValueError, RuntimeError, TypeError, AttributeError):
# Fallback to text format for incompatible PyMuPDF versions
# Some versions don't support "markdown" format or have internal errors
markdown = page.get_text(
"text",
flags=fitz.TEXT_PRESERVE_WHITESPACE

View File

@@ -577,8 +577,36 @@ class PythonTestAnalyzer:
def _is_integration_test(self, func_node: ast.FunctionDef) -> bool:
"""Check if test looks like an integration test"""
test_name = func_node.name.lower()
integration_keywords = ["workflow", "integration", "end_to_end", "e2e", "full"]
return any(keyword in test_name for keyword in integration_keywords)
# Expanded keyword list for better workflow detection
integration_keywords = [
"workflow",
"integration",
"end_to_end",
"e2e",
"full",
"complete",
"scenario",
"flow",
"multi_step",
"multistep",
"process",
"chain",
"sequence",
"pipeline",
"lifecycle",
]
# Check test name for keywords
if any(keyword in test_name for keyword in integration_keywords):
return True
# Heuristic: tests with 4+ assignments and 3+ calls are likely workflows
assignments = sum(
1 for n in ast.walk(func_node) if isinstance(n, (ast.Assign, ast.AugAssign))
)
calls = sum(1 for n in ast.walk(func_node) if isinstance(n, ast.Call))
return assignments >= 4 and calls >= 3
def _extract_assertion_after(self, func_node: ast.FunctionDef, target_node: ast.AST) -> str:
"""Find assertion that follows the target node"""
@@ -771,7 +799,11 @@ class GenericTestAnalyzer:
# Find next method (setup or test)
next_pattern = patterns.get("setup", patterns["test_function"])
next_setup = re.search(next_pattern, code[setup_start:])
setup_end = setup_start + next_setup.start() if next_setup else min(setup_start + 500, len(code))
setup_end = (
setup_start + next_setup.start()
if next_setup
else min(setup_start + 500, len(code))
)
setup_body = code[setup_start:setup_end]
example = self._create_example(

View File

@@ -616,7 +616,8 @@ This skill combines knowledge from multiple sources:
if isinstance(github_data, dict):
github_data = github_data.get("data", {})
elif isinstance(github_data, list) and len(github_data) > 0:
github_data = github_data[0].get("data", {})
first_item = github_data[0]
github_data = first_item.get("data", {}) if isinstance(first_item, dict) else {}
else:
github_data = {}

View File

@@ -11,7 +11,7 @@ Tools are organized by functionality:
- source_tools: Config source management (fetch, submit, add/remove sources)
"""
__version__ = "2.7.2"
__version__ = "2.7.4"
from .config_tools import (
generate_config as generate_config_impl,