fix: remove arbitrary limits, fix hardcoded languages, and fix summarizer bugs

Stage 1 quality improvements from the Arbitrary Limits & Dead Code audit:

Reference file truncation removed:
- codebase_scraper.py: remove code[:500] truncation at 5 locations — reference files now contain complete code blocks for copy-paste usability
- unified_skill_builder.py: remove issues[:20], releases[:10], body[:500], and code_snippet[:300] caps in reference files — full content preserved

Enhancement summarizer rewrite:
- enhance_skill_local.py: replace arbitrary [:5] code block cap with character-budget approach using target_ratio * content_chars
- Fix intro boundary bug: track code block state so intro never ends inside a code block, which was desynchronizing the parser
- Remove dead _target_lines variable (assigned but never used)
- Heading chunks now also respect the character budget

Hardcoded language fixes:
- unified_skill_builder.py: test examples use ex.get("language", "text") instead of always "python" for syntax highlighting
- how_to_guide_builder.py: add language field to HowToGuide dataclass, set from workflow at creation, used in AI enhancement prompt

Test fixes:
- test_enhance_skill_local.py: rename test to test_code_blocks_not_arbitrarily_capped, fix assertion to count actual blocks (number of ``` fences // 2), use target_ratio=0.9

Documentation:
- Add Stage 1 plan, implementation summary, review, and corrected docs
- Update CHANGELOG.md with all changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 00:30:40 +03:00
parent b81d55fda0
commit b6d4dd8423
10 changed files with 1189 additions and 20 deletions
--- a/src/skill_seekers/cli/codebase_scraper.py
+++ b/src/skill_seekers/cli/codebase_scraper.py
@@ -419,7 +419,7 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
            structure["code_blocks"].append(
                {
                    "language": language,
-                    "code": code[:500],  # Truncate long code blocks
+                    "code": code,  # Full code - no truncation
                    "full_length": len(code),
                }
            )
@@ -486,7 +486,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
                "code_blocks": [
                    {
                        "language": cb.language or "text",
-                        "code": cb.code[:500] if len(cb.code) > 500 else cb.code,
+                        "code": cb.code,  # Full code - no truncation
                        "full_length": len(cb.code),
                        "quality_score": cb.quality_score,
                    }
@@ -572,7 +572,7 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
            structure["code_blocks"].append(
                {
                    "language": language,
-                    "code": code[:500],
+                    "code": code,  # Full code - no truncation
                    "full_length": len(code),
                }
            )
@@ -717,7 +717,7 @@ def process_markdown_docs(
                                    for h in parsed_doc.headings
                                ],
                                "code_blocks": [
-                                    {"language": cb.language, "code": cb.code[:500]}
+                                    {"language": cb.language, "code": cb.code}  # Full code
                                    for cb in parsed_doc.code_blocks
                                ],
                                "tables": len(parsed_doc.tables),
@@ -743,7 +743,7 @@ def process_markdown_docs(
                                    for h in parsed_doc.headings
                                ],
                                "code_blocks": [
-                                    {"language": cb.language, "code": cb.code[:500]}
+                                    {"language": cb.language, "code": cb.code}  # Full code
                                    for cb in parsed_doc.code_blocks
                                ],
                                "tables": len(parsed_doc.tables),
--- a/src/skill_seekers/cli/enhance_skill_local.py
+++ b/src/skill_seekers/cli/enhance_skill_local.py
@@ -306,10 +306,19 @@ class LocalSkillEnhancer:
            Summarized content
        """
        lines = content.split("\n")
-        _target_lines = int(len(lines) * target_ratio)

        # Priority 1: Keep introduction (first 20%)
        intro_lines = int(len(lines) * 0.2)
+
+        # Ensure intro doesn't cut inside a code block
+        in_block = False
+        safe_end = 0
+        for i in range(intro_lines):
+            if lines[i].strip().startswith("```"):
+                in_block = not in_block
+            if not in_block:
+                safe_end = i + 1
+        intro_lines = safe_end
        result_lines = lines[:intro_lines]

        # Priority 2: Extract code blocks
@@ -334,13 +343,21 @@ class LocalSkillEnhancer:
            elif in_code_block:
                current_block.append(line)

-        # Combine: intro + code blocks + headings
+        # Combine: intro + code blocks + headings within a character budget
        result = result_lines.copy()
+        # Budget is target_ratio of original content length
+        content_chars = len(content)
+        max_chars = int(content_chars * target_ratio)
+        current_chars = sum(len(line) for line in result)

-        # Add code blocks first (prioritize code examples)
-        for _idx, block in code_blocks[:5]:  # Max 5 code blocks
+        # Priority 2: Add code blocks first (prioritize code examples) - no arbitrary limit
+        for _idx, block in code_blocks:
+            block_chars = sum(len(line) for line in block) + 1  # +1 for blank line
+            if current_chars + block_chars > max_chars:
+                break
            result.append("")  # Add blank line before code block
            result.extend(block)
+            current_chars += block_chars

        # Priority 3: Keep headings with first paragraph
        i = intro_lines
@@ -350,8 +367,12 @@ class LocalSkillEnhancer:
            if line.startswith("#"):
                # Found heading - keep it and next 3 lines
                chunk = lines[i : min(i + 4, len(lines))]
+                chunk_chars = sum(len(l) for l in chunk)
+                if current_chars + chunk_chars > max_chars:
+                    break
                result.extend(chunk)
                headings_added += 1
+                current_chars += chunk_chars
                i += 4
            else:
                i += 1
--- a/src/skill_seekers/cli/how_to_guide_builder.py
+++ b/src/skill_seekers/cli/how_to_guide_builder.py
@@ -105,6 +105,7 @@ class HowToGuide:
    tags: list[str] = field(default_factory=list)
    estimated_time: str = "10 minutes"
    source_files: list[str] = field(default_factory=list)
+    language: str = "python"  # Source file language

    # Optional AI enhancement (basic)
    common_pitfalls: list[str] = field(default_factory=list)
@@ -966,6 +967,7 @@ class HowToGuideBuilder:
            tags=tags,
            estimated_time=metadata.get("estimated_time", "10 minutes"),
            source_files=source_files,
+            language=primary_workflow.get("language", "python"),
        )

        # Add AI enhancements if enhancer is available
@@ -1015,7 +1017,7 @@ class HowToGuideBuilder:
        guide_data = {
            "title": guide.title,
            "steps": [{"description": step.description, "code": step.code} for step in guide.steps],
-            "language": "python",  # TODO: Detect from code
+            "language": guide.language,
            "prerequisites": guide.prerequisites,
            "description": guide.overview,
        }
--- a/src/skill_seekers/cli/unified_skill_builder.py
+++ b/src/skill_seekers/cli/unified_skill_builder.py
@@ -907,7 +907,7 @@ This skill combines knowledge from multiple sources:
                    f.write(f"# GitHub Issues: {repo}\n\n")
                    f.write(f"{len(github_data['issues'])} recent issues.\n\n")

-                    for issue in github_data["issues"][:20]:
+                    for issue in github_data["issues"]:  # All issues, no arbitrary limit
                        f.write(f"## #{issue['number']}: {issue['title']}\n\n")
                        f.write(f"**State**: {issue['state']}\n")
                        if issue.get("labels"):
@@ -920,11 +920,11 @@ This skill combines knowledge from multiple sources:
                with open(releases_path, "w", encoding="utf-8") as f:
                    f.write(f"# Releases: {repo}\n\n")

-                    for release in github_data["releases"][:10]:
+                    for release in github_data["releases"]:  # All releases, no arbitrary limit
                        f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n")
                        f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n")
                        if release.get("body"):
-                            f.write(release["body"][:500])
+                            f.write(release["body"])  # Full release notes
                            f.write("\n\n")

            # Create index for this repo
@@ -1295,7 +1295,8 @@ This skill combines knowledge from multiple sources:
                    f.write(f"- **Confidence**: {ex.get('confidence', 0):.2f}\n")
                    f.write(f"- **File**: `{ex.get('file_path', 'N/A')}`\n")
                    if ex.get("code_snippet"):
-                        f.write(f"\n```python\n{ex['code_snippet'][:300]}\n```\n")
+                        lang = ex.get("language", "text")
+                        f.write(f"\n```{lang}\n{ex['code_snippet']}\n```\n")  # Full code, no truncation
                    f.write("\n")

        logger.info(f"   ✓ Test examples: {total} total, {high_value} high-value")