fix: Resolve PDF processing (#267), How-To Guide (#242), and Chinese README (#260) issues; improve code quality (#273)

Thanks @franklegolasyoung for the excellent work on the core fixes for issues #267, #242, and #260! 🙏

Your thorough approach to fixing PDF processing, expanding workflow detection, and improving the Chinese README documentation is much appreciated. On top of your changes, I've added code quality fixes and comprehensive tests to ensure everything passes CI.

All 1266+ tests are now passing, and the issues are resolved! 🎉
This commit is contained in:
yusyus
2026-01-31 21:30:00 +03:00
committed by GitHub
parent f726a9abc5
commit 91bd2184e5
19 changed files with 622 additions and 174 deletions

View File

@@ -377,11 +377,13 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
if header_match:
level = len(header_match.group(1))
text = header_match.group(2).strip()
structure["headers"].append({
"level": level,
"text": text,
"line": i + 1,
})
structure["headers"].append(
{
"level": level,
"text": text,
"line": i + 1,
}
)
# First h1 is the title
if level == 1 and structure["title"] is None:
structure["title"] = text
@@ -392,24 +394,30 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
language = match.group(1) or "text"
code = match.group(2).strip()
if len(code) > 0:
structure["code_blocks"].append({
"language": language,
"code": code[:500], # Truncate long code blocks
"full_length": len(code),
})
structure["code_blocks"].append(
{
"language": language,
"code": code[:500], # Truncate long code blocks
"full_length": len(code),
}
)
# Extract links
link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
for match in link_pattern.finditer(content):
structure["links"].append({
"text": match.group(1),
"url": match.group(2),
})
structure["links"].append(
{
"text": match.group(1),
"url": match.group(2),
}
)
return structure
def generate_markdown_summary(content: str, structure: dict[str, Any], max_length: int = 500) -> str:
def generate_markdown_summary(
content: str, structure: dict[str, Any], max_length: int = 500
) -> str:
"""
Generate a summary of markdown content.
@@ -522,12 +530,14 @@ def process_markdown_docs(
structure = extract_markdown_structure(content)
summary = generate_markdown_summary(content, structure)
doc_data.update({
"title": structure.get("title") or md_path.stem,
"structure": structure,
"summary": summary,
"content": content if depth == "full" else None,
})
doc_data.update(
{
"title": structure.get("title") or md_path.stem,
"structure": structure,
"summary": summary,
"content": content if depth == "full" else None,
}
)
processed_docs.append(doc_data)
# Track categories
@@ -563,6 +573,7 @@ def process_markdown_docs(
# Copy file to category folder
dest_path = category_dir / doc["filename"]
import shutil
shutil.copy2(src_path, dest_path)
except Exception as e:
logger.debug(f"Failed to copy {doc['path']}: {e}")
@@ -578,7 +589,9 @@ def process_markdown_docs(
with open(index_json, "w", encoding="utf-8") as f:
json.dump(index_data, f, indent=2, default=str)
logger.info(f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories")
logger.info(
f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories"
)
logger.info(f"📁 Saved to: {docs_output_dir}")
return index_data
@@ -612,18 +625,22 @@ def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
"""Enhance docs using Claude API."""
try:
import anthropic
client = anthropic.Anthropic(api_key=api_key)
# Batch documents for efficiency
batch_size = 10
for i in range(0, len(docs), batch_size):
batch = docs[i:i + batch_size]
batch = docs[i : i + batch_size]
# Create prompt for batch
docs_text = "\n\n".join([
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
for d in batch if d.get("summary")
])
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
for d in batch
if d.get("summary")
]
)
if not docs_text:
continue
@@ -642,12 +659,13 @@ Return JSON with format:
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=2000,
messages=[{"role": "user", "content": prompt}]
messages=[{"role": "user", "content": prompt}],
)
# Parse response and merge enhancements
try:
import re
json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -676,10 +694,12 @@ def _enhance_docs_local(docs: list[dict]) -> list[dict]:
if not docs_with_summary:
return docs
docs_text = "\n\n".join([
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
])
docs_text = "\n\n".join(
[
f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
for d in docs_with_summary[:20] # Limit to 20 docs
]
)
prompt = f"""Analyze these documentation files from a codebase and provide insights.
@@ -710,6 +730,7 @@ Output JSON only:
if result.returncode == 0 and result.stdout:
import re
json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -777,7 +798,9 @@ def analyze_codebase(
if enhance_level > 0:
level_names = {1: "SKILL.md only", 2: "SKILL.md+Architecture+Config", 3: "full"}
logger.info(f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})")
logger.info(
f"🤖 AI Enhancement Level: {enhance_level} ({level_names.get(enhance_level, 'unknown')})"
)
# Resolve directory to absolute path to avoid relative_to() errors
directory = Path(directory).resolve()
@@ -1341,7 +1364,9 @@ Use this skill when you need to:
skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
refs_added = True
if extract_docs and (output_dir / "documentation").exists():
skill_content += "- **Documentation**: `references/documentation/` - Project documentation\n"
skill_content += (
"- **Documentation**: `references/documentation/` - Project documentation\n"
)
refs_added = True
if not refs_added:
@@ -1590,7 +1615,15 @@ def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any])
content += f"**Categories:** {len(categories)}\n\n"
# List documents by category (most important first)
priority_order = ["overview", "architecture", "guides", "workflows", "features", "api", "examples"]
priority_order = [
"overview",
"architecture",
"guides",
"workflows",
"features",
"api",
"examples",
]
# Sort categories by priority
sorted_categories = []
@@ -1637,6 +1670,7 @@ def _format_documentation_section(_output_dir: Path, docs_data: dict[str, Any])
if all_topics:
# Deduplicate and count
from collections import Counter
topic_counts = Counter(all_topics)
top_topics = [t for t, _ in topic_counts.most_common(10)]
content += f"**Key Topics:** {', '.join(top_topics)}\n\n"
@@ -1829,7 +1863,12 @@ Examples:
args = parser.parse_args()
# Handle presets (Phase 1 feature - NEW)
if hasattr(args, "quick") and args.quick and hasattr(args, "comprehensive") and args.comprehensive:
if (
hasattr(args, "quick")
and args.quick
and hasattr(args, "comprehensive")
and args.comprehensive
):
logger.error("❌ Cannot use --quick and --comprehensive together. Choose one.")
return 1