style: Format code with ruff
- Format 5 files affected by PDF scraper changes
- Ensures CI/CD code quality checks pass

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1921,7 +1921,9 @@ def setup_argument_parser() -> argparse.ArgumentParser:
|
||||
help="Load configuration from file (e.g., configs/godot.json)",
|
||||
)
|
||||
parser.add_argument("--name", type=str, help="Skill name")
|
||||
parser.add_argument("--url", type=str, help="Base documentation URL (alternative to positional URL)")
|
||||
parser.add_argument(
|
||||
"--url", type=str, help="Base documentation URL (alternative to positional URL)"
|
||||
)
|
||||
parser.add_argument("--description", "-d", type=str, help="Skill description")
|
||||
parser.add_argument(
|
||||
"--max-pages",
|
||||
@@ -2028,7 +2030,7 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"""
|
||||
# Handle URL from either positional argument or --url flag
|
||||
# Positional 'url' takes precedence, then --url flag
|
||||
effective_url = getattr(args, 'url', None)
|
||||
effective_url = getattr(args, "url", None)
|
||||
|
||||
# Get base configuration
|
||||
if args.config:
|
||||
@@ -2095,9 +2097,7 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
|
||||
logger.warning(
|
||||
"⚠️ --max-pages=%d is very high - scraping may take hours", args.max_pages
|
||||
)
|
||||
logger.warning(
|
||||
" Recommendation: Use configs with reasonable limits for production"
|
||||
)
|
||||
logger.warning(" Recommendation: Use configs with reasonable limits for production")
|
||||
elif args.max_pages < 10:
|
||||
logger.warning(
|
||||
"⚠️ --max-pages=%d is very low - may result in incomplete skill", args.max_pages
|
||||
|
||||
@@ -101,7 +101,9 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
scrape_parser.add_argument("--config", help="Config JSON file")
|
||||
scrape_parser.add_argument("--name", help="Skill name")
|
||||
scrape_parser.add_argument("--description", help="Skill description")
|
||||
scrape_parser.add_argument("--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)")
|
||||
scrape_parser.add_argument(
|
||||
"--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)"
|
||||
)
|
||||
scrape_parser.add_argument(
|
||||
"--skip-scrape", action="store_true", help="Skip scraping, use cached data"
|
||||
)
|
||||
@@ -157,7 +159,9 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
)
|
||||
unified_parser.add_argument("--config", required=True, help="Unified config JSON file")
|
||||
unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)")
|
||||
unified_parser.add_argument("--fresh", action="store_true", help="Clear existing data and start fresh")
|
||||
unified_parser.add_argument(
|
||||
"--fresh", action="store_true", help="Clear existing data and start fresh"
|
||||
)
|
||||
unified_parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
|
||||
|
||||
# === enhance subcommand ===
|
||||
@@ -343,7 +347,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
# Convert args namespace to sys.argv format for doc_scraper
|
||||
sys.argv = ["doc_scraper.py"]
|
||||
# Add positional URL if provided (positional arg has priority)
|
||||
if hasattr(args, 'url') and args.url:
|
||||
if hasattr(args, "url") and args.url:
|
||||
sys.argv.append(args.url)
|
||||
if args.config:
|
||||
sys.argv.extend(["--config", args.config])
|
||||
@@ -351,7 +355,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
sys.argv.extend(["--name", args.name])
|
||||
if args.description:
|
||||
sys.argv.extend(["--description", args.description])
|
||||
if hasattr(args, 'max_pages') and args.max_pages:
|
||||
if hasattr(args, "max_pages") and args.max_pages:
|
||||
sys.argv.extend(["--max-pages", str(args.max_pages)])
|
||||
if args.skip_scrape:
|
||||
sys.argv.append("--skip-scrape")
|
||||
@@ -548,7 +552,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
|
||||
# Show traceback in verbose mode (if -v flag exists in args)
|
||||
import traceback
|
||||
if hasattr(args, 'verbose') and getattr(args, 'verbose', False):
|
||||
|
||||
if hasattr(args, "verbose") and getattr(args, "verbose", False):
|
||||
traceback.print_exc()
|
||||
|
||||
return 1
|
||||
|
||||
@@ -794,7 +794,12 @@ class PDFExtractor:
|
||||
markdown = page.get_text("markdown")
|
||||
except (AssertionError, ValueError):
|
||||
# Fallback to text format for older/newer PyMuPDF versions
|
||||
markdown = page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_SPANS)
|
||||
markdown = page.get_text(
|
||||
"text",
|
||||
flags=fitz.TEXT_PRESERVE_WHITESPACE
|
||||
| fitz.TEXT_PRESERVE_LIGATURES
|
||||
| fitz.TEXT_PRESERVE_SPANS,
|
||||
)
|
||||
|
||||
# Extract tables (Priority 2)
|
||||
tables = self.extract_tables_from_page(page)
|
||||
|
||||
@@ -141,7 +141,7 @@ class PDFToSkillConverter:
|
||||
|
||||
categorized[category_key] = {
|
||||
"title": pdf_basename,
|
||||
"pages": self.extracted_data.get("pages", [])
|
||||
"pages": self.extracted_data.get("pages", []),
|
||||
}
|
||||
|
||||
print("✅ Created 1 category (single PDF source)")
|
||||
@@ -176,7 +176,7 @@ class PDFToSkillConverter:
|
||||
if uncategorized_pages:
|
||||
categorized["uncategorized"] = {
|
||||
"title": "Additional Content",
|
||||
"pages": uncategorized_pages
|
||||
"pages": uncategorized_pages,
|
||||
}
|
||||
|
||||
# Fall back to keyword-based categorization
|
||||
@@ -282,7 +282,11 @@ class PDFToSkillConverter:
|
||||
|
||||
# If only one section or section covers most pages, use simple name
|
||||
if total_sections == 1:
|
||||
filename = f"{self.skill_dir}/references/{pdf_basename}.md" if pdf_basename else f"{self.skill_dir}/references/main.md"
|
||||
filename = (
|
||||
f"{self.skill_dir}/references/{pdf_basename}.md"
|
||||
if pdf_basename
|
||||
else f"{self.skill_dir}/references/main.md"
|
||||
)
|
||||
else:
|
||||
# Multiple sections: use PDF basename + page range
|
||||
base_name = pdf_basename if pdf_basename else "section"
|
||||
@@ -376,7 +380,9 @@ class PDFToSkillConverter:
|
||||
link_filename = f"section_{section_num:02d}.md"
|
||||
page_range_str = "N/A"
|
||||
|
||||
f.write(f"- [{cat_data['title']}]({link_filename}) ({page_count} pages, {page_range_str})\n")
|
||||
f.write(
|
||||
f"- [{cat_data['title']}]({link_filename}) ({page_count} pages, {page_range_str})\n"
|
||||
)
|
||||
section_num += 1
|
||||
|
||||
f.write("\n## Statistics\n\n")
|
||||
@@ -693,6 +699,7 @@ def main():
|
||||
except Exception as e:
|
||||
print(f"\n❌ Unexpected error during PDF processing: {e}", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user