Merge PR #198: Skip llms.txt Config Option

Merges feat/add-skip-llm-to-config by @sogoiii.

This PR adds a valuable configuration option to explicitly skip llms.txt detection. It is useful when a site's llms.txt is incomplete or incorrect, or when HTML scraping is specifically needed.

Key features:
- New 'skip_llms_txt' config option (default: false, backward compatible)
- Boolean type validation, with a warning logged for invalid values
- Support in both sync and async scraping modes
- 17 comprehensive tests (15 feature tests + 2 config validation tests)

All tests pass after fixing import paths to use proper package names.

Test results: ✅ 17/17 tests passing
Full test suite: ✅ 391 tests passing

Co-authored-by: sogoiii <sogoiii@users.noreply.github.com>
2025-11-29 22:56:46 +03:00
parent a75b612fb2 8031ce69ce
commit bd20b32470
4 changed files with 356 additions and 4 deletions
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -86,6 +86,15 @@ class DocToSkillConverter:
        self.checkpoint_interval = checkpoint_config.get('interval', DEFAULT_CHECKPOINT_INTERVAL)

        # llms.txt detection state
+        skip_llms_txt_value = config.get('skip_llms_txt', False)
+        if not isinstance(skip_llms_txt_value, bool):
+            logger.warning(
+                "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.",
+                skip_llms_txt_value
+            )
+            self.skip_llms_txt = False
+        else:
+            self.skip_llms_txt = skip_llms_txt_value
        self.llms_txt_detected = False
        self.llms_txt_variant = None
        self.llms_txt_variants: List[str] = []  # Track all downloaded variants
@@ -618,8 +627,8 @@ class DocToSkillConverter:
            asyncio.run(self.scrape_all_async())
            return

-        # Try llms.txt first (unless dry-run)
-        if not self.dry_run:
+        # Try llms.txt first (unless dry-run or explicitly disabled)
+        if not self.dry_run and not self.skip_llms_txt:
            llms_result = self._try_llms_txt()
            if llms_result:
                logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant)
@@ -778,8 +787,8 @@ class DocToSkillConverter:

        Performance: ~2-3x faster than sync mode with same worker count.
        """
-        # Try llms.txt first (unless dry-run)
-        if not self.dry_run:
+        # Try llms.txt first (unless dry-run or explicitly disabled)
+        if not self.dry_run and not self.skip_llms_txt:
            llms_result = self._try_llms_txt()
            if llms_result:
                logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant)