feat: remove content truncation in reference files

2025-10-24 18:15:20 +04:00
parent ac959d3ed5
commit b98457dfb1
2 changed files with 56 additions and 10 deletions
--- a/cli/doc_scraper.py
+++ b/cli/doc_scraper.py
@@ -740,15 +740,12 @@ class DocToSkillConverter:
                    lines.append(f"{indent}- {h['text']}")
                lines.append("")
            
-            # Content
+            # Content (NO TRUNCATION)
            if page.get('content'):
-                content = page['content'][:2500]
-                if len(page['content']) > 2500:
-                    content += "\n\n*[Content truncated]*"
-                lines.append(content)
+                lines.append(page['content'])
                lines.append("")
-            
-            # Code examples with language
+
+            # Code examples with language (first 4 samples, each untruncated)
            if page.get('code_samples'):
                lines.append("**Examples:**\n")
                for i, sample in enumerate(page['code_samples'][:4], 1):
@@ -756,9 +753,7 @@ class DocToSkillConverter:
                    code = sample.get('code', sample if isinstance(sample, str) else '')
                    lines.append(f"Example {i} ({lang}):")
                    lines.append(f"```{lang}")
-                    lines.append(code[:600])
-                    if len(code) > 600:
-                        lines.append("...")
+                    lines.append(code)  # Full code, no truncation
                    lines.append("```\n")
            
            lines.append("---\n")
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -591,6 +591,57 @@ app.use('*', cors())
        shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True)
        shutil.rmtree(f"output/{config['name']}", ignore_errors=True)

+def test_no_content_truncation():
+    """Reference files must contain page content and code samples in full.
+
+    Builds one page whose content (5000 chars) and code sample (1000 chars)
+    exceed the former 2500/600-character limits, writes it through
+    ``create_reference_file`` and checks that the full text is present and
+    that neither truncation marker ("[Content truncated]", "...") appears.
+    """
+    from cli.doc_scraper import DocToSkillConverter
+
+    config = {
+        'name': 'test-no-truncate',
+        'base_url': 'https://example.com/docs',
+        'selectors': {
+            'main_content': 'article',
+            'title': 'h1',
+            'code_blocks': 'pre code'
+        },
+        'max_pages': 50
+    }
+
+    # Sizes chosen to exceed the old limits (2500 for content, 600 for code).
+    long_content = "x" * 5000
+    long_code = "y" * 1000
+    pages = [{
+        'title': 'Long Page',
+        'url': 'https://example.com/long',
+        'content': long_content,
+        'code_samples': [
+            {'code': long_code, 'language': 'python'}
+        ],
+        'headings': []
+    }]
+
+    try:
+        # Constructed inside try so cleanup also runs if setup fails.
+        scraper = DocToSkillConverter(config, dry_run=False)
+        scraper.create_reference_file('test', pages)
+
+        ref_file = Path(f"output/{config['name']}/references/test.md")
+        content = ref_file.read_text(encoding='utf-8')
+
+        assert long_content in content  # Full content included
+        assert long_code in content     # Full code included
+        assert '[Content truncated]' not in content
+        assert '...' not in content
+    finally:
+        # Always remove generated output, even when an assertion fails.
+        shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True)
+        shutil.rmtree(f"output/{config['name']}", ignore_errors=True)
+

 if __name__ == '__main__':
    unittest.main()