feat: remove content truncation in reference files

This commit is contained in:
Edgar I.
2025-10-24 18:15:20 +04:00
parent ac959d3ed5
commit b98457dfb1
2 changed files with 56 additions and 10 deletions

View File

@@ -740,15 +740,12 @@ class DocToSkillConverter:
lines.append(f"{indent}- {h['text']}")
lines.append("")
# Content
# Content (NO TRUNCATION)
if page.get('content'):
content = page['content'][:2500]
if len(page['content']) > 2500:
content += "\n\n*[Content truncated]*"
lines.append(content)
lines.append(page['content'])
lines.append("")
# Code examples with language
# Code examples with language (NO TRUNCATION)
if page.get('code_samples'):
lines.append("**Examples:**\n")
for i, sample in enumerate(page['code_samples'][:4], 1):
@@ -756,9 +753,7 @@ class DocToSkillConverter:
code = sample.get('code', sample if isinstance(sample, str) else '')
lines.append(f"Example {i} ({lang}):")
lines.append(f"```{lang}")
lines.append(code[:600])
if len(code) > 600:
lines.append("...")
lines.append(code) # Full code, no truncation
lines.append("```\n")
lines.append("---\n")

View File

@@ -591,6 +591,57 @@ app.use('*', cors())
shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True)
shutil.rmtree(f"output/{config['name']}", ignore_errors=True)
def test_no_content_truncation():
    """Test that content is NOT truncated in reference files.

    Builds one page whose body (5000 characters) and code sample
    (1000 characters) are long enough that a length cap would be visible,
    writes it through ``DocToSkillConverter.create_reference_file`` and
    checks that the generated ``references/test.md`` contains both strings
    in full, with no ``[Content truncated]`` marker and no ``...`` ellipsis.

    Output directories created under ``output/`` are removed even when an
    assertion fails, so a failing run does not leak files into later tests.
    """
    # Project-local import kept inside the test, as in the original.
    from cli.doc_scraper import DocToSkillConverter

    config = {
        'name': 'test-no-truncate',
        'base_url': 'https://example.com/docs',
        'selectors': {
            'main_content': 'article',
            'title': 'h1',
            'code_blocks': 'pre code',
        },
        'max_pages': 50,
    }

    # Distinct filler characters so content and code can be told apart
    # when searching the generated file.
    long_content = "x" * 5000
    long_code = "y" * 1000
    pages = [{
        'title': 'Long Page',
        'url': 'https://example.com/long',
        'content': long_content,
        'code_samples': [
            {'code': long_code, 'language': 'python'},
        ],
        'headings': [],
    }]

    try:
        scraper = DocToSkillConverter(config, dry_run=False)

        # Create reference file
        scraper.create_reference_file('test', pages)

        # Verify no truncation
        ref_file = Path(f"output/{config['name']}/references/test.md")
        content = ref_file.read_text(encoding='utf-8')

        assert long_content in content  # Full content included
        assert long_code in content  # Full code included
        assert '[Content truncated]' not in content
        assert '...' not in content  # No ellipsis marker left behind
    finally:
        # Clean up regardless of the outcome of the assertions above.
        shutil.rmtree(f"output/{config['name']}_data", ignore_errors=True)
        shutil.rmtree(f"output/{config['name']}", ignore_errors=True)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()