475 lines
17 KiB
Python
475 lines
17 KiB
Python
"""
|
|
Tests for multi-source support in unified scraper and skill builder.
|
|
|
|
Tests the following functionality:
|
|
1. Multiple sources of same type in unified_scraper (list structure)
|
|
2. Source counters and unique naming
|
|
3. Per-source reference directory generation in unified_skill_builder
|
|
4. Multiple documentation sources handling
|
|
5. Multiple GitHub repositories handling
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import unittest
|
|
|
|
|
|
class TestUnifiedScraperDataStructure(unittest.TestCase):
|
|
"""Test scraped_data list structure in unified_scraper."""
|
|
|
|
def test_scraped_data_uses_list_structure(self):
|
|
"""Test that scraped_data uses list for each source type."""
|
|
from skill_seekers.cli.unified_scraper import UnifiedScraper
|
|
|
|
config = {
|
|
"name": "test_multi",
|
|
"description": "Test skill",
|
|
"sources": [{"type": "documentation", "base_url": "https://example.com"}],
|
|
}
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
original_dir = os.getcwd()
|
|
try:
|
|
os.chdir(temp_dir)
|
|
scraper = UnifiedScraper(config)
|
|
|
|
self.assertIsInstance(scraper.scraped_data["documentation"], list)
|
|
self.assertIsInstance(scraper.scraped_data["github"], list)
|
|
self.assertIsInstance(scraper.scraped_data["pdf"], list)
|
|
finally:
|
|
os.chdir(original_dir)
|
|
|
|
def test_source_counters_initialized_to_zero(self):
|
|
"""Test that source counters start at zero."""
|
|
from skill_seekers.cli.unified_scraper import UnifiedScraper
|
|
|
|
config = {
|
|
"name": "test_counters",
|
|
"description": "Test skill",
|
|
"sources": [{"type": "documentation", "base_url": "https://example.com"}],
|
|
}
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
original_dir = os.getcwd()
|
|
try:
|
|
os.chdir(temp_dir)
|
|
scraper = UnifiedScraper(config)
|
|
|
|
self.assertEqual(scraper._source_counters["documentation"], 0)
|
|
self.assertEqual(scraper._source_counters["github"], 0)
|
|
self.assertEqual(scraper._source_counters["pdf"], 0)
|
|
finally:
|
|
os.chdir(original_dir)
|
|
|
|
def test_empty_lists_initially(self):
|
|
"""Test that source lists are empty initially."""
|
|
from skill_seekers.cli.unified_scraper import UnifiedScraper
|
|
|
|
config = {
|
|
"name": "test_empty",
|
|
"description": "Test skill",
|
|
"sources": [{"type": "documentation", "base_url": "https://example.com"}],
|
|
}
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
original_dir = os.getcwd()
|
|
try:
|
|
os.chdir(temp_dir)
|
|
scraper = UnifiedScraper(config)
|
|
|
|
self.assertEqual(len(scraper.scraped_data["documentation"]), 0)
|
|
self.assertEqual(len(scraper.scraped_data["github"]), 0)
|
|
self.assertEqual(len(scraper.scraped_data["pdf"]), 0)
|
|
finally:
|
|
os.chdir(original_dir)
|
|
|
|
|
|
class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase):
|
|
"""Test documentation reference generation for multiple sources."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
self.temp_dir = tempfile.mkdtemp()
|
|
self.original_dir = os.getcwd()
|
|
os.chdir(self.temp_dir)
|
|
|
|
def tearDown(self):
|
|
"""Clean up test fixtures."""
|
|
os.chdir(self.original_dir)
|
|
if os.path.exists(self.temp_dir):
|
|
shutil.rmtree(self.temp_dir)
|
|
|
|
def test_creates_subdirectory_per_source(self):
|
|
"""Test that each doc source gets its own subdirectory."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
# Create mock refs directories
|
|
refs_dir1 = os.path.join(self.temp_dir, "refs1")
|
|
refs_dir2 = os.path.join(self.temp_dir, "refs2")
|
|
os.makedirs(refs_dir1)
|
|
os.makedirs(refs_dir2)
|
|
|
|
config = {"name": "test_docs_refs", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [
|
|
{
|
|
"source_id": "source_a",
|
|
"base_url": "https://a.com",
|
|
"total_pages": 5,
|
|
"refs_dir": refs_dir1,
|
|
},
|
|
{
|
|
"source_id": "source_b",
|
|
"base_url": "https://b.com",
|
|
"total_pages": 3,
|
|
"refs_dir": refs_dir2,
|
|
},
|
|
],
|
|
"github": [],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_docs_references(scraped_data["documentation"])
|
|
|
|
docs_dir = os.path.join(builder.skill_dir, "references", "documentation")
|
|
self.assertTrue(os.path.exists(os.path.join(docs_dir, "source_a")))
|
|
self.assertTrue(os.path.exists(os.path.join(docs_dir, "source_b")))
|
|
|
|
def test_creates_index_per_source(self):
|
|
"""Test that each source subdirectory has its own index.md."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
refs_dir = os.path.join(self.temp_dir, "refs")
|
|
os.makedirs(refs_dir)
|
|
|
|
config = {"name": "test_source_index", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [
|
|
{
|
|
"source_id": "my_source",
|
|
"base_url": "https://example.com",
|
|
"total_pages": 10,
|
|
"refs_dir": refs_dir,
|
|
}
|
|
],
|
|
"github": [],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_docs_references(scraped_data["documentation"])
|
|
|
|
source_index = os.path.join(
|
|
builder.skill_dir, "references", "documentation", "my_source", "index.md"
|
|
)
|
|
self.assertTrue(os.path.exists(source_index))
|
|
|
|
with open(source_index) as f:
|
|
content = f.read()
|
|
self.assertIn("my_source", content)
|
|
self.assertIn("https://example.com", content)
|
|
|
|
def test_creates_main_index_listing_all_sources(self):
|
|
"""Test that main index.md lists all documentation sources."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
refs_dir1 = os.path.join(self.temp_dir, "refs1")
|
|
refs_dir2 = os.path.join(self.temp_dir, "refs2")
|
|
os.makedirs(refs_dir1)
|
|
os.makedirs(refs_dir2)
|
|
|
|
config = {"name": "test_main_index", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [
|
|
{
|
|
"source_id": "docs_one",
|
|
"base_url": "https://one.com",
|
|
"total_pages": 10,
|
|
"refs_dir": refs_dir1,
|
|
},
|
|
{
|
|
"source_id": "docs_two",
|
|
"base_url": "https://two.com",
|
|
"total_pages": 20,
|
|
"refs_dir": refs_dir2,
|
|
},
|
|
],
|
|
"github": [],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_docs_references(scraped_data["documentation"])
|
|
|
|
main_index = os.path.join(builder.skill_dir, "references", "documentation", "index.md")
|
|
self.assertTrue(os.path.exists(main_index))
|
|
|
|
with open(main_index) as f:
|
|
content = f.read()
|
|
self.assertIn("docs_one", content)
|
|
self.assertIn("docs_two", content)
|
|
self.assertIn("2 documentation sources", content)
|
|
|
|
def test_copies_reference_files_to_source_dir(self):
|
|
"""Test that reference files are copied to source subdirectory."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
refs_dir = os.path.join(self.temp_dir, "refs")
|
|
os.makedirs(refs_dir)
|
|
|
|
# Create mock reference files
|
|
with open(os.path.join(refs_dir, "api.md"), "w") as f:
|
|
f.write("# API Reference")
|
|
with open(os.path.join(refs_dir, "guide.md"), "w") as f:
|
|
f.write("# User Guide")
|
|
|
|
config = {"name": "test_copy_refs", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [
|
|
{
|
|
"source_id": "test_source",
|
|
"base_url": "https://test.com",
|
|
"total_pages": 5,
|
|
"refs_dir": refs_dir,
|
|
}
|
|
],
|
|
"github": [],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_docs_references(scraped_data["documentation"])
|
|
|
|
source_dir = os.path.join(builder.skill_dir, "references", "documentation", "test_source")
|
|
self.assertTrue(os.path.exists(os.path.join(source_dir, "api.md")))
|
|
self.assertTrue(os.path.exists(os.path.join(source_dir, "guide.md")))
|
|
|
|
|
|
class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase):
|
|
"""Test GitHub reference generation for multiple repositories."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
self.temp_dir = tempfile.mkdtemp()
|
|
self.original_dir = os.getcwd()
|
|
os.chdir(self.temp_dir)
|
|
|
|
def tearDown(self):
|
|
"""Clean up test fixtures."""
|
|
os.chdir(self.original_dir)
|
|
if os.path.exists(self.temp_dir):
|
|
shutil.rmtree(self.temp_dir)
|
|
|
|
def test_creates_subdirectory_per_repo(self):
|
|
"""Test that each GitHub repo gets its own subdirectory."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
config = {"name": "test_github_refs", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [],
|
|
"github": [
|
|
{
|
|
"repo": "org/repo1",
|
|
"repo_id": "org_repo1",
|
|
"data": {"readme": "# Repo 1", "issues": [], "releases": [], "repo_info": {}},
|
|
},
|
|
{
|
|
"repo": "org/repo2",
|
|
"repo_id": "org_repo2",
|
|
"data": {"readme": "# Repo 2", "issues": [], "releases": [], "repo_info": {}},
|
|
},
|
|
],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_github_references(scraped_data["github"])
|
|
|
|
github_dir = os.path.join(builder.skill_dir, "references", "github")
|
|
self.assertTrue(os.path.exists(os.path.join(github_dir, "org_repo1")))
|
|
self.assertTrue(os.path.exists(os.path.join(github_dir, "org_repo2")))
|
|
|
|
def test_creates_readme_per_repo(self):
|
|
"""Test that README.md is created for each repo."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
config = {"name": "test_readme", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [],
|
|
"github": [
|
|
{
|
|
"repo": "test/myrepo",
|
|
"repo_id": "test_myrepo",
|
|
"data": {
|
|
"readme": "# My Repository\n\nDescription here.",
|
|
"issues": [],
|
|
"releases": [],
|
|
"repo_info": {},
|
|
},
|
|
}
|
|
],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_github_references(scraped_data["github"])
|
|
|
|
readme_path = os.path.join(
|
|
builder.skill_dir, "references", "github", "test_myrepo", "README.md"
|
|
)
|
|
self.assertTrue(os.path.exists(readme_path))
|
|
|
|
with open(readme_path) as f:
|
|
content = f.read()
|
|
self.assertIn("test/myrepo", content)
|
|
|
|
def test_creates_issues_file_when_issues_exist(self):
|
|
"""Test that issues.md is created when repo has issues."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
config = {"name": "test_issues", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [],
|
|
"github": [
|
|
{
|
|
"repo": "test/repo",
|
|
"repo_id": "test_repo",
|
|
"data": {
|
|
"readme": "# Repo",
|
|
"issues": [
|
|
{
|
|
"number": 1,
|
|
"title": "Bug report",
|
|
"state": "open",
|
|
"labels": ["bug"],
|
|
"url": "https://github.com/test/repo/issues/1",
|
|
},
|
|
{
|
|
"number": 2,
|
|
"title": "Feature request",
|
|
"state": "closed",
|
|
"labels": ["enhancement"],
|
|
"url": "https://github.com/test/repo/issues/2",
|
|
},
|
|
],
|
|
"releases": [],
|
|
"repo_info": {},
|
|
},
|
|
}
|
|
],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_github_references(scraped_data["github"])
|
|
|
|
issues_path = os.path.join(
|
|
builder.skill_dir, "references", "github", "test_repo", "issues.md"
|
|
)
|
|
self.assertTrue(os.path.exists(issues_path))
|
|
|
|
with open(issues_path) as f:
|
|
content = f.read()
|
|
self.assertIn("Bug report", content)
|
|
self.assertIn("Feature request", content)
|
|
|
|
def test_creates_main_index_listing_all_repos(self):
|
|
"""Test that main index.md lists all GitHub repositories."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
config = {"name": "test_github_index", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [],
|
|
"github": [
|
|
{
|
|
"repo": "org/first",
|
|
"repo_id": "org_first",
|
|
"data": {
|
|
"readme": "#",
|
|
"issues": [],
|
|
"releases": [],
|
|
"repo_info": {"stars": 100},
|
|
},
|
|
},
|
|
{
|
|
"repo": "org/second",
|
|
"repo_id": "org_second",
|
|
"data": {
|
|
"readme": "#",
|
|
"issues": [],
|
|
"releases": [],
|
|
"repo_info": {"stars": 50},
|
|
},
|
|
},
|
|
],
|
|
"pdf": [],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_github_references(scraped_data["github"])
|
|
|
|
main_index = os.path.join(builder.skill_dir, "references", "github", "index.md")
|
|
self.assertTrue(os.path.exists(main_index))
|
|
|
|
with open(main_index) as f:
|
|
content = f.read()
|
|
self.assertIn("org/first", content)
|
|
self.assertIn("org/second", content)
|
|
self.assertIn("2 GitHub repositories", content)
|
|
|
|
|
|
class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase):
|
|
"""Test PDF reference generation for multiple sources."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
self.temp_dir = tempfile.mkdtemp()
|
|
self.original_dir = os.getcwd()
|
|
os.chdir(self.temp_dir)
|
|
|
|
def tearDown(self):
|
|
"""Clean up test fixtures."""
|
|
os.chdir(self.original_dir)
|
|
if os.path.exists(self.temp_dir):
|
|
shutil.rmtree(self.temp_dir)
|
|
|
|
def test_creates_pdf_index_with_count(self):
|
|
"""Test that PDF index shows correct document count."""
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
|
|
config = {"name": "test_pdf", "description": "Test", "sources": []}
|
|
|
|
scraped_data = {
|
|
"documentation": [],
|
|
"github": [],
|
|
"pdf": [
|
|
{"path": "/path/to/doc1.pdf"},
|
|
{"path": "/path/to/doc2.pdf"},
|
|
{"path": "/path/to/doc3.pdf"},
|
|
],
|
|
}
|
|
|
|
builder = UnifiedSkillBuilder(config, scraped_data)
|
|
builder._generate_pdf_references(scraped_data["pdf"])
|
|
|
|
pdf_index = os.path.join(builder.skill_dir, "references", "pdf", "index.md")
|
|
self.assertTrue(os.path.exists(pdf_index))
|
|
|
|
with open(pdf_index) as f:
|
|
content = f.read()
|
|
self.assertIn("3 PDF document", content)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|