"""Tests for dedupe module."""

import sys
import unittest
from pathlib import Path

# Add lib to path: the "scripts" directory sits one level above this file's
# directory and has to be on sys.path before ``lib`` can be imported below.
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))

from lib import dedupe, schema  # noqa: E402  (must come after the sys.path tweak)


class TestNormalizeText(unittest.TestCase):
    """Checks for ``dedupe.normalize_text``."""

    def test_lowercase(self):
        """Mixed-case input is expected back entirely in lowercase."""
        result = dedupe.normalize_text("HELLO World")
        self.assertEqual(result, "hello world")

    def test_removes_punctuation(self):
        """The comma and exclamation mark must not survive in the output."""
        result = dedupe.normalize_text("Hello, World!")
        # Punctuation replaced with space, then whitespace collapsed
        self.assertEqual(result, "hello world")

    def test_collapses_whitespace(self):
        """A run of several spaces is expected to shrink to a single space."""
        # The input has to contain repeated whitespace: with only one space
        # between the words this test would pass even if nothing collapsed.
        result = dedupe.normalize_text("hello    world")
        self.assertEqual(result, "hello world")


class TestGetNgrams(unittest.TestCase):
    """Checks for ``dedupe.get_ngrams``."""

    def test_short_text(self):
        """A 2-character text with n=3 is expected back as a one-element set."""
        self.assertEqual(dedupe.get_ngrams("ab", n=3), {"ab"})

    def test_normal_text(self):
        """Every 3-character window of "hello" must be present in the result."""
        ngrams = dedupe.get_ngrams("hello", n=3)
        for expected in ("hel", "ell", "llo"):
            self.assertIn(expected, ngrams)


class TestJaccardSimilarity(unittest.TestCase):
    """Checks for ``dedupe.jaccard_similarity``."""

    def test_identical_sets(self):
        """Comparing a set with itself is expected to give 1.0."""
        letters = {"a", "b", "c"}
        self.assertEqual(dedupe.jaccard_similarity(letters, letters), 1.0)

    def test_disjoint_sets(self):
        """Sets with no common element are expected to give 0.0."""
        left = {"a", "b", "c"}
        right = {"d", "e", "f"}
        self.assertEqual(dedupe.jaccard_similarity(left, right), 0.0)

    def test_partial_overlap(self):
        """Two shared elements out of four distinct ones should give 0.5."""
        left = {"a", "b", "c"}
        right = {"b", "c", "d"}
        similarity = dedupe.jaccard_similarity(left, right)
        # 2 overlap / 4 union
        self.assertEqual(similarity, 0.5)

    def test_empty_sets(self):
        """Two empty sets are expected to give 0.0."""
        self.assertEqual(dedupe.jaccard_similarity(set(), set()), 0.0)


class TestFindDuplicates(unittest.TestCase):
    """Checks for ``dedupe.find_duplicates``."""

    @staticmethod
    def _post(item_id, title):
        """Build a ``schema.RedditItem`` with empty ``url`` and ``subreddit``."""
        return schema.RedditItem(id=item_id, title=title, url="", subreddit="")

    def test_no_duplicates(self):
        """Two unrelated titles are expected to produce an empty result."""
        posts = [
            self._post("R1", "Completely different topic A"),
            self._post("R2", "Another unrelated subject B"),
        ]
        self.assertEqual(dedupe.find_duplicates(posts), [])

    def test_finds_duplicates(self):
        """Near-identical titles are expected to be reported as the pair (0, 1)."""
        posts = [
            self._post("R1", "Best practices for Claude Code skills"),
            self._post("R2", "Best practices for Claude Code skills guide"),
        ]
        pairs = dedupe.find_duplicates(posts, threshold=0.7)
        self.assertEqual(len(pairs), 1)
        self.assertEqual(pairs[0], (0, 1))


class TestDedupeItems(unittest.TestCase):
    """Checks for ``dedupe.dedupe_items``."""

    @staticmethod
    def _post(item_id, title, **extra):
        """Build a ``schema.RedditItem`` with empty ``url`` and ``subreddit``.

        Additional keyword arguments (the tests pass ``score``) are forwarded
        to the constructor unchanged.
        """
        return schema.RedditItem(
            id=item_id, title=title, url="", subreddit="", **extra
        )

    def test_keeps_higher_scored(self):
        """Of two similar items, only the one with score 90 should remain."""
        posts = [
            self._post("R1", "Best practices for skills", score=90),
            self._post("R2", "Best practices for skills guide", score=50),
        ]
        survivors = dedupe.dedupe_items(posts, threshold=0.6)
        self.assertEqual(len(survivors), 1)
        self.assertEqual(survivors[0].id, "R1")

    def test_keeps_all_unique(self):
        """Two unrelated items are both expected in the result."""
        posts = [
            self._post("R1", "Topic about apples", score=90),
            self._post("R2", "Discussion of oranges", score=50),
        ]
        self.assertEqual(len(dedupe.dedupe_items(posts)), 2)

    def test_empty_list(self):
        """An empty input is expected to give an empty list."""
        self.assertEqual(dedupe.dedupe_items([]), [])

    def test_single_item(self):
        """A single item is expected to give a result of length one."""
        posts = [self._post("R1", "Test")]
        self.assertEqual(len(dedupe.dedupe_items(posts)), 1)


if __name__ == "__main__":
    # Run the test cases in this file when it is executed as a script.
    unittest.main()