121 lines
3.2 KiB
Python
121 lines
3.2 KiB
Python
"""Near-duplicate detection for last30days skill."""
|
|
|
|
import re
|
|
from typing import List, Set, Tuple, Union
|
|
|
|
from . import schema
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Return a canonical form of *text* for similarity comparison.

    The text is lowercased, every character that is neither a word
    character nor whitespace is replaced by a space, runs of whitespace
    are collapsed to a single space, and leading/trailing whitespace is
    stripped.
    """
    # Punctuation becomes a space (rather than vanishing) so that words
    # joined only by punctuation, e.g. "a--b", stay separated.
    depunctuated = re.sub(r'[^\w\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', depunctuated).strip()
|
|
|
|
|
|
def get_ngrams(text: str, n: int = 3) -> Set[str]:
    """Return the set of character n-grams of the normalized *text*.

    Args:
        text: Raw text; it is passed through ``normalize_text`` first.
        n: Length of each n-gram in characters.

    Returns:
        Every length-``n`` substring of the normalized text. When the
        normalized text is shorter than ``n`` (including the empty
        string), a one-element set holding the whole normalized text.
    """
    normalized = normalize_text(text)
    length = len(normalized)
    if length >= n:
        return {normalized[start:start + n] for start in range(length - n + 1)}
    # Too short to slice: the whole string stands in as its only "gram".
    return {normalized}
|
|
|
|
|
|
def jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

    Returns 0.0 when either set is empty.
    """
    if set1 and set2:
        combined = len(set1 | set2)
        if combined > 0:
            return len(set1 & set2) / combined
    return 0.0
|
|
|
|
|
|
def get_item_text(item: Union[schema.RedditItem, schema.XItem]) -> str:
    """Return the text of *item* that is used for similarity comparison.

    ``schema.RedditItem`` instances contribute their ``title``; any other
    item contributes its ``text`` attribute.
    """
    return item.title if isinstance(item, schema.RedditItem) else item.text
|
|
|
|
|
|
def find_duplicates(
    items: List[Union[schema.RedditItem, schema.XItem]],
    threshold: float = 0.7,
) -> List[Tuple[int, int]]:
    """Find index pairs of near-duplicate items.

    Args:
        items: Items to compare against each other.
        threshold: Minimum Jaccard similarity of the items' character
            n-gram sets for a pair to be reported.

    Returns:
        List of ``(i, j)`` index pairs with ``i < j`` whose similarity is
        at least ``threshold``, ordered by ``i`` and then by ``j``.
    """
    # Compute each item's n-gram set once instead of once per comparison.
    signatures = [get_ngrams(get_item_text(entry)) for entry in items]
    total = len(items)

    return [
        (first, second)
        for first in range(total)
        for second in range(first + 1, total)
        if jaccard_similarity(signatures[first], signatures[second]) >= threshold
    ]
|
|
|
|
|
|
def dedupe_items(
    items: List[Union[schema.RedditItem, schema.XItem]],
    threshold: float = 0.7,
) -> List[Union[schema.RedditItem, schema.XItem]]:
    """Drop near-duplicate items, keeping the higher-scored one of each pair.

    Args:
        items: Items to deduplicate (should be pre-sorted by score
            descending). Each item is read through its ``score`` attribute.
        threshold: Similarity threshold forwarded to ``find_duplicates``.

    Returns:
        The surviving items in their original order. A list with at most
        one item is returned as-is (the same list object).
    """
    if len(items) <= 1:
        return items

    dropped = set()
    for first, second in find_duplicates(items, threshold):
        # Scores are compared explicitly rather than trusting the input
        # order; on a tie the earlier item is the one that stays.
        if items[first].score >= items[second].score:
            loser = second
        else:
            loser = first
        dropped.add(loser)

    return [entry for position, entry in enumerate(items) if position not in dropped]
|
|
|
|
|
|
def dedupe_reddit(
    items: List[schema.RedditItem],
    threshold: float = 0.7,
) -> List[schema.RedditItem]:
    """Remove near-duplicate Reddit items.

    Thin wrapper that forwards both arguments to ``dedupe_items``.
    """
    deduped = dedupe_items(items, threshold)
    return deduped
|
|
|
|
|
|
def dedupe_x(
    items: List[schema.XItem],
    threshold: float = 0.7,
) -> List[schema.XItem]:
    """Remove near-duplicate X items.

    Thin wrapper that forwards both arguments to ``dedupe_items``.
    """
    deduped = dedupe_items(items, threshold)
    return deduped
|