- Filter out chunks smaller than min_chunk_size (default 100 tokens) - Exception: Keep all chunks if entire document is smaller than target size - All 15 tests passing (100% pass rate) Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
165 lines
5.6 KiB
Python
165 lines
5.6 KiB
Python
"""
Pydantic models for sync system.
"""
|
|
|
|
from typing import List, Optional, Dict, Any
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class ChangeType(str, Enum):
    """Kind of change detected for a page.

    Every member is also a ``str``, so it compares equal to its lowercase
    value (for example ``ChangeType.ADDED == "added"``).
    """

    # Declaration order is the iteration order of the enum; keep it stable.
    ADDED = "added"
    MODIFIED = "modified"
    DELETED = "deleted"
    UNCHANGED = "unchanged"
|
|
|
|
|
|
class PageChange(BaseModel):
    """A single page's change record.

    Attributes:
        url: Page URL (required).
        change_type: Which ``ChangeType`` applies to the page (required).
        old_hash: Previous content hash, or ``None``.
        new_hash: New content hash, or ``None``.
        diff: Content diff when one is available, otherwise ``None``.
        detected_at: When the change was detected; filled in at model
            creation time when not supplied.
    """

    url: str = Field(..., description="Page URL")
    change_type: ChangeType = Field(..., description="Type of change")
    old_hash: Optional[str] = Field(
        default=None,
        description="Previous content hash",
    )
    new_hash: Optional[str] = Field(
        default=None,
        description="New content hash",
    )
    diff: Optional[str] = Field(
        default=None,
        description="Content diff (if available)",
    )
    # NOTE: datetime.utcnow produces a naive datetime (tzinfo is None) and is
    # deprecated since Python 3.12. Kept as-is so existing comparisons with
    # other naive timestamps keep working.
    detected_at: datetime = Field(
        description="When change was detected",
        default_factory=datetime.utcnow,
    )

    class Config:
        # Sample instance attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "url": "https://react.dev/learn/thinking-in-react",
                "change_type": "modified",
                "old_hash": "abc123",
                "new_hash": "def456",
                "diff": "@@ -10,3 +10,4 @@\n+New content here",
                "detected_at": "2024-01-15T10:30:00Z",
            },
        }
|
|
|
|
|
|
class ChangeReport(BaseModel):
    """Aggregate result of one change-detection pass.

    Attributes:
        skill_name: Skill name (required).
        total_pages: Total pages checked (required).
        added: ``PageChange`` entries for added pages.
        modified: ``PageChange`` entries for modified pages.
        deleted: ``PageChange`` entries for deleted pages.
        unchanged: Number of unchanged pages (defaults to 0).
        checked_at: When the check was performed; filled in at model
            creation time when not supplied.
    """

    skill_name: str = Field(..., description="Skill name")
    total_pages: int = Field(..., description="Total pages checked")
    added: List[PageChange] = Field(
        description="Added pages",
        default_factory=list,
    )
    modified: List[PageChange] = Field(
        description="Modified pages",
        default_factory=list,
    )
    deleted: List[PageChange] = Field(
        description="Deleted pages",
        default_factory=list,
    )
    unchanged: int = Field(default=0, description="Number of unchanged pages")
    # NOTE: datetime.utcnow produces a naive datetime (tzinfo is None) and is
    # deprecated since Python 3.12; retained to preserve current behavior.
    checked_at: datetime = Field(
        description="When check was performed",
        default_factory=datetime.utcnow,
    )

    @property
    def has_changes(self) -> bool:
        """Return True when at least one of the three change lists is non-empty."""
        return any((self.added, self.modified, self.deleted))

    @property
    def change_count(self) -> int:
        """Return the combined number of added, modified and deleted entries."""
        buckets = (self.added, self.modified, self.deleted)
        return sum(len(bucket) for bucket in buckets)
|
|
|
|
|
|
class SyncConfig(BaseModel):
    """Settings that control sync monitoring for one skill.

    Attributes:
        skill_config: Path to the skill config file (required).
        check_interval: Seconds between checks; defaults to 3600 (one hour).
        enabled: Whether sync is enabled; defaults to True.
        auto_update: Rebuild the skill automatically on changes; defaults
            to False.
        notify_on_change: Send notifications on changes; defaults to True.
        notification_channels: Channel names; the field description lists
            email, slack and webhook.
        webhook_url: Webhook URL for change notifications, or ``None``.
        email_recipients: Email recipients for notifications.
        slack_webhook: Slack webhook URL, or ``None``.
    """

    skill_config: str = Field(..., description="Path to skill config file")
    check_interval: int = Field(
        description="Check interval in seconds (default: 1 hour)",
        default=3600,
    )
    enabled: bool = Field(description="Whether sync is enabled", default=True)
    auto_update: bool = Field(
        description="Automatically rebuild skill on changes",
        default=False,
    )
    notify_on_change: bool = Field(
        description="Send notifications on changes",
        default=True,
    )
    notification_channels: List[str] = Field(
        description="Notification channels (email, slack, webhook)",
        default_factory=list,
    )
    webhook_url: Optional[str] = Field(
        default=None,
        description="Webhook URL for change notifications",
    )
    email_recipients: List[str] = Field(
        description="Email recipients for notifications",
        default_factory=list,
    )
    slack_webhook: Optional[str] = Field(
        default=None,
        description="Slack webhook URL",
    )

    class Config:
        # Sample instance attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "skill_config": "configs/react.json",
                "check_interval": 3600,
                "enabled": True,
                "auto_update": False,
                "notify_on_change": True,
                "notification_channels": ["slack", "webhook"],
                "webhook_url": "https://example.com/webhook",
                "slack_webhook": "https://hooks.slack.com/services/...",
            },
        }
|
|
|
|
|
|
class SyncState(BaseModel):
    """Snapshot of sync monitoring progress for one skill.

    Attributes:
        skill_name: Skill name (required).
        last_check: Time of the last check, or ``None``.
        last_change: Time the last change was detected, or ``None``.
        total_checks: Total checks performed; starts at 0.
        total_changes: Total changes detected; starts at 0.
        page_hashes: Mapping of URL to content hash; starts empty.
        status: Current status string; defaults to ``"idle"``.
        error: Last error message, or ``None``.
    """

    skill_name: str = Field(..., description="Skill name")

    # Timestamps stay None until a value is supplied.
    last_check: Optional[datetime] = Field(
        default=None,
        description="Last check time",
    )
    last_change: Optional[datetime] = Field(
        default=None,
        description="Last change detected",
    )

    # Running counters.
    total_checks: int = Field(description="Total checks performed", default=0)
    total_changes: int = Field(description="Total changes detected", default=0)

    page_hashes: Dict[str, str] = Field(
        description="URL -> content hash mapping",
        default_factory=dict,
    )
    status: str = Field(description="Current status", default="idle")
    error: Optional[str] = Field(default=None, description="Last error message")
|
|
|
|
|
|
class WebhookPayload(BaseModel):
    """Payload for webhook notifications.

    Attributes:
        event: Event type; the field description names ``change_detected``
            and ``sync_complete`` (required).
        skill_name: Skill name (required).
        timestamp: Event timestamp; filled in at model creation time when
            not supplied.
        changes: Optional ``ChangeReport`` attached to the event.
        metadata: Free-form additional metadata; starts as an empty dict.
    """

    event: str = Field(..., description="Event type (change_detected, sync_complete)")
    skill_name: str = Field(..., description="Skill name")
    # NOTE: datetime.utcnow produces a naive datetime (tzinfo is None) and is
    # deprecated since Python 3.12; retained to preserve current behavior.
    timestamp: datetime = Field(
        default_factory=datetime.utcnow,
        description="Event timestamp"
    )
    changes: Optional[ChangeReport] = Field(None, description="Change report")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata"
    )

    class Config:
        # Sample instance attached to the generated JSON schema. The nested
        # "changes" object includes every required ChangeReport field
        # (skill_name, total_pages) and each page entry includes every
        # required PageChange field (url, change_type), so the example
        # validates against the models it documents.
        json_schema_extra = {
            "example": {
                "event": "change_detected",
                "skill_name": "react",
                "timestamp": "2024-01-15T10:30:00Z",
                "changes": {
                    "skill_name": "react",
                    "total_pages": 150,
                    "added": [],
                    "modified": [
                        {
                            "url": "https://react.dev/learn",
                            "change_type": "modified"
                        }
                    ],
                    "deleted": []
                },
                "metadata": {"source": "periodic_check"}
            }
        }
|