This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -13,9 +13,9 @@ Used by unified scraper to identify discrepancies before merging.
import json
import logging
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from dataclasses import asdict, dataclass
from difflib import SequenceMatcher
from typing import Any
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -24,13 +24,14 @@ logger = logging.getLogger(__name__)
@dataclass
class Conflict:
"""Represents a conflict between documentation and code."""
type: str # 'missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch'
severity: str # 'low', 'medium', 'high'
api_name: str
docs_info: Optional[Dict[str, Any]] = None
code_info: Optional[Dict[str, Any]] = None
difference: Optional[str] = None
suggestion: Optional[str] = None
docs_info: dict[str, Any] | None = None
code_info: dict[str, Any] | None = None
difference: str | None = None
suggestion: str | None = None
class ConflictDetector:
@@ -38,7 +39,7 @@ class ConflictDetector:
Detects conflicts between documentation and code sources.
"""
def __init__(self, docs_data: Dict[str, Any], github_data: Dict[str, Any]):
def __init__(self, docs_data: dict[str, Any], github_data: dict[str, Any]):
"""
Initialize conflict detector.
@@ -56,7 +57,7 @@ class ConflictDetector:
logger.info(f"Loaded {len(self.docs_apis)} APIs from documentation")
logger.info(f"Loaded {len(self.code_apis)} APIs from code")
def _extract_docs_apis(self) -> Dict[str, Dict[str, Any]]:
def _extract_docs_apis(self) -> dict[str, dict[str, Any]]:
"""
Extract API information from documentation data.
@@ -66,42 +67,43 @@ class ConflictDetector:
apis = {}
# Documentation structure varies, but typically has 'pages' or 'references'
pages = self.docs_data.get('pages', {})
pages = self.docs_data.get("pages", {})
# Handle both dict and list formats
if isinstance(pages, dict):
# Format: {url: page_data, ...}
for url, page_data in pages.items():
content = page_data.get('content', '')
title = page_data.get('title', '')
content = page_data.get("content", "")
title = page_data.get("title", "")
# Simple heuristic: if title or URL contains "api", "reference", "class", "function"
# it might be an API page
if any(keyword in title.lower() or keyword in url.lower()
for keyword in ['api', 'reference', 'class', 'function', 'method']):
if any(
keyword in title.lower() or keyword in url.lower()
for keyword in ["api", "reference", "class", "function", "method"]
):
# Extract API signatures from content (simplified)
extracted_apis = self._parse_doc_content_for_apis(content, url)
apis.update(extracted_apis)
elif isinstance(pages, list):
# Format: [{url: '...', apis: [...]}, ...]
for page in pages:
url = page.get('url', '')
page_apis = page.get('apis', [])
url = page.get("url", "")
page_apis = page.get("apis", [])
# If APIs are already extracted in the page data
for api in page_apis:
api_name = api.get('name', '')
api_name = api.get("name", "")
if api_name:
apis[api_name] = {
'parameters': api.get('parameters', []),
'return_type': api.get('return_type', 'Any'),
'source_url': url
"parameters": api.get("parameters", []),
"return_type": api.get("return_type", "Any"),
"source_url": url,
}
return apis
def _parse_doc_content_for_apis(self, content: str, source_url: str) -> Dict[str, Dict]:
def _parse_doc_content_for_apis(self, content: str, source_url: str) -> dict[str, dict]:
"""
Parse documentation content to extract API signatures.
@@ -121,13 +123,13 @@ class ConflictDetector:
# Pattern for common API signatures
patterns = [
# Python style: def name(params) -> return
r'def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?',
r"def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?",
# JavaScript style: function name(params)
r'function\s+(\w+)\s*\(([^)]*)\)',
r"function\s+(\w+)\s*\(([^)]*)\)",
# C++ style: return_type name(params)
r'(\w+)\s+(\w+)\s*\(([^)]*)\)',
r"(\w+)\s+(\w+)\s*\(([^)]*)\)",
# Method style: ClassName.method_name(params)
r'(\w+)\.(\w+)\s*\(([^)]*)\)'
r"(\w+)\.(\w+)\s*\(([^)]*)\)",
]
for pattern in patterns:
@@ -135,17 +137,17 @@ class ConflictDetector:
groups = match.groups()
# Parse based on pattern matched
if 'def' in pattern:
if "def" in pattern:
# Python function
name = groups[0]
params_str = groups[1]
return_type = groups[2] if len(groups) > 2 else None
elif 'function' in pattern:
elif "function" in pattern:
# JavaScript function
name = groups[0]
params_str = groups[1]
return_type = None
elif '.' in pattern:
elif "." in pattern:
# Class method
class_name = groups[0]
method_name = groups[1]
@@ -162,54 +164,54 @@ class ConflictDetector:
params = self._parse_param_string(params_str)
apis[name] = {
'name': name,
'parameters': params,
'return_type': return_type,
'source': source_url,
'raw_signature': match.group(0)
"name": name,
"parameters": params,
"return_type": return_type,
"source": source_url,
"raw_signature": match.group(0),
}
return apis
def _parse_param_string(self, params_str: str) -> List[Dict]:
def _parse_param_string(self, params_str: str) -> list[dict]:
"""Parse parameter string into list of parameter dicts."""
if not params_str.strip():
return []
params = []
for param in params_str.split(','):
for param in params_str.split(","):
param = param.strip()
if not param:
continue
# Try to extract name and type
param_info = {'name': param, 'type': None, 'default': None}
param_info = {"name": param, "type": None, "default": None}
# Check for type annotation (: type)
if ':' in param:
parts = param.split(':', 1)
param_info['name'] = parts[0].strip()
if ":" in param:
parts = param.split(":", 1)
param_info["name"] = parts[0].strip()
type_part = parts[1].strip()
# Check for default value (= value)
if '=' in type_part:
type_str, default_str = type_part.split('=', 1)
param_info['type'] = type_str.strip()
param_info['default'] = default_str.strip()
if "=" in type_part:
type_str, default_str = type_part.split("=", 1)
param_info["type"] = type_str.strip()
param_info["default"] = default_str.strip()
else:
param_info['type'] = type_part
param_info["type"] = type_part
# Check for default without type (= value)
elif '=' in param:
parts = param.split('=', 1)
param_info['name'] = parts[0].strip()
param_info['default'] = parts[1].strip()
elif "=" in param:
parts = param.split("=", 1)
param_info["name"] = parts[0].strip()
param_info["default"] = parts[1].strip()
params.append(param_info)
return params
def _extract_code_apis(self) -> Dict[str, Dict[str, Any]]:
def _extract_code_apis(self) -> dict[str, dict[str, Any]]:
"""
Extract API information from GitHub code analysis.
@@ -218,61 +220,61 @@ class ConflictDetector:
"""
apis = {}
code_analysis = self.github_data.get('code_analysis', {})
code_analysis = self.github_data.get("code_analysis", {})
if not code_analysis:
return apis
# Support both 'files' and 'analyzed_files' keys
files = code_analysis.get('files', code_analysis.get('analyzed_files', []))
files = code_analysis.get("files", code_analysis.get("analyzed_files", []))
for file_info in files:
file_path = file_info.get('file', 'unknown')
file_path = file_info.get("file", "unknown")
# Extract classes and their methods
for class_info in file_info.get('classes', []):
class_name = class_info['name']
for class_info in file_info.get("classes", []):
class_name = class_info["name"]
# Add class itself
apis[class_name] = {
'name': class_name,
'type': 'class',
'source': file_path,
'line': class_info.get('line_number'),
'base_classes': class_info.get('base_classes', []),
'docstring': class_info.get('docstring')
"name": class_name,
"type": "class",
"source": file_path,
"line": class_info.get("line_number"),
"base_classes": class_info.get("base_classes", []),
"docstring": class_info.get("docstring"),
}
# Add methods
for method in class_info.get('methods', []):
for method in class_info.get("methods", []):
method_name = f"{class_name}.{method['name']}"
apis[method_name] = {
'name': method_name,
'type': 'method',
'parameters': method.get('parameters', []),
'return_type': method.get('return_type'),
'source': file_path,
'line': method.get('line_number'),
'docstring': method.get('docstring'),
'is_async': method.get('is_async', False)
"name": method_name,
"type": "method",
"parameters": method.get("parameters", []),
"return_type": method.get("return_type"),
"source": file_path,
"line": method.get("line_number"),
"docstring": method.get("docstring"),
"is_async": method.get("is_async", False),
}
# Extract standalone functions
for func_info in file_info.get('functions', []):
func_name = func_info['name']
for func_info in file_info.get("functions", []):
func_name = func_info["name"]
apis[func_name] = {
'name': func_name,
'type': 'function',
'parameters': func_info.get('parameters', []),
'return_type': func_info.get('return_type'),
'source': file_path,
'line': func_info.get('line_number'),
'docstring': func_info.get('docstring'),
'is_async': func_info.get('is_async', False)
"name": func_name,
"type": "function",
"parameters": func_info.get("parameters", []),
"return_type": func_info.get("return_type"),
"source": file_path,
"line": func_info.get("line_number"),
"docstring": func_info.get("docstring"),
"is_async": func_info.get("is_async", False),
}
return apis
def detect_all_conflicts(self) -> List[Conflict]:
def detect_all_conflicts(self) -> list[Conflict]:
"""
Detect all types of conflicts.
@@ -296,7 +298,7 @@ class ConflictDetector:
return conflicts
def _find_missing_in_docs(self) -> List[Conflict]:
def _find_missing_in_docs(self) -> list[Conflict]:
"""Find APIs that exist in code but not in documentation."""
conflicts = []
@@ -304,40 +306,46 @@ class ConflictDetector:
# Simple name matching (can be enhanced with fuzzy matching)
if api_name not in self.docs_apis:
# Check if it's a private/internal API (often not documented)
is_private = api_name.startswith('_') or '__' in api_name
severity = 'low' if is_private else 'medium'
is_private = api_name.startswith("_") or "__" in api_name
severity = "low" if is_private else "medium"
conflicts.append(Conflict(
type='missing_in_docs',
severity=severity,
api_name=api_name,
code_info=code_info,
difference=f"API exists in code ({code_info['source']}) but not found in documentation",
suggestion="Add documentation for this API" if not is_private else "Consider if this internal API should be documented"
))
conflicts.append(
Conflict(
type="missing_in_docs",
severity=severity,
api_name=api_name,
code_info=code_info,
difference=f"API exists in code ({code_info['source']}) but not found in documentation",
suggestion="Add documentation for this API"
if not is_private
else "Consider if this internal API should be documented",
)
)
logger.info(f"Found {len(conflicts)} APIs missing in documentation")
return conflicts
def _find_missing_in_code(self) -> List[Conflict]:
def _find_missing_in_code(self) -> list[Conflict]:
"""Find APIs that are documented but don't exist in code."""
conflicts = []
for api_name, docs_info in self.docs_apis.items():
if api_name not in self.code_apis:
conflicts.append(Conflict(
type='missing_in_code',
severity='high', # This is serious - documented but doesn't exist
api_name=api_name,
docs_info=docs_info,
difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code",
suggestion="Update documentation to remove this API, or add it to codebase"
))
conflicts.append(
Conflict(
type="missing_in_code",
severity="high", # This is serious - documented but doesn't exist
api_name=api_name,
docs_info=docs_info,
difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code",
suggestion="Update documentation to remove this API, or add it to codebase",
)
)
logger.info(f"Found {len(conflicts)} APIs missing in code")
return conflicts
def _find_signature_mismatches(self) -> List[Conflict]:
def _find_signature_mismatches(self) -> list[Conflict]:
"""Find APIs where signature differs between docs and code."""
conflicts = []
@@ -352,41 +360,43 @@ class ConflictDetector:
mismatch = self._compare_signatures(docs_info, code_info)
if mismatch:
conflicts.append(Conflict(
type='signature_mismatch',
severity=mismatch['severity'],
api_name=api_name,
docs_info=docs_info,
code_info=code_info,
difference=mismatch['difference'],
suggestion=mismatch['suggestion']
))
conflicts.append(
Conflict(
type="signature_mismatch",
severity=mismatch["severity"],
api_name=api_name,
docs_info=docs_info,
code_info=code_info,
difference=mismatch["difference"],
suggestion=mismatch["suggestion"],
)
)
logger.info(f"Found {len(conflicts)} signature mismatches")
return conflicts
def _compare_signatures(self, docs_info: Dict, code_info: Dict) -> Optional[Dict]:
def _compare_signatures(self, docs_info: dict, code_info: dict) -> dict | None:
"""
Compare signatures between docs and code.
Returns:
Dict with mismatch details if conflict found, None otherwise
"""
docs_params = docs_info.get('parameters', [])
code_params = code_info.get('parameters', [])
docs_params = docs_info.get("parameters", [])
code_params = code_info.get("parameters", [])
# Compare parameter counts
if len(docs_params) != len(code_params):
return {
'severity': 'medium',
'difference': f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}",
'suggestion': f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}"
"severity": "medium",
"difference": f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}",
"suggestion": f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}",
}
# Compare parameter names and types
for i, (doc_param, code_param) in enumerate(zip(docs_params, code_params)):
doc_name = doc_param.get('name', '')
code_name = code_param.get('name', '')
doc_name = doc_param.get("name", "")
code_name = code_param.get("name", "")
# Parameter name mismatch
if doc_name != code_name:
@@ -394,36 +404,36 @@ class ConflictDetector:
similarity = SequenceMatcher(None, doc_name, code_name).ratio()
if similarity < 0.8: # Not similar enough
return {
'severity': 'medium',
'difference': f"Parameter {i+1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code",
'suggestion': f"Update documentation to use parameter name '{code_name}'"
"severity": "medium",
"difference": f"Parameter {i + 1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code",
"suggestion": f"Update documentation to use parameter name '{code_name}'",
}
# Type mismatch
doc_type = doc_param.get('type')
code_type = code_param.get('type_hint')
doc_type = doc_param.get("type")
code_type = code_param.get("type_hint")
if doc_type and code_type and doc_type != code_type:
return {
'severity': 'low',
'difference': f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code",
'suggestion': f"Verify correct type for parameter '{doc_name}'"
"severity": "low",
"difference": f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code",
"suggestion": f"Verify correct type for parameter '{doc_name}'",
}
# Compare return types if both have them
docs_return = docs_info.get('return_type')
code_return = code_info.get('return_type')
docs_return = docs_info.get("return_type")
code_return = code_info.get("return_type")
if docs_return and code_return and docs_return != code_return:
return {
'severity': 'low',
'difference': f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code",
'suggestion': "Verify correct return type"
"severity": "low",
"difference": f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code",
"suggestion": "Verify correct return type",
}
return None
def generate_summary(self, conflicts: List[Conflict]) -> Dict[str, Any]:
def generate_summary(self, conflicts: list[Conflict]) -> dict[str, Any]:
"""
Generate summary statistics for conflicts.
@@ -434,25 +444,25 @@ class ConflictDetector:
Summary dict with statistics
"""
summary = {
'total': len(conflicts),
'by_type': {},
'by_severity': {},
'apis_affected': len(set(c.api_name for c in conflicts))
"total": len(conflicts),
"by_type": {},
"by_severity": {},
"apis_affected": len(set(c.api_name for c in conflicts)),
}
# Count by type
for conflict_type in ['missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch']:
for conflict_type in ["missing_in_docs", "missing_in_code", "signature_mismatch", "description_mismatch"]:
count = sum(1 for c in conflicts if c.type == conflict_type)
summary['by_type'][conflict_type] = count
summary["by_type"][conflict_type] = count
# Count by severity
for severity in ['low', 'medium', 'high']:
for severity in ["low", "medium", "high"]:
count = sum(1 for c in conflicts if c.severity == severity)
summary['by_severity'][severity] = count
summary["by_severity"][severity] = count
return summary
def save_conflicts(self, conflicts: List[Conflict], output_path: str):
def save_conflicts(self, conflicts: list[Conflict], output_path: str):
"""
Save conflicts to JSON file.
@@ -460,18 +470,15 @@ class ConflictDetector:
conflicts: List of Conflict objects
output_path: Path to output JSON file
"""
data = {
'conflicts': [asdict(c) for c in conflicts],
'summary': self.generate_summary(conflicts)
}
data = {"conflicts": [asdict(c) for c in conflicts], "summary": self.generate_summary(conflicts)}
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Conflicts saved to: {output_path}")
if __name__ == '__main__':
if __name__ == "__main__":
import sys
if len(sys.argv) < 3:
@@ -482,10 +489,10 @@ if __name__ == '__main__':
github_file = sys.argv[2]
# Load data
with open(docs_file, 'r') as f:
with open(docs_file) as f:
docs_data = json.load(f)
with open(github_file, 'r') as f:
with open(github_file) as f:
github_data = json.load(f)
# Detect conflicts
@@ -498,16 +505,16 @@ if __name__ == '__main__':
print(f" Total conflicts: {summary['total']}")
print(f" APIs affected: {summary['apis_affected']}")
print("\n By Type:")
for conflict_type, count in summary['by_type'].items():
for conflict_type, count in summary["by_type"].items():
if count > 0:
print(f" {conflict_type}: {count}")
print("\n By Severity:")
for severity, count in summary['by_severity'].items():
for severity, count in summary["by_severity"].items():
if count > 0:
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
print(f" {emoji} {severity}: {count}")
# Save to file
output_file = 'conflicts.json'
output_file = "conflicts.json"
detector.save_conflicts(conflicts, output_file)
print(f"\n✅ Full report saved to: {output_file}")