#!/usr/bin/env python3
"""
Multi-language Documentation Support

Provides language detection, multi-language structure handling,
and translation-ready format generation.
"""

import json
import re
from dataclasses import dataclass
from pathlib import Path

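# Quick-start sketch (illustrative only; the filenames and strings below are
# invented example data, not files shipped with this module):
#
#     manager = MultiLanguageManager()
#     manager.add_document("intro.en.md", "The guide is written in English.")
#     manager.add_document("intro.es.md", "La guía está escrita en español.")
#     print(manager.generate_translation_report())
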
@dataclass
class LanguageInfo:
    """Language information for a document."""

    code: str  # ISO 639-1 code (e.g., 'en', 'es', 'zh')
    name: str  # Full name (e.g., 'English', 'Spanish', 'Chinese')
    confidence: float  # Detection confidence (0.0-1.0)
    script: str | None = None  # Script type (e.g., 'Latin', 'Cyrillic')


@dataclass
class TranslationStatus:
    """Translation status for a document."""

    source_language: str
    target_languages: list[str]
    translated_languages: set[str]
    missing_languages: set[str]
    completeness: float  # Fraction of expected languages covered (0.0-1.0)

class LanguageDetector:
    """
    Detect document language using heuristics.

    Uses character patterns, common words, and script detection.
    """

    # Common word patterns by language
    LANGUAGE_PATTERNS = {
        "en": [
            r"\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b",
            r"\b(this|that|these|those|what|which|who|where|when)\b",
        ],
        "es": [
            r"\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b",
            r"\b(que|no|un|una|como|más|pero|muy|todo|ya)\b",
        ],
        "fr": [
            r"\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b",
            r"\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b",
        ],
        "de": [
            r"\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b",
            r"\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b",
        ],
        "zh": [
            r"[\u4e00-\u9fff]",  # Chinese characters
            r"(的|了|和|是|在|有|我|他|不|这)",
        ],
        "ja": [
            r"[\u3040-\u309f]",  # Hiragana
            r"[\u30a0-\u30ff]",  # Katakana
            r"[\u4e00-\u9faf]",  # Kanji
        ],
        "ko": [
            r"[\uac00-\ud7af]",  # Hangul
            r"(의|가|이|은|들|는|좀|잘|께|을)",
        ],
        "ru": [
            r"[\u0400-\u04ff]",  # Cyrillic
            r"\b(и|в|не|на|с|что|он|по|а|как|это|все)\b",
        ],
        "pt": [
            r"\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b",
            r"\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b",
        ],
        "it": [
            r"\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b",
            r"\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b",
        ],
        "ar": [
            r"[\u0600-\u06ff]",  # Arabic
            r"(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)",
        ],
    }

    # Language names
    LANGUAGE_NAMES = {
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "zh": "Chinese",
        "ja": "Japanese",
        "ko": "Korean",
        "ru": "Russian",
        "pt": "Portuguese",
        "it": "Italian",
        "ar": "Arabic",
    }

    # Script types
    SCRIPTS = {
        "en": "Latin",
        "es": "Latin",
        "fr": "Latin",
        "de": "Latin",
        "pt": "Latin",
        "it": "Latin",
        "zh": "Han",
        "ja": "Japanese",
        "ko": "Hangul",
        "ru": "Cyrillic",
        "ar": "Arabic",
    }

    def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo:
        """
        Detect language of text.

        Args:
            text: Text to analyze
            sample_size: Number of characters to sample

        Returns:
            LanguageInfo with detected language
        """
        if not text.strip():
            return LanguageInfo("en", "English", 0.0)

        # Sample text for efficiency
        sample = text[:sample_size].lower()

        # Score each language
        scores = {}
        for lang_code, patterns in self.LANGUAGE_PATTERNS.items():
            score = 0
            for pattern in patterns:
                matches = len(re.findall(pattern, sample, re.IGNORECASE))
                score += matches

            scores[lang_code] = score

        # Find best match
        if not scores or max(scores.values()) == 0:
            # Default to English
            return LanguageInfo("en", "English", 0.1)

        best_lang = max(scores, key=scores.get)
        total_score = sum(scores.values())
        confidence = scores[best_lang] / total_score if total_score > 0 else 0.0

        return LanguageInfo(
            code=best_lang,
            name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()),
            confidence=min(confidence, 1.0),
            script=self.SCRIPTS.get(best_lang),
        )

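    # Illustrative behavior (a sketch, not a test fixture; the sample sentence
    # is invented). English stopwords dominate the pattern score here:
    #
    #     info = LanguageDetector().detect("the cat and the dog are in the house")
    #     info.code    # "en"
    #     info.script  # "Latin"
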
    def detect_from_filename(self, filename: str) -> str | None:
        """
        Detect language from filename pattern.

        Supports patterns like:
        - file.en.md
        - file_en.md
        - en/file.md
        - file-en.md

        Args:
            filename: Filename to analyze

        Returns:
            ISO 639-1 language code or None
        """
        # Pattern: file.en.md
        match = re.search(r"\.([a-z]{2})\.md$", filename)
        if match and match.group(1) in self.LANGUAGE_NAMES:
            return match.group(1)

        # Pattern: file_en.md or file-en.md
        match = re.search(r"[_-]([a-z]{2})\.md$", filename)
        if match and match.group(1) in self.LANGUAGE_NAMES:
            return match.group(1)

        # Pattern: en/file.md (language folder)
        match = re.search(r"(?:^|[/\\])([a-z]{2})[/\\][^/\\]+\.md$", filename)
        if match and match.group(1) in self.LANGUAGE_NAMES:
            return match.group(1)

        return None


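# Filename-based detection sketch (hypothetical filenames; the folder form
# relies on the language-folder pattern above):
#
#     detector = LanguageDetector()
#     detector.detect_from_filename("guide.es.md")      # -> "es"
#     detector.detect_from_filename("guide_fr.md")      # -> "fr"
#     detector.detect_from_filename("de/anleitung.md")  # -> "de"
#     detector.detect_from_filename("guide.md")         # -> None
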
class MultiLanguageManager:
    """
    Manages multi-language documentation structure.

    Organizes documents by language and tracks translations.
    """

    def __init__(self):
        """Initialize multi-language manager."""
        self.detector = LanguageDetector()
        self.documents: dict[str, list[dict]] = {}  # lang_code -> [docs]
        self.primary_language: str | None = None

    def add_document(
        self,
        file_path: str,
        content: str,
        metadata: dict | None = None,
        force_language: str | None = None,
    ) -> None:
        """
        Add document with language detection.

        Args:
            file_path: Path to document
            content: Document content
            metadata: Additional metadata
            force_language: Override language detection
        """
        # Detect language
        if force_language:
            lang_code = force_language
            lang_info = LanguageInfo(
                code=lang_code,
                name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                confidence=1.0,
                script=self.detector.SCRIPTS.get(lang_code),
            )
        else:
            # Try filename pattern first
            filename_lang = self.detector.detect_from_filename(file_path)
            if filename_lang:
                lang_code = filename_lang
                lang_info = LanguageInfo(
                    code=lang_code,
                    name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                    confidence=0.95,
                    script=self.detector.SCRIPTS.get(lang_code),
                )
            else:
                # Detect from content
                lang_info = self.detector.detect(content)
                lang_code = lang_info.code

        # Set primary language (the language of the first document added)
        if self.primary_language is None:
            self.primary_language = lang_code

        # Store document
        if lang_code not in self.documents:
            self.documents[lang_code] = []

        doc = {
            "file_path": file_path,
            "content": content,
            "language": lang_info.code,
            "language_name": lang_info.name,
            "confidence": lang_info.confidence,
            "script": lang_info.script,
            "metadata": metadata or {},
        }

        self.documents[lang_code].append(doc)

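    # Usage sketch (invented paths/content; force_language bypasses both the
    # filename and content detectors):
    #
    #     manager.add_document("api.en.md", "The API reference ...")
    #     manager.add_document("notes.txt", "Short text", force_language="en")
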
    def get_languages(self) -> list[str]:
        """Get list of detected languages."""
        return sorted(self.documents.keys())

    def get_document_count(self, language: str | None = None) -> int:
        """
        Get document count for a language.

        Args:
            language: Language code (None for all)

        Returns:
            Number of documents
        """
        if language:
            return len(self.documents.get(language, []))
        return sum(len(docs) for docs in self.documents.values())

    def get_translation_status(self, base_language: str | None = None) -> TranslationStatus:
        """
        Get translation status.

        Args:
            base_language: Base language (None for primary)

        Returns:
            Translation status summary
        """
        base_lang = base_language or self.primary_language or "en"

        all_languages = set(self.documents.keys())
        base_count = self.get_document_count(base_lang)

        if base_count == 0:
            return TranslationStatus(
                source_language=base_lang,
                target_languages=[],
                translated_languages=set(),
                missing_languages=set(),
                completeness=0.0,
            )

        # Check which languages have translations
        translated = set()
        for lang in all_languages:
            if lang != base_lang and self.get_document_count(lang) > 0:
                translated.add(lang)

        # Commonly expected languages for completeness
        expected_languages = {"en", "es", "fr", "de", "zh", "ja"}
        missing = expected_languages - all_languages

        completeness = len(all_languages) / len(expected_languages)

        return TranslationStatus(
            source_language=base_lang,
            target_languages=list(all_languages - {base_lang}),
            translated_languages=translated,
            missing_languages=missing,
            completeness=min(completeness, 1.0),
        )

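    # Completeness arithmetic, worked once: with documents in {"en", "es", "fr"}
    # and the expected set {"en", "es", "fr", "de", "zh", "ja"}, completeness is
    # 3 / 6 = 0.5 and missing_languages is {"de", "zh", "ja"}. Languages outside
    # the expected set still count toward the ratio, which is why the result is
    # capped at 1.0.
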
    def export_by_language(self, output_dir: Path) -> dict[str, Path]:
        """
        Export documents organized by language.

        Args:
            output_dir: Output directory

        Returns:
            Dictionary mapping language codes to output paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        exports = {}

        for lang_code, docs in self.documents.items():
            lang_file = output_dir / f"documents_{lang_code}.json"

            export_data = {
                "language": lang_code,
                "language_name": self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                "document_count": len(docs),
                "documents": docs,
            }

            # UTF-8 is required because ensure_ascii=False keeps raw non-ASCII text
            lang_file.write_text(
                json.dumps(export_data, indent=2, ensure_ascii=False), encoding="utf-8"
            )
            exports[lang_code] = lang_file

        return exports

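    # Resulting layout sketch (for detected languages "en" and "es"):
    #
    #     output_dir/
    #       documents_en.json   # {"language": "en", "document_count": ..., ...}
    #       documents_es.json
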
    def generate_translation_report(self) -> str:
        """
        Generate human-readable translation report.

        Returns:
            Formatted report string
        """
        lines = ["=" * 60]
        lines.append("MULTI-LANGUAGE DOCUMENTATION REPORT")
        lines.append("=" * 60)
        lines.append("")

        # Summary
        languages = self.get_languages()
        total_docs = self.get_document_count()

        lines.append("📊 Summary:")
        lines.append(f"  Languages: {len(languages)}")
        lines.append(f"  Total documents: {total_docs}")
        lines.append(f"  Primary language: {self.primary_language or 'Unknown'}")
        lines.append("")

        # Language breakdown
        lines.append("🌍 Language Breakdown:")
        for lang in languages:
            count = self.get_document_count(lang)
            lang_name = self.detector.LANGUAGE_NAMES.get(lang, lang.upper())
            percentage = (count / total_docs * 100) if total_docs > 0 else 0
            lines.append(f"  {lang_name} ({lang}): {count} docs ({percentage:.1f}%)")
        lines.append("")

        # Translation status
        status = self.get_translation_status()
        lines.append("📝 Translation Status:")
        lines.append(f"  Source: {status.source_language}")
        # Sort for deterministic output (translated_languages is a set)
        lines.append(f"  Translated to: {', '.join(sorted(status.translated_languages)) or 'None'}")
        lines.append(f"  Completeness: {status.completeness * 100:.1f}%")

        if status.missing_languages:
            lines.append(f"  Missing: {', '.join(sorted(status.missing_languages))}")
        lines.append("")

        lines.append("=" * 60)

        return "\n".join(lines)


def main():
    """CLI entry point for multi-language support."""
    import argparse

    parser = argparse.ArgumentParser(description="Manage multi-language skill documents")
    parser.add_argument("skill_dir", help="Path to skill directory")
    parser.add_argument("--detect", action="store_true", help="Detect languages in skill")
    parser.add_argument("--report", action="store_true", help="Generate translation report")
    parser.add_argument("--export", help="Export by language to specified directory")
    args = parser.parse_args()

    skill_dir = Path(args.skill_dir)
    if not skill_dir.exists():
        print(f"❌ Error: Directory not found: {skill_dir}")
        return 1

    manager = MultiLanguageManager()

    # Load skill documents
    print("📥 Loading skill documents...")
    skill_md = skill_dir / "SKILL.md"
    if skill_md.exists():
        manager.add_document(
            "SKILL.md", skill_md.read_text(encoding="utf-8"), {"category": "overview"}
        )

    # Load reference files
    refs_dir = skill_dir / "references"
    if refs_dir.exists():
        for ref_file in refs_dir.glob("*.md"):
            manager.add_document(
                ref_file.name, ref_file.read_text(encoding="utf-8"), {"category": ref_file.stem}
            )

    # Detect languages
    if args.detect:
        languages = manager.get_languages()
        print(f"\n🌍 Detected languages: {', '.join(languages)}")
        for lang in languages:
            count = manager.get_document_count(lang)
            print(f"  {lang}: {count} documents")

    # Generate report
    if args.report:
        print(manager.generate_translation_report())

    # Export by language
    if args.export:
        output_dir = Path(args.export)
        output_dir.mkdir(parents=True, exist_ok=True)
        exports = manager.export_by_language(output_dir)
        print(f"\n✅ Exported {len(exports)} language files:")
        for lang, path in exports.items():
            print(f"  {lang}: {path}")

    return 0


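# Example invocations (hypothetical paths; the module filename is assumed here):
#
#     python multi_language.py ./skills/my-skill --detect
#     python multi_language.py ./skills/my-skill --report --export ./i18n_out
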
if __name__ == "__main__":
    import sys

    sys.exit(main())