fix: Add UTF-8 encoding to all file operations for Windows compatibility
Fixes #209 - UnicodeDecodeError on Windows with non-ASCII characters **Problem:** Windows users with non-English locales (Chinese, Japanese, Korean, etc.) experienced GBK/SHIFT-JIS codec errors when the system default encoding is not UTF-8. Error: 'gbk' codec can't decode byte 0xac in position 206: illegal multibyte sequence **Root Cause:** File operations using open() without explicit encoding parameter use the system default encoding, which on Windows Chinese edition is GBK. JSON files contain UTF-8 encoded characters that fail to decode with GBK. **Solution:** Added encoding='utf-8' to ALL file operations across: - doc_scraper.py (4 instances): * load_config() - line 1310 * check_existing_data() - line 1416 * save_checkpoint() - line 173 * load_checkpoint() - line 186 - github_scraper.py (1 instance): * main() config loading - line 922 - unified_scraper.py (10 instances): * All JSON read/write operations - lines 134, 153, 205, 239, 275, 278, 325, 328, 342, 364 **Test Results:** - ✅ All 612 tests passing (100% pass rate) - ✅ Backward compatible (UTF-8 is standard on Linux/macOS) - ✅ Fixes Windows locale issues **Impact:** - ✅ Works on ALL Windows locales (Chinese, Japanese, Korean, etc.) - ✅ Maintains compatibility with Linux/macOS - ✅ Prevents future encoding issues **Thanks to:** @my5icol for the detailed bug report and fix suggestion!
This commit is contained in:
@@ -170,7 +170,7 @@ class DocToSkillConverter:
|
||||
}
|
||||
|
||||
try:
|
||||
with open(self.checkpoint_file, 'w') as f:
|
||||
with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(checkpoint_data, f, indent=2)
|
||||
logger.info(" 💾 Checkpoint saved (%d pages)", self.pages_scraped)
|
||||
except Exception as e:
|
||||
@@ -183,7 +183,7 @@ class DocToSkillConverter:
|
||||
return
|
||||
|
||||
try:
|
||||
with open(self.checkpoint_file, 'r') as f:
|
||||
with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
|
||||
checkpoint_data = json.load(f)
|
||||
|
||||
self.visited_urls = set(checkpoint_data["visited_urls"])
|
||||
@@ -1307,7 +1307,7 @@ def load_config(config_path: str) -> Dict[str, Any]:
|
||||
'react'
|
||||
"""
|
||||
try:
|
||||
with open(config_path, 'r') as f:
|
||||
with open(config_path, 'r', encoding='utf-8') as f:
|
||||
config = json.load(f)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error("❌ Error: Invalid JSON in config file: %s", config_path)
|
||||
@@ -1413,7 +1413,7 @@ def check_existing_data(name: str) -> Tuple[bool, int]:
|
||||
"""
|
||||
data_dir = f"output/{name}_data"
|
||||
if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"):
|
||||
with open(f"{data_dir}/summary.json", 'r') as f:
|
||||
with open(f"{data_dir}/summary.json", 'r', encoding='utf-8') as f:
|
||||
summary = json.load(f)
|
||||
return True, summary.get('total_pages', 0)
|
||||
return False, 0
|
||||
|
||||
@@ -919,7 +919,7 @@ Examples:
|
||||
|
||||
# Build config from args or file
|
||||
if args.config:
|
||||
with open(args.config, 'r') as f:
|
||||
with open(args.config, 'r', encoding='utf-8') as f:
|
||||
config = json.load(f)
|
||||
elif args.repo:
|
||||
config = {
|
||||
|
||||
@@ -131,7 +131,7 @@ class UnifiedScraper:
|
||||
|
||||
# Write temporary config
|
||||
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
|
||||
with open(temp_config_path, 'w') as f:
|
||||
with open(temp_config_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(doc_config, f, indent=2)
|
||||
|
||||
# Run doc_scraper as subprocess
|
||||
@@ -150,7 +150,7 @@ class UnifiedScraper:
|
||||
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
|
||||
|
||||
if os.path.exists(docs_data_file):
|
||||
with open(docs_data_file, 'r') as f:
|
||||
with open(docs_data_file, 'r', encoding='utf-8') as f:
|
||||
summary = json.load(f)
|
||||
|
||||
self.scraped_data['documentation'] = {
|
||||
@@ -202,7 +202,7 @@ class UnifiedScraper:
|
||||
|
||||
# Save data
|
||||
github_data_file = os.path.join(self.data_dir, 'github_data.json')
|
||||
with open(github_data_file, 'w') as f:
|
||||
with open(github_data_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(github_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
self.scraped_data['github'] = {
|
||||
@@ -236,7 +236,7 @@ class UnifiedScraper:
|
||||
|
||||
# Save data
|
||||
pdf_data_file = os.path.join(self.data_dir, 'pdf_data.json')
|
||||
with open(pdf_data_file, 'w') as f:
|
||||
with open(pdf_data_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(pdf_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
self.scraped_data['pdf'] = {
|
||||
@@ -272,10 +272,10 @@ class UnifiedScraper:
|
||||
return []
|
||||
|
||||
# Load data files
|
||||
with open(docs_data['data_file'], 'r') as f:
|
||||
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
docs_json = json.load(f)
|
||||
|
||||
with open(github_data['data_file'], 'r') as f:
|
||||
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
github_json = json.load(f)
|
||||
|
||||
# Detect conflicts
|
||||
@@ -322,10 +322,10 @@ class UnifiedScraper:
|
||||
github_data = self.scraped_data.get('github', {})
|
||||
|
||||
# Load data
|
||||
with open(docs_data['data_file'], 'r') as f:
|
||||
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
docs_json = json.load(f)
|
||||
|
||||
with open(github_data['data_file'], 'r') as f:
|
||||
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
|
||||
github_json = json.load(f)
|
||||
|
||||
# Choose merger
|
||||
@@ -339,7 +339,7 @@ class UnifiedScraper:
|
||||
|
||||
# Save merged data
|
||||
merged_file = os.path.join(self.data_dir, 'merged_data.json')
|
||||
with open(merged_file, 'w') as f:
|
||||
with open(merged_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(merged_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
logger.info(f"✅ Merged data saved: {merged_file}")
|
||||
@@ -361,7 +361,7 @@ class UnifiedScraper:
|
||||
conflicts = []
|
||||
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
|
||||
if os.path.exists(conflicts_file):
|
||||
with open(conflicts_file, 'r') as f:
|
||||
with open(conflicts_file, 'r', encoding='utf-8') as f:
|
||||
conflicts_data = json.load(f)
|
||||
conflicts = conflicts_data.get('conflicts', [])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user