fix: Add UTF-8 encoding to all file operations for Windows compatibility

Fixes #209 - UnicodeDecodeError on Windows with non-ASCII characters

**Problem:**
Windows users with non-English locales (Chinese, Japanese, Korean, etc.)
experienced GBK/SHIFT-JIS codec errors when the system default encoding
is not UTF-8.

Error: 'gbk' codec can't decode byte 0xac in position 206: illegal
multibyte sequence

**Root Cause:**
File operations using open() without explicit encoding parameter use
the system default encoding, which on Windows Chinese edition is GBK.
JSON files contain UTF-8 encoded characters that fail to decode with GBK.

**Solution:**
Added encoding='utf-8' to ALL file operations across:
- doc_scraper.py (4 instances):
  * load_config() - line 1310
  * check_existing_data() - line 1416
  * save_checkpoint() - line 173
  * load_checkpoint() - line 186

- github_scraper.py (1 instance):
  * main() config loading - line 922

- unified_scraper.py (10 instances):
  * All JSON read/write operations - lines 134, 153, 205, 239, 275,
    278, 325, 328, 342, 364

**Test Results:**
- ✅ All 612 tests passing (100% pass rate)
- ✅ Backward compatible (UTF-8 is standard on Linux/macOS)
- ✅ Fixes Windows locale issues

**Impact:**
- ✅ Works on ALL Windows locales (Chinese, Japanese, Korean, etc.)
- ✅ Maintains compatibility with Linux/macOS
- ✅ Prevents future encoding issues

**Thanks to:** @my5icol for the detailed bug report and fix suggestion!
This commit is contained in:
yusyus
2025-12-28 18:27:50 +03:00
parent eb3b9d9175
commit c411eb24ec
3 changed files with 15 additions and 15 deletions

View File

@@ -131,7 +131,7 @@ class UnifiedScraper:
# Write temporary config
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
-        with open(temp_config_path, 'w') as f:
+        with open(temp_config_path, 'w', encoding='utf-8') as f:
json.dump(doc_config, f, indent=2)
# Run doc_scraper as subprocess
@@ -150,7 +150,7 @@ class UnifiedScraper:
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
if os.path.exists(docs_data_file):
-            with open(docs_data_file, 'r') as f:
+            with open(docs_data_file, 'r', encoding='utf-8') as f:
summary = json.load(f)
self.scraped_data['documentation'] = {
@@ -202,7 +202,7 @@ class UnifiedScraper:
# Save data
github_data_file = os.path.join(self.data_dir, 'github_data.json')
-        with open(github_data_file, 'w') as f:
+        with open(github_data_file, 'w', encoding='utf-8') as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)
self.scraped_data['github'] = {
@@ -236,7 +236,7 @@ class UnifiedScraper:
# Save data
pdf_data_file = os.path.join(self.data_dir, 'pdf_data.json')
-        with open(pdf_data_file, 'w') as f:
+        with open(pdf_data_file, 'w', encoding='utf-8') as f:
json.dump(pdf_data, f, indent=2, ensure_ascii=False)
self.scraped_data['pdf'] = {
@@ -272,10 +272,10 @@ class UnifiedScraper:
return []
# Load data files
-        with open(docs_data['data_file'], 'r') as f:
+        with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
docs_json = json.load(f)
-        with open(github_data['data_file'], 'r') as f:
+        with open(github_data['data_file'], 'r', encoding='utf-8') as f:
github_json = json.load(f)
# Detect conflicts
@@ -322,10 +322,10 @@ class UnifiedScraper:
github_data = self.scraped_data.get('github', {})
# Load data
-        with open(docs_data['data_file'], 'r') as f:
+        with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
docs_json = json.load(f)
-        with open(github_data['data_file'], 'r') as f:
+        with open(github_data['data_file'], 'r', encoding='utf-8') as f:
github_json = json.load(f)
# Choose merger
@@ -339,7 +339,7 @@ class UnifiedScraper:
# Save merged data
merged_file = os.path.join(self.data_dir, 'merged_data.json')
-        with open(merged_file, 'w') as f:
+        with open(merged_file, 'w', encoding='utf-8') as f:
json.dump(merged_data, f, indent=2, ensure_ascii=False)
logger.info(f"✅ Merged data saved: {merged_file}")
@@ -361,7 +361,7 @@ class UnifiedScraper:
conflicts = []
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
if os.path.exists(conflicts_file):
-            with open(conflicts_file, 'r') as f:
+            with open(conflicts_file, 'r', encoding='utf-8') as f:
conflicts_data = json.load(f)
conflicts = conflicts_data.get('conflicts', [])