Files
claude-code-skills-reference/excel-automation/scripts/parse_complex_excel.py

279 lines
9.1 KiB
Python
Executable File

# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
Parse complex xlsx/xlsm files using stdlib zipfile + xml.etree.
No external dependencies required — uses only Python standard library.
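Usage:
    uv run scripts/parse_complex_excel.py <excel_file> [sheet_name]
    uv run scripts/parse_complex_excel.py <excel_file> --fix
Without a sheet name the script only lists the sheets. With --fix it writes a
copy named <stem>_fixed with corrupted DefinedNames entries removed.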
This handles files that openpyxl cannot open (corrupted DefinedNames,
complex VBA macros, investment bank financial models).
"""
import json
import re
import subprocess
import sys
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
# XML namespaces used in Office Open XML
# Main SpreadsheetML namespace: qualifies the element tags this script looks up
# in the workbook, sheet and shared-string parts (sheet, row, c, v, si,
# definedNames).
MAIN_NS = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
# Namespace of the `id` attribute that <sheet> elements use to point at a
# relationship entry.
REL_NS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# Namespace of the <Relationship> elements in xl/_rels/workbook.xml.rels.
RELS_NS = 'http://schemas.openxmlformats.org/package/2006/relationships'
def _sniff_magic(file_path: str) -> str:
    """Classify a file by its leading magic bytes.

    Fallback used when the ``file`` utility is unavailable. The returned
    descriptions deliberately reuse the wording ``file`` prints, so keyword
    checks on the result ("zip", "composite") keep working.

    Returns an empty string when the file cannot be read.
    """
    try:
        with open(file_path, 'rb') as fh:
            head = fh.read(8)
    except OSError:
        return ''
    # Local-file header, empty-archive and spanned-archive ZIP signatures.
    if head.startswith((b'PK\x03\x04', b'PK\x05\x06', b'PK\x07\x08')):
        return 'Zip archive data'
    # OLE2 compound document signature (legacy .xls / BIFF containers).
    if head == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
        return 'Composite Document File V2 Document'
    return 'data'


def verify_format(file_path: str) -> str:
    """Describe the actual on-disk format of *file_path*.

    The system ``file`` utility is tried first. If it is not installed, cannot
    be executed, or prints nothing, the leading magic bytes of the file are
    inspected instead so the script still works on systems without ``file``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A human-readable format description (possibly empty if the file is
        unreadable and ``file`` is unavailable).
    """
    try:
        result = subprocess.run(
            # `--` stops option parsing so a path starting with '-' is not
            # mistaken for a command-line flag.
            ['file', '--brief', '--', file_path],
            capture_output=True, text=True,
        )
    except OSError:
        # `file` is missing (e.g. stock Windows) or not executable.
        description = ''
    else:
        description = result.stdout.strip()

    if description:
        return description
    return _sniff_magic(file_path)
def list_sheets(zf: zipfile.ZipFile) -> list[dict]:
    """List every sheet in the workbook with its physical XML path.

    Sheet names and relationship ids come from ``xl/workbook.xml``; the ids
    are resolved to part paths through ``xl/_rels/workbook.xml.rels``.

    Args:
        zf: Open xlsx/xlsm archive.

    Returns:
        One dict per sheet, in workbook order, with keys ``name``, ``rId`` and
        ``path``. ``path`` is ``'?'`` when the relationship id has no entry in
        the rels file.
    """
    import posixpath

    workbook = ET.fromstring(zf.read('xl/workbook.xml'))
    rels = ET.fromstring(zf.read('xl/_rels/workbook.xml.rels'))

    rid_to_path = {}
    for rel in rels.findall(f'{{{RELS_NS}}}Relationship'):
        target = rel.get('Target') or ''
        if target.startswith('/'):
            # Package-absolute target: already rooted at the ZIP top level,
            # so it must not be prefixed with 'xl/' again.
            part = target.lstrip('/')
        else:
            # Relative targets are resolved against the workbook's folder;
            # normpath collapses any '../' segments.
            part = posixpath.normpath(posixpath.join('xl', target))
        rid_to_path[rel.get('Id')] = part

    rid_attr = f'{{{REL_NS}}}id'
    sheets = []
    for sheet in workbook.findall(f'.//{{{MAIN_NS}}}sheet'):
        rid = sheet.get(rid_attr)
        sheets.append({
            'name': sheet.get('name'),
            'rId': rid,
            'path': rid_to_path.get(rid, '?'),
        })
    return sheets
def get_sheet_path(zf: zipfile.ZipFile, sheet_name: str) -> str:
    """Resolve a sheet name to its physical XML path inside the ZIP.

    Args:
        zf: Open xlsx/xlsm archive.
        sheet_name: Sheet name exactly as stored in ``xl/workbook.xml``.

    Returns:
        The archive member path of the sheet's XML part.

    Raises:
        ValueError: If no sheet has that name (the message lists the available
            names), or if the sheet's relationship id has no entry in
            ``xl/_rels/workbook.xml.rels``.
    """
    import posixpath

    # Step 1: workbook.xml — find the relationship id of the named sheet.
    workbook = ET.fromstring(zf.read('xl/workbook.xml'))
    sheets = workbook.findall(f'.//{{{MAIN_NS}}}sheet')
    rid_attr = f'{{{REL_NS}}}id'
    rid = next(
        (s.get(rid_attr) for s in sheets if s.get('name') == sheet_name),
        None,
    )
    if not rid:
        available = [s.get('name') for s in sheets]
        raise ValueError(
            f"Sheet '{sheet_name}' not found. Available: {available}"
        )

    # Step 2: workbook.xml.rels — map the relationship id to a part path.
    rels = ET.fromstring(zf.read('xl/_rels/workbook.xml.rels'))
    for rel in rels.findall(f'{{{RELS_NS}}}Relationship'):
        if rel.get('Id') != rid:
            continue
        target = rel.get('Target') or ''
        if target.startswith('/'):
            # Package-absolute target: already rooted at the ZIP top level.
            return target.lstrip('/')
        # Relative target: resolve against the workbook's own folder.
        return posixpath.normpath(posixpath.join('xl', target))
    raise ValueError(f"No file mapping for {rid}")
def build_shared_strings(zf: zipfile.ZipFile) -> list[str]:
    """Build the shared strings lookup table.

    Each ``<si>`` item becomes one entry, indexed in document order. Only the
    visible text is kept: the item's direct ``<t>`` plus the ``<t>`` of each
    rich-text run ``<r>``. Phonetic runs (``<rPh>``) and whitespace between
    elements are excluded, so they no longer leak into cell values.

    Args:
        zf: Open xlsx/xlsm archive.

    Returns:
        The shared strings, or an empty list when the archive has no
        ``xl/sharedStrings.xml`` part.
    """
    try:
        raw = zf.read('xl/sharedStrings.xml')
    except KeyError:
        return []  # No shared strings in this file

    t_tag = f'{{{MAIN_NS}}}t'
    run_text_path = f'{{{MAIN_NS}}}r/{t_tag}'

    shared = []
    for si in ET.fromstring(raw).findall(f'{{{MAIN_NS}}}si'):
        text_nodes = si.findall(t_tag) + si.findall(run_text_path)
        # An empty <t/> has text None; treat it as an empty string.
        shared.append(''.join(node.text or '' for node in text_nodes))
    return shared
def parse_cell_ref(ref: str) -> tuple[str, int]:
    """Split an A1-style reference into column letters and row number.

    'AB123' becomes ('AB', 123). A reference that is not uppercase letters
    followed by digits is returned unchanged with row number 0.
    """
    parsed = re.match(r'^([A-Z]+)(\d+)$', ref)
    if parsed is None:
        return ref, 0
    column, row_digits = parsed.groups()
    return column, int(row_digits)
def extract_cells(zf: zipfile.ZipFile, sheet_path: str,
                  shared: list[str]) -> dict[str, int | float | bool | str]:
    """Extract all cell values from a sheet XML.

    Args:
        zf: Open xlsx/xlsm archive.
        sheet_path: Archive member path of the sheet part.
        shared: Shared strings table used to resolve ``t="s"`` cells.

    Returns:
        Mapping of cell reference (e.g. ``'B7'``) to value. Cells with no
        value are omitted. Values are decoded by the cell's ``t`` attribute:

        * ``s``: shared string lookup; an out-of-range index yields the
          placeholder ``'[SSI:<idx>]'``.
        * ``inlineStr``: text stored inside the cell's ``<is>`` element.
        * ``b``: bool.
        * ``str`` / ``e``: formula string result / error code, kept as text
          even when it looks numeric.
        * anything else: int when the number is integral, otherwise float,
          falling back to the raw text when it is not a finite number.
    """
    row_tag = f'{{{MAIN_NS}}}row'
    cell_tag = f'{{{MAIN_NS}}}c'
    value_tag = f'{{{MAIN_NS}}}v'
    inline_tag = f'{{{MAIN_NS}}}is'
    t_tag = f'{{{MAIN_NS}}}t'
    run_text_path = f'{{{MAIN_NS}}}r/{t_tag}'

    sheet_xml = ET.fromstring(zf.read(sheet_path))
    data = {}
    for row in sheet_xml.findall(f'.//{row_tag}'):
        for cell in row.findall(cell_tag):
            ref = cell.get('r')
            if ref is None:
                # Without a reference the cell cannot be addressed; skip it
                # rather than store it under a None key.
                continue
            cell_type = cell.get('t')  # "s" = shared string, None = number

            if cell_type == 'inlineStr':
                # Inline strings keep their text in <is>, not in <v>.
                inline = cell.find(inline_tag)
                if inline is not None:
                    nodes = inline.findall(t_tag) + inline.findall(run_text_path)
                    text = ''.join(node.text or '' for node in nodes)
                    if text:
                        data[ref] = text
                continue

            val_el = cell.find(value_tag)
            if val_el is None or not val_el.text:
                continue
            raw = val_el.text

            if cell_type == 's':
                idx = int(raw)
                # Guard both ends: a negative index would silently wrap.
                data[ref] = shared[idx] if 0 <= idx < len(shared) else f'[SSI:{idx}]'
            elif cell_type == 'b':
                data[ref] = bool(int(raw))
            elif cell_type in ('str', 'e'):
                # Text by declaration; do not coerce e.g. "007" into 7.
                data[ref] = raw
            else:
                try:
                    num = float(raw)
                    data[ref] = int(num) if num == int(num) else num
                except (ValueError, OverflowError):
                    # Not a number, or nan/inf which int() cannot convert.
                    data[ref] = raw
    return data
def extract_rows(cells: dict, start_row: int = 1,
                 end_row: int | None = None) -> list[dict]:
    """Organize cells into a row-based structure for easier consumption.

    Cells are grouped by row in a single pass over *cells*, instead of
    rescanning every cell once per row in the requested range.

    Args:
        cells: Mapping of cell reference to value.
        start_row: First row to include (1-based).
        end_row: Last row to include. ``None`` or ``0`` means no upper bound.

    Returns:
        A list of ``{'row': <number>, 'cells': {ref: value}}`` dicts in
        ascending row order. Rows with no cells are omitted, as are references
        that do not parse to a positive row number.
    """
    by_row: dict[int, dict] = {}
    for ref, val in cells.items():
        row_num = parse_cell_ref(ref)[1]
        if row_num > 0:
            by_row.setdefault(row_num, {})[ref] = val

    if not by_row:
        return []

    # Clamp the requested range to the rows that actually hold data.
    start = max(start_row, min(by_row))
    end = min(end_row, max(by_row)) if end_row else max(by_row)

    return [
        {'row': r, 'cells': by_row[r]}
        for r in sorted(by_row)
        if start <= r <= end
    ]
def fix_defined_names(input_path: str, output_path: str) -> int:
    """Remove corrupted DefinedNames entries and repackage the file.

    An entry is considered corrupted when its text contains
    "Formula removed". The archive is unpacked to a temporary directory,
    ``xl/workbook.xml`` is patched, and everything is zipped into
    *output_path*.

    ``workbook.xml`` is rewritten only when at least one entry was removed,
    so a clean workbook is copied through untouched. Before rewriting, the
    namespace prefixes declared in the original document are registered with
    ElementTree so it serializes with those prefixes instead of generated
    ``ns0``/``ns1`` ones. Note that ``ET.register_namespace`` changes
    process-wide ElementTree state.

    Args:
        input_path: Path of the xlsx/xlsm file to repair.
        output_path: Path of the repaired archive to create.

    Returns:
        The number of removed entries.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_str:
        tmp = Path(tmp_str)

        # Extract
        with zipfile.ZipFile(input_path, 'r') as zf:
            zf.extractall(tmp)

        wb_path = tmp / 'xl' / 'workbook.xml'

        # Register the document's own prefixes so serialization keeps them.
        for _event, (prefix, uri) in ET.iterparse(wb_path, events=('start-ns',)):
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # Prefixes shaped like "ns0" are reserved by ElementTree;
                # such a namespace simply gets a generated prefix.
                continue

        # Fix workbook.xml
        tree = ET.parse(wb_path)
        defined_names = tree.getroot().find(
            './/main:definedNames', {'main': MAIN_NS}
        )
        removed = 0
        if defined_names is not None:
            # Iterate over a copy: elements are removed during the loop.
            for name in list(defined_names):
                if name.text and "Formula removed" in name.text:
                    defined_names.remove(name)
                    removed += 1
        if removed:
            tree.write(wb_path, encoding='utf-8', xml_declaration=True)

        # Repackage (sorted so the archive layout is deterministic)
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for fp in sorted(tmp.rglob('*')):
                if fp.is_file():
                    zf.write(fp, fp.relative_to(tmp).as_posix())
    return removed
# ── CLI Entry Point ──────────────────────────────────────────────────
def main():
    """Command-line entry point.

    Modes, selected by ``sys.argv``:

    * ``<excel_file>``: print the detected format and list all sheets.
    * ``<excel_file> <sheet_name>``: additionally print the first 20 rows of
      that sheet (at most 8 cells per row, values truncated to 25 chars).
    * ``<excel_file> --fix``: write ``<stem>_fixed`` with corrupted
      DefinedNames entries removed.

    Exits with status 1 on usage errors, a missing file, a file that is not
    ZIP-based, or an unknown sheet name.
    """
    if len(sys.argv) < 2:
        print("Usage: parse_complex_excel.py <excel_file> [sheet_name]")
        print("\nExamples:")
        print(" parse_complex_excel.py model.xlsm # List all sheets")
        print(" parse_complex_excel.py model.xlsm DCF # Extract DCF sheet")
        print(" parse_complex_excel.py model.xlsm --fix # Fix corrupted names")
        sys.exit(1)

    file_path = sys.argv[1]
    path = Path(file_path)
    if not path.exists():
        print(f"File not found: {file_path}")
        sys.exit(1)

    # Verify format
    fmt = verify_format(file_path)
    print(f"File: {path.name}")
    print(f"Format: {fmt}")

    # "Microsoft Excel 2007+" = ZIP-based xlsx/xlsm
    # "Zip archive" = generic ZIP (also valid)
    # "Composite Document File" = old BIFF .xls format
    fmt_lower = fmt.lower()
    is_zip_based = any(kw in fmt_lower for kw in ['zip', 'excel 2007', 'ooxml'])
    if not is_zip_based:
        print("WARNING: File is not ZIP-based xlsx/xlsm.")
        if 'composite' in fmt_lower or 'biff' in fmt_lower:
            print("This appears to be an old .xls (BIFF format). Use xlrd instead.")
        else:
            print("Unexpected format. If it should be xlsx/xlsm, check the file.")
        sys.exit(1)

    # Handle --fix flag
    if len(sys.argv) > 2 and sys.argv[2] == '--fix':
        out_path = str(path.with_stem(path.stem + '_fixed'))
        removed = fix_defined_names(file_path, out_path)
        print(f"Removed {removed} corrupted DefinedNames entries.")
        print(f"Fixed file: {out_path}")
        sys.exit(0)

    with zipfile.ZipFile(file_path, 'r') as zf:
        # List sheets
        sheets = list_sheets(zf)
        print(f"\nSheets ({len(sheets)}):")
        for i, s in enumerate(sheets, 1):
            # Separator keeps the sheet name and its part path readable.
            print(f" {i}. {s['name']} -> {s['path']}")

        # If sheet name given, extract it
        if len(sys.argv) > 2:
            sheet_name = sys.argv[2]
            print(f"\nExtracting sheet: {sheet_name}")
            try:
                sheet_path = get_sheet_path(zf, sheet_name)
            except ValueError as err:
                # Unknown sheet: show the message (it lists the available
                # names) instead of a traceback.
                print(f"Error: {err}")
                sys.exit(1)
            shared = build_shared_strings(zf)
            cells = extract_cells(zf, sheet_path, shared)
            print(f"Total cells: {len(cells)}")

            def column_order(item):
                # Shorter column names come first so that Z sorts before AA;
                # a plain string sort would put AA before B.
                col = parse_cell_ref(item[0])[0]
                return len(col), col

            # Show first 20 rows
            rows = extract_rows(cells, start_row=1, end_row=20)
            for row in rows:
                print(f" Row {row['row']:3d}: ", end="")
                items = sorted(row['cells'].items(), key=column_order)
                for ref, val in items[:8]:
                    val_str = str(val)[:25]
                    print(f"{ref}={val_str} ", end="")
                print()
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()