279 lines
9.1 KiB
Python
Executable File
279 lines
9.1 KiB
Python
Executable File
# /// script
|
|
# requires-python = ">=3.11"
|
|
# dependencies = []
|
|
# ///
|
|
"""
|
|
Parse complex xlsx/xlsm files using stdlib zipfile + xml.etree.
|
|
|
|
No external dependencies required — uses only Python standard library.
|
|
|
|
Usage:
|
|
uv run scripts/parse_complex_excel.py <excel_file> [sheet_name]
|
|
|
|
This handles files that openpyxl cannot open (corrupted DefinedNames,
|
|
complex VBA macros, investment bank financial models).
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
# XML namespaces used in Office Open XML
|
|
MAIN_NS = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
|
|
REL_NS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
|
|
RELS_NS = 'http://schemas.openxmlformats.org/package/2006/relationships'
|
|
|
|
|
|
def verify_format(file_path: str) -> str:
|
|
"""Verify actual file format using the `file` command."""
|
|
result = subprocess.run(
|
|
['file', '--brief', file_path],
|
|
capture_output=True, text=True
|
|
)
|
|
return result.stdout.strip()
|
|
|
|
|
|
def list_sheets(zf: zipfile.ZipFile) -> list[dict]:
|
|
"""List all sheet names and their physical XML paths."""
|
|
wb_xml = ET.fromstring(zf.read('xl/workbook.xml'))
|
|
sheets_el = wb_xml.findall(f'.//{{{MAIN_NS}}}sheet')
|
|
|
|
rels_xml = ET.fromstring(zf.read('xl/_rels/workbook.xml.rels'))
|
|
rid_to_path = {}
|
|
for rel in rels_xml.findall(f'{{{RELS_NS}}}Relationship'):
|
|
rid_to_path[rel.get('Id')] = 'xl/' + rel.get('Target')
|
|
|
|
sheets = []
|
|
for s in sheets_el:
|
|
name = s.get('name')
|
|
rid = s.get(f'{{{REL_NS}}}id')
|
|
path = rid_to_path.get(rid, '?')
|
|
sheets.append({'name': name, 'rId': rid, 'path': path})
|
|
|
|
return sheets
|
|
|
|
|
|
def get_sheet_path(zf: zipfile.ZipFile, sheet_name: str) -> str:
|
|
"""Resolve a sheet name to its physical XML path inside the ZIP."""
|
|
# Step 1: workbook.xml — find rId for the named sheet
|
|
wb_xml = ET.fromstring(zf.read('xl/workbook.xml'))
|
|
sheets = wb_xml.findall(f'.//{{{MAIN_NS}}}sheet')
|
|
rid = None
|
|
for s in sheets:
|
|
if s.get('name') == sheet_name:
|
|
rid = s.get(f'{{{REL_NS}}}id')
|
|
break
|
|
if not rid:
|
|
available = [s.get('name') for s in sheets]
|
|
raise ValueError(
|
|
f"Sheet '{sheet_name}' not found. Available: {available}"
|
|
)
|
|
|
|
# Step 2: workbook.xml.rels — map rId to file path
|
|
rels_xml = ET.fromstring(zf.read('xl/_rels/workbook.xml.rels'))
|
|
for rel in rels_xml.findall(f'{{{RELS_NS}}}Relationship'):
|
|
if rel.get('Id') == rid:
|
|
return 'xl/' + rel.get('Target')
|
|
|
|
raise ValueError(f"No file mapping for {rid}")
|
|
|
|
|
|
def build_shared_strings(zf: zipfile.ZipFile) -> list[str]:
|
|
"""Build the shared strings lookup table."""
|
|
shared = []
|
|
try:
|
|
ss_xml = ET.fromstring(zf.read('xl/sharedStrings.xml'))
|
|
for si in ss_xml.findall(f'{{{MAIN_NS}}}si'):
|
|
shared.append(''.join(si.itertext()))
|
|
except KeyError:
|
|
pass # No shared strings in this file
|
|
return shared
|
|
|
|
|
|
def parse_cell_ref(ref: str) -> tuple[str, int]:
|
|
"""Parse 'AB123' into ('AB', 123)."""
|
|
match = re.match(r'^([A-Z]+)(\d+)$', ref)
|
|
if not match:
|
|
return ref, 0
|
|
return match.group(1), int(match.group(2))
|
|
|
|
|
|
def extract_cells(zf: zipfile.ZipFile, sheet_path: str,
|
|
shared: list[str]) -> dict[str, any]:
|
|
"""Extract all cell values from a sheet XML."""
|
|
sheet_xml = ET.fromstring(zf.read(sheet_path))
|
|
rows = sheet_xml.findall(f'.//{{{MAIN_NS}}}row')
|
|
|
|
data = {}
|
|
for row in rows:
|
|
for cell in row.findall(f'{{{MAIN_NS}}}c'):
|
|
ref = cell.get('r')
|
|
cell_type = cell.get('t') # "s" = shared string, None = number
|
|
val_el = cell.find(f'{{{MAIN_NS}}}v')
|
|
|
|
if val_el is not None and val_el.text:
|
|
if cell_type == 's':
|
|
idx = int(val_el.text)
|
|
data[ref] = shared[idx] if idx < len(shared) else f'[SSI:{idx}]'
|
|
elif cell_type == 'b':
|
|
data[ref] = bool(int(val_el.text))
|
|
else:
|
|
try:
|
|
num = float(val_el.text)
|
|
data[ref] = int(num) if num == int(num) else num
|
|
except ValueError:
|
|
data[ref] = val_el.text
|
|
|
|
return data
|
|
|
|
|
|
def extract_rows(cells: dict, start_row: int = 1,
|
|
end_row: int | None = None) -> list[dict]:
|
|
"""Organize cells into row-based structure for easier consumption."""
|
|
# Determine row range
|
|
all_rows = set()
|
|
for ref in cells:
|
|
_, row_num = parse_cell_ref(ref)
|
|
if row_num > 0:
|
|
all_rows.add(row_num)
|
|
|
|
if not all_rows:
|
|
return []
|
|
|
|
start = max(start_row, min(all_rows))
|
|
end = min(end_row, max(all_rows)) if end_row else max(all_rows)
|
|
|
|
rows = []
|
|
for r in range(start, end + 1):
|
|
row_cells = {
|
|
ref: val for ref, val in cells.items()
|
|
if parse_cell_ref(ref)[1] == r
|
|
}
|
|
if row_cells:
|
|
rows.append({'row': r, 'cells': row_cells})
|
|
|
|
return rows
|
|
|
|
|
|
def fix_defined_names(input_path: str, output_path: str) -> int:
|
|
"""
|
|
Remove corrupted DefinedNames entries (containing "Formula removed")
|
|
and repackage the file.
|
|
|
|
Returns the number of removed entries.
|
|
"""
|
|
import shutil
|
|
import tempfile
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_str:
|
|
tmp = Path(tmp_str)
|
|
|
|
# Extract
|
|
with zipfile.ZipFile(input_path, 'r') as zf:
|
|
zf.extractall(tmp)
|
|
|
|
# Fix workbook.xml
|
|
wb_path = tmp / 'xl' / 'workbook.xml'
|
|
tree = ET.parse(wb_path)
|
|
root = tree.getroot()
|
|
|
|
ns = {'main': MAIN_NS}
|
|
defined_names = root.find('.//main:definedNames', ns)
|
|
removed = 0
|
|
if defined_names is not None:
|
|
for name in list(defined_names):
|
|
if name.text and "Formula removed" in name.text:
|
|
defined_names.remove(name)
|
|
removed += 1
|
|
|
|
tree.write(wb_path, encoding='utf-8', xml_declaration=True)
|
|
|
|
# Repackage
|
|
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
|
for fp in tmp.rglob('*'):
|
|
if fp.is_file():
|
|
zf.write(fp, fp.relative_to(tmp))
|
|
|
|
return removed
|
|
|
|
|
|
# ── CLI Entry Point ──────────────────────────────────────────────────
|
|
|
|
def main():
|
|
if len(sys.argv) < 2:
|
|
print("Usage: parse_complex_excel.py <excel_file> [sheet_name]")
|
|
print("\nExamples:")
|
|
print(" parse_complex_excel.py model.xlsm # List all sheets")
|
|
print(" parse_complex_excel.py model.xlsm DCF # Extract DCF sheet")
|
|
print(" parse_complex_excel.py model.xlsm --fix # Fix corrupted names")
|
|
sys.exit(1)
|
|
|
|
file_path = sys.argv[1]
|
|
path = Path(file_path)
|
|
|
|
if not path.exists():
|
|
print(f"File not found: {file_path}")
|
|
sys.exit(1)
|
|
|
|
# Verify format
|
|
fmt = verify_format(file_path)
|
|
print(f"File: {path.name}")
|
|
print(f"Format: {fmt}")
|
|
|
|
# "Microsoft Excel 2007+" = ZIP-based xlsx/xlsm
|
|
# "Zip archive" = generic ZIP (also valid)
|
|
# "Composite Document File" = old BIFF .xls format
|
|
is_zip_based = any(kw in fmt.lower() for kw in ['zip', 'excel 2007', 'ooxml'])
|
|
if not is_zip_based:
|
|
print("WARNING: File is not ZIP-based xlsx/xlsm.")
|
|
if 'composite' in fmt.lower() or 'biff' in fmt.lower():
|
|
print("This appears to be an old .xls (BIFF format). Use xlrd instead.")
|
|
else:
|
|
print(f"Unexpected format. If it should be xlsx/xlsm, check the file.")
|
|
sys.exit(1)
|
|
|
|
# Handle --fix flag
|
|
if len(sys.argv) > 2 and sys.argv[2] == '--fix':
|
|
out_path = str(path.with_stem(path.stem + '_fixed'))
|
|
removed = fix_defined_names(file_path, out_path)
|
|
print(f"Removed {removed} corrupted DefinedNames entries.")
|
|
print(f"Fixed file: {out_path}")
|
|
sys.exit(0)
|
|
|
|
with zipfile.ZipFile(file_path, 'r') as zf:
|
|
# List sheets
|
|
sheets = list_sheets(zf)
|
|
print(f"\nSheets ({len(sheets)}):")
|
|
for i, s in enumerate(sheets, 1):
|
|
print(f" {i}. {s['name']} → {s['path']}")
|
|
|
|
# If sheet name given, extract it
|
|
if len(sys.argv) > 2:
|
|
sheet_name = sys.argv[2]
|
|
print(f"\nExtracting sheet: {sheet_name}")
|
|
|
|
sheet_path = get_sheet_path(zf, sheet_name)
|
|
shared = build_shared_strings(zf)
|
|
cells = extract_cells(zf, sheet_path, shared)
|
|
|
|
print(f"Total cells: {len(cells)}")
|
|
|
|
# Show first 20 rows
|
|
rows = extract_rows(cells, start_row=1, end_row=20)
|
|
for row in rows:
|
|
print(f" Row {row['row']:3d}: ", end="")
|
|
items = sorted(row['cells'].items(),
|
|
key=lambda x: parse_cell_ref(x[0]))
|
|
for ref, val in items[:8]:
|
|
val_str = str(val)[:25]
|
|
print(f"{ref}={val_str} ", end="")
|
|
print()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|