Import the official Hugging Face ecosystem skills and sync the
existing local coverage with upstream metadata and assets.

Regenerate the canonical catalog, plugin mirrors, docs, and release
notes after the maintainer merge batch so main stays in sync.

Fixes #417
815 lines
31 KiB
Python
815 lines
31 KiB
Python
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
|
|
"""
|
|
Dataset Format Inspector for Vision Model Training

Inspects Hugging Face datasets to determine compatibility with object detection,
image classification, and SAM segmentation training.
Uses Datasets Server API for instant results - no dataset download needed!

ULTRA-EFFICIENT: Uses HF Datasets Server API - completes in <2 seconds.

Usage with HF Jobs:
    hf_jobs("uv", {
        "script": "path/to/dataset_inspector.py",
        "script_args": ["--dataset", "your/dataset", "--split", "train"]
    })
|
|
"""
|
|
|
|
import argparse
import json
import math
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import List, Dict, Any, Tuple
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description="Inspect dataset format for vision model training")
|
|
parser.add_argument("--dataset", type=str, required=True, help="Dataset name")
|
|
parser.add_argument("--split", type=str, default="train", help="Dataset split (default: train)")
|
|
parser.add_argument("--config", type=str, default="default", help="Dataset config name (default: default)")
|
|
parser.add_argument("--preview", type=int, default=150, help="Max chars per field preview")
|
|
parser.add_argument("--samples", type=int, default=5, help="Number of samples to fetch (default: 5)")
|
|
parser.add_argument("--json-output", action="store_true", help="Output as JSON")
|
|
return parser.parse_args()
|
|
|
|
|
|
def api_request(url: str) -> Dict:
|
|
"""Make API request to Datasets Server"""
|
|
try:
|
|
with urllib.request.urlopen(url, timeout=10) as response:
|
|
return json.loads(response.read().decode())
|
|
except urllib.error.HTTPError as e:
|
|
if e.code == 404:
|
|
return None
|
|
raise Exception(f"API request failed: {e.code} {e.reason}")
|
|
except Exception as e:
|
|
raise Exception(f"API request failed: {str(e)}")
|
|
|
|
|
|
def get_splits(dataset: str) -> Dict[str, Any] | None:
    """Request the ``/splits`` listing for ``dataset`` from the Datasets Server.

    Returns whatever ``api_request`` returns for that URL: the decoded JSON
    payload, or ``None`` on a 404.
    """
    url = f"https://datasets-server.huggingface.co/splits?dataset={urllib.parse.quote(dataset)}"
    return api_request(url)
|
|
|
|
|
|
def get_rows(dataset: str, config: str, split: str, offset: int = 0, length: int = 5) -> Dict[str, Any] | None:
    """Request a window of rows from the Datasets Server ``/rows`` endpoint.

    Args:
        dataset: Dataset repository name.
        config: Dataset config name.
        split: Split name.
        offset: Index of the first row to return.
        length: Number of rows to request.

    Returns:
        Whatever ``api_request`` returns for the built URL: the decoded JSON
        payload, or ``None`` on a 404.
    """
    quote = urllib.parse.quote
    # config and split are user-supplied and may contain characters such as
    # '&', '=' or spaces; percent-encode them so they cannot alter the query.
    url = (
        "https://datasets-server.huggingface.co/rows"
        f"?dataset={quote(dataset)}"
        f"&config={quote(str(config), safe='')}"
        f"&split={quote(str(split), safe='')}"
        f"&offset={offset}&length={length}"
    )
    return api_request(url)
|
|
|
|
|
|
def find_columns(columns: List[str], patterns: List[str]) -> List[str]:
    """Return the columns whose lower-cased name contains any of ``patterns``.

    Matching is by substring, so pattern ``"ann"`` also matches a column named
    ``"channel"``. The input order of ``columns`` is preserved.
    """
    return [c for c in columns if any(p in c.lower() for p in patterns)]
|
|
|
|
|
|
def detect_bbox_format(bbox: List[float], image_size: Tuple[int, int] = None) -> str:
|
|
"""
|
|
Detect bounding box format based on values and optionally image dimensions.
|
|
Common formats:
|
|
- [x_min, y_min, x_max, y_max] - XYXY (Pascal VOC)
|
|
- [x_min, y_min, width, height] - XYWH (COCO)
|
|
- [x_center, y_center, width, height] - CXCYWH (YOLO normalized)
|
|
"""
|
|
if len(bbox) != 4:
|
|
return "unknown (not 4 values)"
|
|
|
|
a, b, c, d = bbox
|
|
|
|
is_normalized = all(0 <= v <= 1 for v in bbox)
|
|
|
|
if c < a or d < b:
|
|
if is_normalized:
|
|
return "xywh_normalized"
|
|
return "xywh (COCO style)"
|
|
|
|
# c > a and d > b — ambiguous between xyxy and xywh.
|
|
# Use image dimensions to disambiguate when available.
|
|
if image_size is not None:
|
|
img_w, img_h = image_size
|
|
# If interpreting as xywh, right edge = a + c; if that overshoots the
|
|
# image while c alone fits, the format is more likely xyxy.
|
|
xywh_exceeds = (a + c > img_w * 1.05) or (b + d > img_h * 1.05)
|
|
xyxy_exceeds = (c > img_w * 1.05) or (d > img_h * 1.05)
|
|
if xywh_exceeds and not xyxy_exceeds:
|
|
return "xyxy (Pascal VOC style)"
|
|
if xyxy_exceeds and not xywh_exceeds:
|
|
return "xywh (COCO style)"
|
|
|
|
if is_normalized:
|
|
return "xyxy_normalized"
|
|
return "xyxy (Pascal VOC style)"
|
|
|
|
|
|
def _extract_image_size(row: Dict) -> Tuple[int, int] | None:
|
|
"""Try to extract (width, height) from the image column returned by Datasets Server."""
|
|
for col in ("image", "img", "picture", "photo"):
|
|
img = row.get(col)
|
|
if isinstance(img, dict):
|
|
w = img.get("width")
|
|
h = img.get("height")
|
|
if isinstance(w, (int, float)) and isinstance(h, (int, float)):
|
|
return (int(w), int(h))
|
|
return None
|
|
|
|
|
|
def _update_object_range(stats: Dict[str, Any], count: int) -> None:
    """Fold one sample's object count into the running min/max kept in ``stats``."""
    stats["max_objects"] = max(stats["max_objects"], count)
    stats["min_objects"] = min(stats["min_objects"], count)


def analyze_annotations(sample_rows: List[Dict], annotation_cols: List[str]) -> Dict[str, Any]:
    """Summarise the annotation structure found in the sampled rows.

    Only the first entry of ``annotation_cols`` is inspected. Each row is
    expected to be a dict with a ``"row"`` key holding the column values.
    Supported annotation shapes:
    - a dict with a ``bbox``/``bboxes`` entry and optional category entries;
    - a list of annotation dicts (each may have ``bbox`` and a category key);
    - a list of boxes (list of lists);
    - a single flat box of four numbers.

    Args:
        sample_rows: Rows as returned in the Datasets Server ``rows`` list.
        annotation_cols: Candidate annotation column names.

    Returns:
        ``{"found": False}`` when ``annotation_cols`` is empty; otherwise a
        dict with the column name, per-sample structures, detected bbox
        formats, sorted unique categories, object-count statistics,
        ``num_classes`` and (when any box was seen) ``primary_bbox_format``.
    """
    if not annotation_cols:
        return {"found": False}

    from collections import Counter

    annotation_col = annotation_cols[0]
    annotations_info: Dict[str, Any] = {
        "found": True,
        "column": annotation_col,
        "sample_structures": [],
        "bbox_formats": [],
        "categories_found": [],
        "avg_objects_per_image": 0,
        "max_objects": 0,
        "min_objects": float('inf'),
    }

    total_objects = 0
    valid_samples = 0

    for row in sample_rows:
        ann = row["row"].get(annotation_col)
        if not ann:
            continue

        valid_samples += 1
        image_size = _extract_image_size(row["row"])

        if isinstance(ann, dict):
            # COCO-style or structured annotation
            sample_structure = {
                "type": "dict",
                "keys": list(ann.keys())
            }

            # Check for bounding boxes
            if "bbox" in ann or "bboxes" in ann:
                bbox_key = "bbox" if "bbox" in ann else "bboxes"
                bboxes = ann[bbox_key]
                if isinstance(bboxes, list) and len(bboxes) > 0:
                    if isinstance(bboxes[0], list):
                        # Multiple bboxes; only the first one is format-checked.
                        num_objects = len(bboxes)
                        first_bbox = bboxes[0]
                    else:
                        # Single flat bbox
                        num_objects = 1
                        first_bbox = bboxes
                    total_objects += num_objects
                    _update_object_range(annotations_info, num_objects)
                    annotations_info["bbox_formats"].append(detect_bbox_format(first_bbox, image_size))

            # Check for categories/classes
            for key in ["category", "categories", "label", "labels", "class", "classes", "category_id"]:
                if key in ann:
                    cats = ann[key]
                    if isinstance(cats, list):
                        annotations_info["categories_found"].extend([str(c) for c in cats])
                    else:
                        annotations_info["categories_found"].append(str(cats))

            annotations_info["sample_structures"].append(sample_structure)

        elif isinstance(ann, list):
            # ann is non-empty here (empty values were skipped above).
            first_ann = ann[0]
            sample_structure = {
                "type": "list",
                "length": len(ann),
                "item_type": type(first_ann).__name__
            }

            num_objects = len(ann)
            first_bbox = None

            if isinstance(first_ann, dict):
                # List of annotation dicts
                sample_structure["item_keys"] = list(first_ann.keys())
                if "bbox" in first_ann:
                    first_bbox = first_ann["bbox"]

                for key in ["category", "label", "class", "category_id"]:
                    if key in first_ann:
                        for item in ann:
                            # Guard against heterogeneous lists.
                            if isinstance(item, dict) and key in item:
                                annotations_info["categories_found"].append(str(item[key]))
            elif isinstance(first_ann, list):
                # The column holds the boxes directly (list of lists).
                first_bbox = first_ann
            elif len(ann) == 4 and all(isinstance(v, (int, float)) for v in ann):
                # A flat list of four numbers is one box, not four objects.
                num_objects = 1
                first_bbox = ann

            total_objects += num_objects
            _update_object_range(annotations_info, num_objects)

            if isinstance(first_bbox, (list, tuple)):
                annotations_info["bbox_formats"].append(detect_bbox_format(first_bbox, image_size))

            annotations_info["sample_structures"].append(sample_structure)

    if valid_samples > 0:
        annotations_info["avg_objects_per_image"] = round(total_objects / valid_samples, 2)

    # No object was ever counted: report 0 instead of the inf sentinel.
    if annotations_info["min_objects"] == float('inf'):
        annotations_info["min_objects"] = 0

    # Unique categories, sorted so repeated runs print the same order.
    annotations_info["categories_found"] = sorted(set(annotations_info["categories_found"]))
    annotations_info["num_classes"] = len(annotations_info["categories_found"])

    # Most common bbox format across samples
    if annotations_info["bbox_formats"]:
        format_counts = Counter(annotations_info["bbox_formats"])
        annotations_info["primary_bbox_format"] = format_counts.most_common(1)[0][0]

    return annotations_info
|
|
|
|
|
|
def check_image_classification_compatibility(columns: List[str], sample_rows: List[Dict], features: List[Dict]) -> Dict[str, Any]:
    """Check image classification dataset compatibility.

    A dataset is considered ready when it has at least one image-like column
    and at least one label-like column (matched by name substring).

    Args:
        columns: Column names of the dataset.
        sample_rows: Sampled rows; each is a dict with a ``"row"`` key.
        features: Feature descriptors (dicts with ``name`` and ``type``) as
            returned alongside the rows; may be empty.

    Returns:
        A dict with ``ready``, ``has_image``, ``image_columns``, ``has_label``,
        ``label_columns`` and ``label_info``. ``label_info`` carries the label
        type and class names when the feature is a ``ClassLabel``; otherwise
        it carries the unique label values seen in the sample.
    """
    image_cols = find_columns(columns, ["image", "img", "picture", "photo"])
    has_image = len(image_cols) > 0

    label_cols = find_columns(columns, ["label", "labels", "class", "fine_label", "coarse_label"])
    has_label = len(label_cols) > 0

    label_info: Dict[str, Any] = {"found": has_label}

    if has_label:
        label_col = label_cols[0]
        label_info["column"] = label_col

        integer_dtypes = ("int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64")

        # Detect whether label is ClassLabel (int with names) or plain int/string
        for f in features:
            if f.get("name") != label_col:
                continue
            ftype = f.get("type", "")
            if isinstance(ftype, dict):
                if ftype.get("_type") == "ClassLabel":
                    label_info["type"] = "ClassLabel"
                    names = ftype.get("names") or []
                    label_info["num_classes"] = len(names)
                    label_info["class_names"] = names[:20]
                    if len(names) > 20:
                        label_info["class_names_truncated"] = True
                elif ftype.get("dtype") in integer_dtypes:
                    label_info["type"] = "int"
                elif ftype.get("dtype") == "string":
                    label_info["type"] = "string"
            break

        # Discover unique labels from samples if ClassLabel info wasn't in features
        if "num_classes" not in label_info:
            unique = set()
            for row in sample_rows:
                val = row["row"].get(label_col)
                if val is None:
                    continue
                # Multi-label rows arrive as lists; count each member.
                members = val if isinstance(val, list) else [val]
                for member in members:
                    if member is None:
                        continue
                    try:
                        unique.add(member)
                    except TypeError:
                        # Unhashable value (nested list/dict): track its text form.
                        unique.add(str(member))
            label_info["sample_unique_labels"] = sorted(unique, key=str)[:20]
            label_info["sample_unique_count"] = len(unique)

    ready = has_image and has_label
    return {
        "ready": ready,
        "has_image": has_image,
        "image_columns": image_cols,
        "has_label": has_label,
        "label_columns": label_cols,
        "label_info": label_info,
    }
|
|
|
|
|
|
def check_object_detection_compatibility(columns: List[str], sample_rows: List[Dict]) -> Dict[str, Any]:
    """Check object detection dataset compatibility.

    The dataset is considered ready when it has an image-like column and
    either an annotation-like column or both a bbox-like and a category-like
    column. All column matching is by name substring.

    Args:
        columns: Column names of the dataset.
        sample_rows: Sampled rows passed through to ``analyze_annotations``.

    Returns:
        A dict with ``ready``, the matched column lists, and
        ``annotations_info`` (the result of ``analyze_annotations``, or
        ``{"found": False}`` when no annotation column matched).
    """
    # Find image column
    image_cols = find_columns(columns, ["image", "img", "picture", "photo"])
    has_image = len(image_cols) > 0

    # Find annotation columns
    annotation_cols = find_columns(columns, ["objects", "annotations", "ann", "bbox", "bboxes", "detection"])
    has_annotations = len(annotation_cols) > 0

    # Analyze annotations
    annotations_info = analyze_annotations(sample_rows, annotation_cols) if has_annotations else {"found": False}

    # Check for separate bbox and category columns
    bbox_cols = find_columns(columns, ["bbox", "bboxes", "boxes"])
    category_cols = find_columns(columns, ["category", "label", "class", "categories", "labels", "classes"])

    # Determine readiness
    ready = has_image and (has_annotations or (len(bbox_cols) > 0 and len(category_cols) > 0))

    return {
        "ready": ready,
        "has_image": has_image,
        "image_columns": image_cols,
        "has_annotations": has_annotations,
        "annotation_columns": annotation_cols,
        "separate_bbox_columns": bbox_cols,
        "separate_category_columns": category_cols,
        "annotations_info": annotations_info,
    }
|
|
|
|
|
|
def check_sam_segmentation_compatibility(columns: List[str], sample_rows: List[Dict], features: List[Dict]) -> Dict[str, Any]:
    """Check SAM/SAM2 segmentation dataset compatibility.

    A valid SAM segmentation dataset needs:
    - An image column
    - A mask column (binary ground-truth segmentation mask)
    - A prompt: either a bbox prompt or point prompt (in a JSON prompt column, or dedicated columns)

    Prompt sources are tried in this order: a ``prompt``-like column whose
    value is a dict or a JSON-encoded dict, then a dedicated bbox column, then
    a dedicated point column. Dedicated columns are matched by exact name.

    Args:
        columns: Column names of the dataset.
        sample_rows: Sampled rows; each is a dict with a ``"row"`` key.
        features: Accepted for signature parity with the other checks; not
            read by this function.

    Returns:
        A dict with ``ready``, the matched column lists and ``prompt_info``
        (``has_prompt``, ``prompt_type``, ``source``, ``bbox_valid``).
    """
    image_cols = find_columns(columns, ["image", "img", "picture", "photo"])
    has_image = len(image_cols) > 0

    mask_cols = find_columns(columns, ["mask", "segmentation", "alpha", "matte"])
    has_mask = len(mask_cols) > 0

    prompt_cols = find_columns(columns, ["prompt"])
    bbox_cols = [c for c in columns if c in ("bbox", "bboxes", "box", "boxes")]
    point_cols = [c for c in columns if c in ("point", "points", "input_point", "input_points")]

    prompt_info: Dict[str, Any] = {
        "has_prompt": False,
        "prompt_type": None,
        "source": None,
        "bbox_valid": None,
    }

    # Try JSON prompt column first; stop at the first row that yields a prompt.
    if prompt_cols:
        for row in sample_rows:
            raw = row["row"].get(prompt_cols[0])
            if raw is None:
                continue
            parsed = raw if isinstance(raw, dict) else _try_json(raw)
            if not isinstance(parsed, dict):
                continue

            if "bbox" in parsed or "box" in parsed:
                prompt_info["has_prompt"] = True
                prompt_info["prompt_type"] = "bbox"
                prompt_info["source"] = f"JSON column '{prompt_cols[0]}'"
                bbox = parsed.get("bbox") or parsed.get("box")
                prompt_info["bbox_valid"] = _validate_bbox(bbox, _extract_image_size(row["row"]))
                break
            elif "point" in parsed or "points" in parsed:
                prompt_info["has_prompt"] = True
                prompt_info["prompt_type"] = "point"
                prompt_info["source"] = f"JSON column '{prompt_cols[0]}'"
                break

    if not prompt_info["has_prompt"] and bbox_cols:
        prompt_info["has_prompt"] = True
        prompt_info["prompt_type"] = "bbox"
        prompt_info["source"] = f"column '{bbox_cols[0]}'"
        # Validate the first non-null bbox found in the sample.
        for row in sample_rows:
            bbox = row["row"].get(bbox_cols[0])
            if bbox is not None:
                prompt_info["bbox_valid"] = _validate_bbox(bbox, _extract_image_size(row["row"]))
                break

    if not prompt_info["has_prompt"] and point_cols:
        prompt_info["has_prompt"] = True
        prompt_info["prompt_type"] = "point"
        prompt_info["source"] = f"column '{point_cols[0]}'"

    ready = has_image and has_mask and prompt_info["has_prompt"]

    return {
        "ready": ready,
        "has_image": has_image,
        "image_columns": image_cols,
        "has_mask": has_mask,
        "mask_columns": mask_cols,
        "prompt_columns": prompt_cols,
        "bbox_columns": bbox_cols,
        "point_columns": point_cols,
        "prompt_info": prompt_info,
    }
|
|
|
|
|
|
def _try_json(value) -> Any:
    """Decode ``value`` as JSON, returning ``None`` if it is not a string or not valid JSON."""
    if not isinstance(value, str):
        return None
    try:
        return json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return None
|
|
|
|
|
|
def _validate_bbox(bbox, image_size=None) -> Dict[str, Any]:
    """Validate a single bounding box and return diagnostics.

    Args:
        bbox: Candidate box; must be a list or tuple of four finite numbers.
        image_size: Optional ``(width, height)`` used for a loose bounds check.

    Returns:
        A dict that always has ``valid``. Invalid boxes carry ``error``.
        Valid boxes carry ``values`` (as floats) and ``format_hint``
        (``"likely xyxy"`` when the last two values exceed the first two,
        else ``"likely xywh"``), plus ``warning`` when any value exceeds
        1.5x the larger image dimension.
    """
    result: Dict[str, Any] = {"valid": False}
    if not isinstance(bbox, (list, tuple)):
        result["error"] = "bbox is not a list"
        return result
    if len(bbox) != 4:
        result["error"] = f"expected 4 values, got {len(bbox)}"
        return result
    try:
        vals = [float(v) for v in bbox]
    except (TypeError, ValueError):
        result["error"] = "non-numeric values"
        return result

    if not all(math.isfinite(v) for v in vals):
        result["error"] = "contains non-finite values"
        return result

    x0, y0, x1, y1 = vals
    if x1 <= x0 or y1 <= y0:
        # Not a valid corner pair; it can still be x/y/width/height when the
        # last two values are positive extents.
        if vals[2] > 0 and vals[3] > 0:
            result["format_hint"] = "likely xywh"
        else:
            result["error"] = "degenerate bbox (zero or negative area)"
            return result
    else:
        result["format_hint"] = "likely xyxy"

    if image_size is not None:
        img_w, img_h = image_size
        if any(v > max(img_w, img_h) * 1.5 for v in vals):
            result["warning"] = "coordinates exceed image bounds"

    result["valid"] = True
    result["values"] = vals
    return result
|
|
|
|
|
|
def generate_mapping_code(info: Dict[str, Any]) -> str:
|
|
"""Generate mapping code if needed"""
|
|
if info["ready"]:
|
|
ann_info = info["annotations_info"]
|
|
if not ann_info.get("found"):
|
|
return None
|
|
|
|
# Check if format conversion is needed
|
|
ann_col = ann_info.get("column")
|
|
bbox_format = ann_info.get("primary_bbox_format", "unknown")
|
|
|
|
if "coco" in bbox_format.lower() or "xywh" in bbox_format.lower():
|
|
# Already COCO format
|
|
return f"""# Dataset appears to be in COCO format (xywh)
|
|
# Image column: {info['image_columns'][0] if info['image_columns'] else 'image'}
|
|
# Annotation column: {ann_col}
|
|
# Use directly with transformers object detection models"""
|
|
elif "xyxy" in bbox_format.lower():
|
|
# Need to convert from XYXY to XYWH
|
|
return f"""# Convert from XYXY (Pascal VOC) to XYWH (COCO) format
|
|
def convert_to_coco_format(example):
|
|
annotations = example['{ann_col}']
|
|
if isinstance(annotations, list):
|
|
for ann in annotations:
|
|
if 'bbox' in ann:
|
|
x_min, y_min, x_max, y_max = ann['bbox']
|
|
ann['bbox'] = [x_min, y_min, x_max - x_min, y_max - y_min]
|
|
elif isinstance(annotations, dict) and 'bbox' in annotations:
|
|
bbox = annotations['bbox']
|
|
if isinstance(bbox, list) and len(bbox) > 0 and isinstance(bbox[0], list):
|
|
for i, box in enumerate(bbox):
|
|
x_min, y_min, x_max, y_max = box
|
|
bbox[i] = [x_min, y_min, x_max - x_min, y_max - y_min]
|
|
return example
|
|
|
|
dataset = dataset.map(convert_to_coco_format)"""
|
|
|
|
elif not info["ready"]:
|
|
# Need to create annotations structure
|
|
if info["separate_bbox_columns"] and info["separate_category_columns"]:
|
|
bbox_col = info["separate_bbox_columns"][0]
|
|
cat_col = info["separate_category_columns"][0]
|
|
|
|
return f"""# Combine separate bbox and category columns
|
|
def create_annotations(example):
|
|
bboxes = example['{bbox_col}']
|
|
categories = example['{cat_col}']
|
|
|
|
if not isinstance(bboxes, list):
|
|
bboxes = [bboxes]
|
|
if not isinstance(categories, list):
|
|
categories = [categories]
|
|
|
|
annotations = []
|
|
for bbox, cat in zip(bboxes, categories):
|
|
annotations.append({{'bbox': bbox, 'category': cat}})
|
|
|
|
example['objects'] = annotations
|
|
return example
|
|
|
|
dataset = dataset.map(create_annotations)"""
|
|
|
|
return None
|
|
|
|
|
|
def format_value_preview(value: Any, max_chars: int) -> str:
    """Format a cell value as a short, single-line preview.

    Strings and generic values are truncated to ``max_chars`` with a trailing
    ``"..."``. Dicts are summarised by key count and their first five keys.
    Lists whose first item is a dict or list are summarised by length and
    first item; other lists are rendered with ``str`` and truncated.
    """
    if value is None:
        return "None"
    elif isinstance(value, str):
        return value[:max_chars] + ("..." if len(value) > max_chars else "")
    elif isinstance(value, dict):
        keys = list(value.keys())
        return f"{{dict with {len(keys)} keys: {', '.join(keys[:5])}}}"
    elif isinstance(value, list):
        if len(value) == 0:
            return "[]"
        elif isinstance(value[0], dict):
            return f"[{len(value)} items] First item keys: {list(value[0].keys())}"
        elif isinstance(value[0], list):
            return f"[{len(value)} items] First item: {value[0]}"
        else:
            preview = str(value)
            return preview[:max_chars] + ("..." if len(preview) > max_chars else "")
    else:
        preview = str(value)
        return preview[:max_chars] + ("..." if len(preview) > max_chars else "")
|
|
|
|
|
|
def main():
    """Inspect one dataset split and print a compatibility report.

    Fetches the split listing and a few sample rows from the Datasets Server,
    runs the image-classification, object-detection and SAM-segmentation
    checks, then prints either a JSON document (``--json-output``) or a
    human-readable report. Always terminates the process via ``sys.exit``:
    status 0 on success, 1 when the dataset, split or rows cannot be fetched.
    """
    args = parse_args()

    # In JSON mode stdout must contain only the JSON document, so progress
    # messages are diverted to stderr.
    status_stream = sys.stderr if args.json_output else sys.stdout

    print(f"Fetching dataset info via Datasets Server API...", file=status_stream)

    try:
        # Get splits info
        splits_data = get_splits(args.dataset)
        if not splits_data or "splits" not in splits_data:
            print(f"ERROR: Could not fetch splits for dataset '{args.dataset}'")
            print(f" Dataset may not exist or is not accessible via Datasets Server API")
            sys.exit(1)

        # Collect configs in API order (de-duplicated) and remember which
        # ones actually provide the requested split.
        available_configs: List[str] = []
        configs_with_split: List[str] = []
        split_found = False

        for split_info in splits_data["splits"]:
            config_name = split_info["config"]
            if config_name not in available_configs:
                available_configs.append(config_name)
            if split_info["split"] == args.split:
                if config_name == args.config:
                    split_found = True
                elif config_name not in configs_with_split:
                    configs_with_split.append(config_name)

        # If the requested config/split pair does not exist, fall back to the
        # first config that has the split, else to the first listed config.
        config_to_use = args.config
        if not split_found and available_configs:
            config_to_use = configs_with_split[0] if configs_with_split else available_configs[0]
            print(f"Config '{args.config}' not found, trying '{config_to_use}'...", file=status_stream)

        # Get rows
        rows_data = get_rows(args.dataset, config_to_use, args.split, offset=0, length=args.samples)

        if not rows_data or "rows" not in rows_data:
            print(f"ERROR: Could not fetch rows for dataset '{args.dataset}'")
            print(f" Split '{args.split}' may not exist")
            print(f" Available configs: {', '.join(sorted(available_configs))}")
            sys.exit(1)

        rows = rows_data["rows"]
        if not rows:
            print(f"ERROR: No rows found in split '{args.split}'")
            sys.exit(1)

        # Extract column info from first row
        first_row = rows[0]["row"]
        columns = list(first_row.keys())
        features = rows_data.get("features", [])

        # Get total count if available
        total_examples = "Unknown"
        for split_info in splits_data["splits"]:
            if split_info["config"] == config_to_use and split_info["split"] == args.split:
                num_examples = split_info.get('num_examples')
                total_examples = f"{num_examples:,}" if isinstance(num_examples, int) else "Unknown"
                break

    except Exception as e:
        print(f"ERROR: {str(e)}")
        sys.exit(1)

    # Run compatibility checks
    od_info = check_object_detection_compatibility(columns, rows)
    ic_info = check_image_classification_compatibility(columns, rows, features)
    sam_info = check_sam_segmentation_compatibility(columns, rows, features)

    # JSON output mode
    if args.json_output:
        result = {
            "dataset": args.dataset,
            "config": config_to_use,
            "split": args.split,
            "total_examples": total_examples,
            "columns": columns,
            "features": [{"name": f["name"], "type": f["type"]} for f in features] if features else [],
            "object_detection_compatibility": od_info,
            "image_classification_compatibility": ic_info,
            "sam_segmentation_compatibility": sam_info,
        }
        print(json.dumps(result, indent=2))
        sys.exit(0)

    # Human-readable output optimized for LLM parsing
    print("=" * 80)
    print(f"VISION DATASET INSPECTION")
    print("=" * 80)

    print(f"\nDataset: {args.dataset}")
    print(f"Config: {config_to_use}")
    print(f"Split: {args.split}")
    print(f"Total examples: {total_examples}")
    print(f"Samples fetched: {len(rows)}")

    print(f"\n{'COLUMNS':-<80}")
    if features:
        for feature in features:
            print(f" {feature['name']}: {feature['type']}")
    else:
        for col in columns:
            print(f" {col}: (type info not available)")

    print(f"\n{'EXAMPLE DATA':-<80}")
    example = first_row
    for col in columns:
        value = example.get(col)
        display = format_value_preview(value, args.preview)
        print(f"\n{col}:")
        print(f" {display}")

    # --- Image Classification ---
    print(f"\n{'IMAGE CLASSIFICATION COMPATIBILITY':-<80}")
    print(f"\n[STATUS] {'✓ READY' if ic_info['ready'] else '✗ NOT COMPATIBLE'}")

    print(f"\nImage Column:")
    if ic_info["has_image"]:
        print(f" ✓ Found: {', '.join(ic_info['image_columns'])}")
    else:
        print(f" ✗ No image column detected")

    print(f"\nLabel Column:")
    if ic_info["has_label"]:
        print(f" ✓ Found: {', '.join(ic_info['label_columns'])}")
        li = ic_info["label_info"]
        if li.get("type"):
            print(f" • Type: {li['type']}")
        if li.get("num_classes"):
            print(f" • Number of Classes: {li['num_classes']}")
        if li.get("class_names"):
            names = li["class_names"]
            display = ", ".join(str(n) for n in names[:10])
            if len(names) > 10:
                display += f" ... ({li['num_classes']} total)"
            print(f" • Classes: {display}")
        elif li.get("sample_unique_labels"):
            labels = li["sample_unique_labels"]
            display = ", ".join(str(l) for l in labels[:10])
            if li.get("sample_unique_count", 0) > 10:
                display += f" ... ({li['sample_unique_count']}+ from sample)"
            print(f" • Sample labels: {display}")
    else:
        print(f" ✗ No label column detected")
        print(f" Expected column names: 'label', 'labels', 'class', 'fine_label'")

    if ic_info["ready"]:
        lc = ic_info["label_info"].get("column", "label")
        print(f"\n Use with: scripts/image_classification_training.py")
        print(f" --image_column_name {ic_info['image_columns'][0]} --label_column_name {lc}")

    # --- Object Detection ---
    print(f"\n{'OBJECT DETECTION COMPATIBILITY':-<80}")
    print(f"\n[STATUS] {'✓ READY' if od_info['ready'] else '✗ NOT COMPATIBLE'}")

    print(f"\nImage Column:")
    if od_info["has_image"]:
        print(f" ✓ Found: {', '.join(od_info['image_columns'])}")
    else:
        print(f" ✗ No image column detected")
        print(f" Expected column names: 'image', 'img', 'picture', 'photo'")

    print(f"\nAnnotations:")
    if od_info["has_annotations"]:
        print(f" ✓ Found: {', '.join(od_info['annotation_columns'])}")
        ann_info = od_info["annotations_info"]
        if ann_info.get("found"):
            print(f"\n Annotation Details:")
            print(f" • Column: {ann_info['column']}")
            if ann_info.get("primary_bbox_format"):
                print(f" • BBox Format: {ann_info['primary_bbox_format']}")
            if ann_info.get("num_classes", 0) > 0:
                print(f" • Number of Classes: {ann_info['num_classes']}")
                print(f" • Classes: {', '.join(ann_info['categories_found'][:10])}")
                if len(ann_info['categories_found']) > 10:
                    print(f" (showing first 10 of {len(ann_info['categories_found'])})")
            print(f" • Avg Objects/Image: {ann_info['avg_objects_per_image']}")
            print(f" • Min Objects: {ann_info['min_objects']}")
            print(f" • Max Objects: {ann_info['max_objects']}")
    elif od_info["separate_bbox_columns"] and od_info["separate_category_columns"]:
        print(f" ⚠ Separate bbox and category columns found:")
        print(f" BBox columns: {', '.join(od_info['separate_bbox_columns'])}")
        print(f" Category columns: {', '.join(od_info['separate_category_columns'])}")
        print(f" Action: These need to be combined (see mapping code below)")
    else:
        print(f" ✗ No annotation columns detected")
        print(f" Expected: 'objects', 'annotations', 'bbox'/'bboxes' + 'category'/'label'")

    # --- SAM Segmentation ---
    print(f"\n{'SAM SEGMENTATION COMPATIBILITY':-<80}")
    print(f"\n[STATUS] {'✓ READY' if sam_info['ready'] else '✗ NOT COMPATIBLE'}")

    print(f"\nImage Column:")
    if sam_info["has_image"]:
        print(f" ✓ Found: {', '.join(sam_info['image_columns'])}")
    else:
        print(f" ✗ No image column detected")

    print(f"\nMask Column:")
    if sam_info["has_mask"]:
        print(f" ✓ Found: {', '.join(sam_info['mask_columns'])}")
    else:
        print(f" ✗ No mask column detected")
        print(f" Expected column names: 'mask', 'segmentation', 'alpha', 'matte'")

    print(f"\nPrompt:")
    pi = sam_info["prompt_info"]
    if pi["has_prompt"]:
        print(f" ✓ Type: {pi['prompt_type']} (from {pi['source']})")
        if pi.get("bbox_valid"):
            bv = pi["bbox_valid"]
            if bv["valid"]:
                print(f" • BBox values: {bv.get('values')}")
                if bv.get("format_hint"):
                    print(f" • Format: {bv['format_hint']}")
                if bv.get("warning"):
                    print(f" ⚠ {bv['warning']}")
            else:
                print(f" ✗ Invalid bbox: {bv.get('error', 'unknown error')}")
    else:
        print(f" ✗ No prompt detected")
        print(f" Expected: 'prompt' column (JSON with bbox/point), or 'bbox'/'point' column")

    if sam_info["ready"]:
        pc = sam_info["prompt_columns"][0] if sam_info["prompt_columns"] else None
        args_hint = f"--prompt_type {pi['prompt_type']}"
        if pc:
            args_hint += f" --prompt_column_name {pc}"
        print(f"\n Use with: scripts/sam_segmentation_training.py")
        print(f" {args_hint}")

    # Mapping code (OD only)
    mapping_code = generate_mapping_code(od_info)

    if mapping_code:
        print(f"\n{'OD PREPROCESSING CODE':-<80}")
        print(mapping_code)
    elif od_info["ready"]:
        print(f"\n ✓ No OD preprocessing needed.")

    # --- Summary ---
    print(f"\n{'SUMMARY':-<80}")
    if ic_info["ready"]:
        num_cls = ic_info["label_info"].get("num_classes") or ic_info["label_info"].get("sample_unique_count", "?")
        print(f"✓ Image Classification: READY ({num_cls} classes)")
    else:
        print(f"✗ Image Classification: not compatible")

    if od_info["ready"]:
        ann_info = od_info["annotations_info"]
        fmt = ann_info.get("primary_bbox_format", "")
        cls = ann_info.get("num_classes", "?")
        print(f"✓ Object Detection: READY ({cls} classes, {fmt})")
    else:
        print(f"✗ Object Detection: not compatible")

    if sam_info["ready"]:
        print(f"✓ SAM Segmentation: READY (prompt: {pi['prompt_type']})")
    else:
        print(f"✗ SAM Segmentation: not compatible")

    print(f"\nNote: Used Datasets Server API (instant, no download required)")

    print("\n" + "=" * 80)
    sys.exit(0)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. main() normally ends via sys.exit(); the handlers
    # below only cover an interrupt (treated as a clean exit) and any
    # exception that escaped main(), which is reported on stderr.
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(0)
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
|