Files
antigravity-skills-reference/skills/hugging-face-vision-trainer/scripts/image_classification_training.py
sickn33 bdcfbb9625 feat(hugging-face): Add official ecosystem skills
Import the official Hugging Face ecosystem skills and sync the\nexisting local coverage with upstream metadata and assets.\n\nRegenerate the canonical catalog, plugin mirrors, docs, and release\nnotes after the maintainer merge batch so main stays in sync.\n\nFixes #417
2026-03-29 18:31:46 +02:00

384 lines
13 KiB
Python

# /// script
# dependencies = [
# "transformers>=5.2.0",
# "accelerate>=1.1.0",
# "timm",
# "datasets>=4.0",
# "evaluate",
# "scikit-learn",
# "torchvision",
# "trackio",
# "huggingface_hub",
# ]
# ///
"""Fine-tuning any Transformers or timm model supported by AutoModelForImageClassification using the Trainer API."""
import logging
import os
import sys
from dataclasses import dataclass, field
from functools import partial
from typing import Any
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from torchvision.transforms import (
CenterCrop,
Compose,
Normalize,
RandomHorizontalFlip,
RandomResizedCrop,
Resize,
ToTensor,
)
import trackio
import transformers
from transformers import (
AutoConfig,
AutoImageProcessor,
AutoModelForImageClassification,
DefaultDataCollator,
HfArgumentParser,
Trainer,
TrainingArguments,
)
from transformers.trainer import EvalPrediction
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
check_min_version("4.57.0.dev0")
require_version("datasets>=2.0.0")
@dataclass
class DataTrainingArguments:
dataset_name: str = field(
default="ethz/food101",
metadata={"help": "Name of a dataset from the Hub."},
)
dataset_config_name: str | None = field(
default=None,
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
)
train_val_split: float | None = field(
default=0.15,
metadata={"help": "Fraction to split off of train for validation (used only when no validation split exists)."},
)
max_train_samples: int | None = field(
default=None,
metadata={"help": "Truncate training set to this many samples (for debugging / quick tests)."},
)
max_eval_samples: int | None = field(
default=None,
metadata={"help": "Truncate evaluation set to this many samples."},
)
image_column_name: str = field(
default="image",
metadata={"help": "The column name for images in the dataset."},
)
label_column_name: str = field(
default="label",
metadata={"help": "The column name for labels in the dataset."},
)
@dataclass
class ModelArguments:
model_name_or_path: str = field(
default="timm/mobilenetv3_small_100.lamb_in1k",
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models."},
)
config_name: str | None = field(
default=None,
metadata={"help": "Pretrained config name or path if not the same as model_name."},
)
cache_dir: str | None = field(
default=None,
metadata={"help": "Where to store pretrained models downloaded from the Hub."},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (branch, tag, or commit id)."},
)
image_processor_name: str | None = field(
default=None,
metadata={"help": "Name or path of image processor config."},
)
ignore_mismatched_sizes: bool = field(
default=True,
metadata={"help": "Allow loading weights when num_labels differs from pretrained checkpoint."},
)
token: str | None = field(
default=None,
metadata={"help": "Auth token for private models / datasets."},
)
trust_remote_code: bool = field(
default=False,
metadata={"help": "Whether to trust remote code from Hub repos."},
)
def build_transforms(image_processor, is_training: bool):
"""Build torchvision transforms from the image processor's config."""
if hasattr(image_processor, "size"):
size = image_processor.size
if "shortest_edge" in size:
img_size = size["shortest_edge"]
elif "height" in size and "width" in size:
img_size = (size["height"], size["width"])
else:
img_size = 224
else:
img_size = 224
if hasattr(image_processor, "image_mean") and image_processor.image_mean:
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
else:
normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
if is_training:
return Compose([
RandomResizedCrop(img_size),
RandomHorizontalFlip(),
ToTensor(),
normalize,
])
else:
if isinstance(img_size, int):
resize_size = int(img_size / 0.875) # standard 87.5% center crop ratio
else:
resize_size = tuple(int(s / 0.875) for s in img_size)
return Compose([
Resize(resize_size),
CenterCrop(img_size),
ToTensor(),
normalize,
])
def main():
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# --- Hub authentication ---
from huggingface_hub import login
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("hfjob")
if hf_token:
login(token=hf_token)
training_args.hub_token = hf_token
logger.info("Logged in to Hugging Face Hub")
elif training_args.push_to_hub:
logger.warning("HF_TOKEN not found in environment. Hub push will likely fail.")
# --- Trackio ---
trackio.init(project=training_args.output_dir, name=training_args.run_name)
# --- Logging ---
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.warning(
f"Process rank: {training_args.local_process_index}, device: {training_args.device}, "
f"n_gpu: {training_args.n_gpu}, distributed training: "
f"{training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
# --- Load dataset ---
dataset = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
trust_remote_code=model_args.trust_remote_code,
)
# --- Resolve label column ---
label_col = data_args.label_column_name
if label_col not in dataset["train"].column_names:
candidates = [c for c in dataset["train"].column_names if c in ("label", "labels", "class", "fine_label")]
if candidates:
label_col = candidates[0]
logger.info(f"Label column '{data_args.label_column_name}' not found, using '{label_col}'")
else:
raise ValueError(
f"Label column '{data_args.label_column_name}' not found. "
f"Available columns: {dataset['train'].column_names}"
)
# --- Discover labels ---
label_feature = dataset["train"].features[label_col]
if hasattr(label_feature, "names"):
label_names = label_feature.names
else:
unique_labels = sorted(set(dataset["train"][label_col]))
if all(isinstance(l, str) for l in unique_labels):
label_names = unique_labels
else:
label_names = [str(l) for l in unique_labels]
num_labels = len(label_names)
id2label = dict(enumerate(label_names))
label2id = {v: k for k, v in id2label.items()}
logger.info(f"Number of classes: {num_labels}")
# --- Remap string labels to int if needed ---
sample_label = dataset["train"][0][label_col]
if isinstance(sample_label, str):
logger.info("Remapping string labels to integer IDs")
for split_name in list(dataset.keys()):
dataset[split_name] = dataset[split_name].map(
lambda ex: {label_col: label2id[ex[label_col]]},
)
# --- Shuffle + Train/val split ---
dataset["train"] = dataset["train"].shuffle(seed=training_args.seed)
data_args.train_val_split = None if "validation" in dataset else data_args.train_val_split
if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
split = dataset["train"].train_test_split(data_args.train_val_split, seed=training_args.seed)
dataset["train"] = split["train"]
dataset["validation"] = split["test"]
# --- Truncate ---
if data_args.max_train_samples is not None:
max_train = min(data_args.max_train_samples, len(dataset["train"]))
dataset["train"] = dataset["train"].select(range(max_train))
logger.info(f"Truncated training set to {max_train} samples")
if data_args.max_eval_samples is not None and "validation" in dataset:
max_eval = min(data_args.max_eval_samples, len(dataset["validation"]))
dataset["validation"] = dataset["validation"].select(range(max_eval))
logger.info(f"Truncated validation set to {max_eval} samples")
# --- Load model & image processor ---
common_pretrained_args = {
"cache_dir": model_args.cache_dir,
"revision": model_args.model_revision,
"token": model_args.token,
"trust_remote_code": model_args.trust_remote_code,
}
config = AutoConfig.from_pretrained(
model_args.config_name or model_args.model_name_or_path,
num_labels=num_labels,
label2id=label2id,
id2label=id2label,
**common_pretrained_args,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
config=config,
ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
**common_pretrained_args,
)
image_processor = AutoImageProcessor.from_pretrained(
model_args.image_processor_name or model_args.model_name_or_path,
**common_pretrained_args,
)
# --- Build transforms ---
train_transforms = build_transforms(image_processor, is_training=True)
val_transforms = build_transforms(image_processor, is_training=False)
image_col = data_args.image_column_name
def preprocess_train(examples):
return {
"pixel_values": [train_transforms(img.convert("RGB")) for img in examples[image_col]],
"labels": examples[label_col],
}
def preprocess_val(examples):
return {
"pixel_values": [val_transforms(img.convert("RGB")) for img in examples[image_col]],
"labels": examples[label_col],
}
dataset["train"].set_transform(preprocess_train)
if "validation" in dataset:
dataset["validation"].set_transform(preprocess_val)
if "test" in dataset:
dataset["test"].set_transform(preprocess_val)
# --- Metrics ---
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_pred: EvalPrediction):
predictions = np.argmax(eval_pred.predictions, axis=1)
return accuracy_metric.compute(predictions=predictions, references=eval_pred.label_ids)
# --- Trainer ---
eval_dataset = None
if training_args.do_eval:
if "validation" in dataset:
eval_dataset = dataset["validation"]
elif "test" in dataset:
eval_dataset = dataset["test"]
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset["train"] if training_args.do_train else None,
eval_dataset=eval_dataset,
processing_class=image_processor,
data_collator=DefaultDataCollator(),
compute_metrics=compute_metrics,
)
# --- Train ---
if training_args.do_train:
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
# --- Evaluate ---
if training_args.do_eval:
test_dataset = dataset.get("test", dataset.get("validation"))
test_prefix = "test" if "test" in dataset else "eval"
if test_dataset is not None:
metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix=test_prefix)
trainer.log_metrics(test_prefix, metrics)
trainer.save_metrics(test_prefix, metrics)
trackio.finish()
# --- Push to Hub ---
kwargs = {
"finetuned_from": model_args.model_name_or_path,
"dataset": data_args.dataset_name,
"tags": ["image-classification", "vision"],
}
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
if __name__ == "__main__":
main()