Files
antigravity-skills-reference/skills/hugging-face-vision-trainer/scripts/sam_segmentation_training.py
sickn33 bdcfbb9625 feat(hugging-face): Add official ecosystem skills
Import the official Hugging Face ecosystem skills and sync the existing local coverage with upstream metadata and assets.

Regenerate the canonical catalog, plugin mirrors, docs, and release notes after the maintainer merge batch so main stays in sync.

Fixes #417
2026-03-29 18:31:46 +02:00

383 lines
14 KiB
Python

# /// script
# dependencies = [
# "transformers>=5.2.0",
# "accelerate>=1.1.0",
# "datasets>=4.0",
# "torchvision",
# "monai",
# "trackio",
# "huggingface_hub",
# ]
# ///
"""Fine-tune SAM or SAM2 for segmentation using bounding-box or point prompts with the HF Trainer API."""
import json
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Any
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from torch.utils.data import Dataset
import monai
import trackio
import transformers
from transformers import (
HfArgumentParser,
Trainer,
TrainingArguments,
)
from transformers.utils import check_min_version
# Module-level logger; its level and the root handler are configured in main().
logger = logging.getLogger(__name__)
# Minimum-version check for the installed transformers package.
# NOTE(review): the inline script metadata at the top of the file requires
# transformers>=5.2.0, which is stricter than this floor — confirm whether the
# two should be aligned.
check_min_version("4.57.0.dev0")
# ---------------------------------------------------------------------------
# Dataset wrapper
# ---------------------------------------------------------------------------
class SAMSegmentationDataset(Dataset):
"""Wraps a HF dataset into the format expected by SAM/SAM2 processors.
Each sample must contain an image, a binary mask, and a prompt (bbox or
point). Prompts are read from a JSON-encoded ``prompt`` column or from
dedicated ``bbox`` / ``point`` columns.
"""
def __init__(self, dataset, processor, prompt_type: str,
image_col: str, mask_col: str, prompt_col: str | None,
bbox_col: str | None, point_col: str | None):
self.dataset = dataset
self.processor = processor
self.prompt_type = prompt_type
self.image_col = image_col
self.mask_col = mask_col
self.prompt_col = prompt_col
self.bbox_col = bbox_col
self.point_col = point_col
def __len__(self):
return len(self.dataset)
def _extract_prompt(self, item):
if self.prompt_col and self.prompt_col in item:
raw = item[self.prompt_col]
parsed = json.loads(raw) if isinstance(raw, str) else raw
if self.prompt_type == "bbox":
return parsed.get("bbox") or parsed.get("box")
return parsed.get("point") or parsed.get("points")
if self.prompt_type == "bbox" and self.bbox_col:
return item[self.bbox_col]
if self.prompt_type == "point" and self.point_col:
return item[self.point_col]
raise ValueError("Could not extract prompt from sample")
def __getitem__(self, idx):
item = self.dataset[idx]
image = item[self.image_col]
prompt = self._extract_prompt(item)
if self.prompt_type == "bbox":
inputs = self.processor(image, input_boxes=[[prompt]], return_tensors="pt")
else:
if isinstance(prompt[0], (int, float)):
prompt = [prompt]
inputs = self.processor(image, input_points=[[prompt]], return_tensors="pt")
mask = np.array(item[self.mask_col])
if mask.ndim == 3:
mask = mask[:, :, 0]
inputs["labels"] = (mask > 0).astype(np.float32)
inputs["original_image_size"] = torch.tensor(image.size[::-1])
return inputs
def collate_fn(batch, mask_size=(256, 256)):
    """Collate per-sample processor outputs into one model-ready batch.

    Args:
        batch: Non-empty list of mappings as produced by
            ``SAMSegmentationDataset.__getitem__``. Each must contain
            ``pixel_values``, ``original_sizes``, ``original_image_size`` and
            ``labels``; ``input_boxes``, ``input_points`` and ``input_labels``
            are optional and detected from the first sample.
        mask_size: ``(height, width)`` the ground-truth masks are resized to.
            The default of 256x256 is the value previously hard-coded here;
            the labels are compared directly against ``outputs.pred_masks`` in
            ``compute_loss``, so this must match the model's predicted mask
            resolution. Bind another size with ``functools.partial`` when
            passing this function as a ``data_collator``.

    Returns:
        Dict with concatenated/stacked tensors, integer ``labels`` of shape
        ``(len(batch), 1, *mask_size)`` and ``multimask_output=False``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")
    first = batch[0]
    target_size = tuple(mask_size)

    # Nearest-neighbour resizing keeps the masks binary.
    resized_masks = [
        F.interpolate(
            torch.as_tensor(sample["labels"]).unsqueeze(0).unsqueeze(0).float(),
            size=target_size,
            mode="nearest",
        )
        for sample in batch
    ]

    result = {
        "pixel_values": torch.cat([sample["pixel_values"] for sample in batch], dim=0),
        "original_sizes": torch.stack([sample["original_sizes"] for sample in batch]),
        "labels": torch.cat(resized_masks, dim=0).long(),
        "original_image_size": torch.stack(
            [sample["original_image_size"] for sample in batch]
        ),
        "multimask_output": False,
    }
    if "input_boxes" in first:
        result["input_boxes"] = torch.cat([sample["input_boxes"] for sample in batch], dim=0)
    if "input_points" in first:
        result["input_points"] = torch.cat([sample["input_points"] for sample in batch], dim=0)
        # Point labels are only forwarded together with point prompts.
        if "input_labels" in first:
            result["input_labels"] = torch.cat(
                [sample["input_labels"] for sample in batch], dim=0
            )
    return result
# ---------------------------------------------------------------------------
# Custom loss (SAM/SAM2 don't compute loss in forward())
# ---------------------------------------------------------------------------
# Shared Dice + cross-entropy criterion; ``sigmoid=True`` means the sigmoid is
# applied by the loss itself rather than by the caller.
seg_loss = monai.losses.DiceCELoss(
    sigmoid=True,
    squared_pred=True,
    reduction="mean",
)


def compute_loss(outputs, labels, num_items_in_batch=None):
    """Return the Dice+CE loss between predicted masks and the targets.

    Args:
        outputs: Model output exposing a ``pred_masks`` tensor.
        labels: Ground-truth masks; cast to float before the loss is applied.
        num_items_in_batch: Accepted for interface compatibility with the
            Trainer's ``compute_loss_func`` hook; not used.
    """
    # Drop the size-1 dimension at index 1 so predictions line up with labels.
    # NOTE(review): presumably pred_masks holds raw logits — confirm.
    mask_predictions = outputs.pred_masks.squeeze(1)
    targets = labels.float()
    return seg_loss(mask_predictions, targets)
# ---------------------------------------------------------------------------
# CLI arguments
# ---------------------------------------------------------------------------
@dataclass
class DataTrainingArguments:
dataset_name: str = field(
default="merve/MicroMat-mini",
metadata={"help": "Hub dataset ID."},
)
dataset_config_name: str | None = field(
default=None,
metadata={"help": "Dataset config name."},
)
train_val_split: float | None = field(
default=0.1,
metadata={"help": "Fraction to split off for validation (used when no validation split exists)."},
)
max_train_samples: int | None = field(
default=None,
metadata={"help": "Truncate training set (for quick tests)."},
)
max_eval_samples: int | None = field(
default=None,
metadata={"help": "Truncate evaluation set."},
)
image_column_name: str = field(
default="image",
metadata={"help": "Column containing PIL images."},
)
mask_column_name: str = field(
default="mask",
metadata={"help": "Column containing ground-truth binary masks."},
)
prompt_column_name: str | None = field(
default="prompt",
metadata={"help": "Column with JSON-encoded prompt (bbox/point). Set to '' to disable."},
)
bbox_column_name: str | None = field(
default=None,
metadata={"help": "Column with bbox prompt ([x0,y0,x1,y1]). Used when prompt_column_name is unset."},
)
point_column_name: str | None = field(
default=None,
metadata={"help": "Column with point prompt ([x,y] or [[x,y],...]). Used when prompt_column_name is unset."},
)
prompt_type: str = field(
default="bbox",
metadata={"help": "Prompt type: 'bbox' or 'point'."},
)
@dataclass
class ModelArguments:
model_name_or_path: str = field(
default="facebook/sam2.1-hiera-small",
metadata={"help": "Pretrained SAM/SAM2 model identifier."},
)
cache_dir: str | None = field(default=None, metadata={"help": "Cache directory."})
model_revision: str = field(default="main", metadata={"help": "Model revision."})
token: str | None = field(default=None, metadata={"help": "Auth token."})
trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code."})
freeze_vision_encoder: bool = field(
default=True,
metadata={"help": "Freeze vision encoder weights."},
)
freeze_prompt_encoder: bool = field(
default=True,
metadata={"help": "Freeze prompt encoder weights."},
)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Parse arguments, prepare data and model, then train and/or evaluate.

    Steps, in order:
      1. Parse ``ModelArguments``, ``DataTrainingArguments`` and
         ``TrainingArguments`` from a single ``.json`` file argument or from
         the command line.
      2. Configure logging, log in to the Hub when a token is found in the
         environment, and start the trackio run.
      3. Load the dataset and make sure a train and an eval split exist.
      4. Load the SAM or SAM2 processor/model and freeze the requested parts.
      5. Run training and/or evaluation through ``Trainer``, then push to the
         Hub or write a model card.
    """
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    parser.set_defaults(per_device_train_batch_size=4, num_train_epochs=30)
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Logging is configured before anything below emits a message; otherwise
    # the login info message is dropped because no handler/level is set yet.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    from huggingface_hub import login

    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("hfjob")
    if hf_token:
        login(token=hf_token)
        training_args.hub_token = hf_token
        logger.info("Logged in to Hugging Face Hub")
    elif training_args.push_to_hub:
        logger.warning("HF_TOKEN not found in environment. Hub push will likely fail.")

    trackio.init(project=training_args.output_dir, name=training_args.run_name)
    logger.info(f"Training/evaluation parameters {training_args}")

    # ---- Load dataset ----
    dataset = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        cache_dir=model_args.cache_dir,
        trust_remote_code=model_args.trust_remote_code,
    )
    val_fraction = data_args.train_val_split or 0.1
    if "train" not in dataset:
        if len(dataset.keys()) == 1:
            only_split = list(dataset.keys())[0]
            dataset[only_split] = dataset[only_split].shuffle(seed=training_args.seed)
            # Seeded like the branch below so the split is reproducible.
            split = dataset[only_split].train_test_split(
                test_size=val_fraction, seed=training_args.seed
            )
            dataset = {"train": split["train"], "validation": split["test"]}
        else:
            raise ValueError(f"No 'train' split found. Available: {list(dataset.keys())}")
    elif "validation" not in dataset and "test" not in dataset:
        dataset["train"] = dataset["train"].shuffle(seed=training_args.seed)
        split = dataset["train"].train_test_split(
            test_size=val_fraction, seed=training_args.seed
        )
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]

    if data_args.max_train_samples is not None:
        n = min(data_args.max_train_samples, len(dataset["train"]))
        dataset["train"] = dataset["train"].select(range(n))
        logger.info(f"Truncated training set to {n} samples")
    eval_key = "validation" if "validation" in dataset else "test"
    if data_args.max_eval_samples is not None and eval_key in dataset:
        n = min(data_args.max_eval_samples, len(dataset[eval_key]))
        dataset[eval_key] = dataset[eval_key].select(range(n))
        logger.info(f"Truncated eval set to {n} samples")

    # ---- Detect model family (SAM vs SAM2) and load processor/model ----
    model_id = model_args.model_name_or_path.lower()
    is_sam2 = "sam2" in model_id
    # Forward the checkpoint-related CLI options, which were previously parsed
    # but never used when loading the processor and model.
    hub_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "token": model_args.token,
    }
    if is_sam2:
        from transformers import Sam2Processor, Sam2Model

        processor = Sam2Processor.from_pretrained(model_args.model_name_or_path, **hub_kwargs)
        model = Sam2Model.from_pretrained(model_args.model_name_or_path, **hub_kwargs)
    else:
        from transformers import SamProcessor, SamModel

        processor = SamProcessor.from_pretrained(model_args.model_name_or_path, **hub_kwargs)
        model = SamModel.from_pretrained(model_args.model_name_or_path, **hub_kwargs)

    # Freeze every parameter whose name starts with a selected prefix.
    frozen_prefixes = tuple(
        prefix
        for prefix, enabled in (
            ("vision_encoder", model_args.freeze_vision_encoder),
            ("prompt_encoder", model_args.freeze_prompt_encoder),
        )
        if enabled
    )
    if frozen_prefixes:
        for name, param in model.named_parameters():
            if name.startswith(frozen_prefixes):
                param.requires_grad_(False)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logger.info(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.1f}%)")

    # ---- Build datasets ----
    # An empty string disables the combined prompt column.
    prompt_col = data_args.prompt_column_name if data_args.prompt_column_name else None
    ds_kwargs = dict(
        processor=processor,
        prompt_type=data_args.prompt_type,
        image_col=data_args.image_column_name,
        mask_col=data_args.mask_column_name,
        prompt_col=prompt_col,
        bbox_col=data_args.bbox_column_name,
        point_col=data_args.point_column_name,
    )
    train_dataset = SAMSegmentationDataset(dataset=dataset["train"], **ds_kwargs)
    eval_dataset = None
    if eval_key in dataset:
        eval_dataset = SAMSegmentationDataset(dataset=dataset[eval_key], **ds_kwargs)

    # ---- Train ----
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        data_collator=collate_fn,
        compute_loss_func=compute_loss,
    )
    if training_args.do_train:
        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()
    if training_args.do_eval and eval_dataset is not None:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
    trackio.finish()

    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "dataset": data_args.dataset_name,
        "tags": ["image-segmentation", "vision", "sam"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)


if __name__ == "__main__":
    main()