fix(skill-tester): make Security dimension opt-in with --include-security flag

- Add --include-security flag to quality_scorer.py
- Default: 4 dimensions × 25% (backward compatible)
- With --include-security: 5 dimensions × 20%
- Update tier recommendation logic for optional Security
- Update documentation to reflect opt-in behavior

This addresses the breaking change concern from PR review:
the weight change from 25% to 20% would affect all existing
audit baselines. The new opt-in approach preserves backward
compatibility.
This commit is contained in:
xingzihai
2026-03-27 10:05:12 +00:00
parent 2f92a1dfcb
commit e0e683ee5e
3 changed files with 129 additions and 56 deletions

View File

@@ -13,7 +13,9 @@ Author: Claude Skills Engineering Team
Version: 2.0.0
Dependencies: Python Standard Library Only
Changelog:
v2.0.0 - Added Security dimension (20% weight), rebalanced all dimensions to 20%
v2.0.0 - Added Security dimension (opt-in via --include-security flag)
Default: 4 dimensions × 25% (backward compatible)
With --include-security: 5 dimensions × 20%
v1.0.0 - Initial release with 4 dimensions (25% each)
"""
@@ -156,21 +158,35 @@ class QualityReport:
usability_score = self.dimensions.get("Usability", QualityDimension("", 0, "")).score
security_score = self.dimensions.get("Security", QualityDimension("", 0, "")).score
# POWERFUL tier requirements (all dimensions must be strong)
if (self.overall_score >= 80 and
all(score >= 75 for score in [doc_score, code_score, completeness_score, usability_score]) and
security_score >= 70):
self.tier_recommendation = "POWERFUL"
# STANDARD tier requirements (most dimensions good)
elif (self.overall_score >= 70 and
sum(1 for score in [doc_score, code_score, completeness_score, usability_score, security_score] if score >= 65) >= 4 and
security_score >= 50):
self.tier_recommendation = "STANDARD"
# BASIC tier (minimum viable quality)
# Check if Security dimension is included
has_security = "Security" in self.dimensions
# POWERFUL tier requirements
if has_security:
# With Security: all 5 dimensions must be strong
if (self.overall_score >= 80 and
all(score >= 75 for score in [doc_score, code_score, completeness_score, usability_score]) and
security_score >= 70):
self.tier_recommendation = "POWERFUL"
# STANDARD tier requirements (with Security)
elif (self.overall_score >= 70 and
sum(1 for score in [doc_score, code_score, completeness_score, usability_score, security_score] if score >= 65) >= 4 and
security_score >= 50):
self.tier_recommendation = "STANDARD"
else:
self.tier_recommendation = "BASIC"
# Without Security: 4 dimensions must be strong
if (self.overall_score >= 80 and
all(score >= 75 for score in [doc_score, code_score, completeness_score, usability_score])):
self.tier_recommendation = "POWERFUL"
# STANDARD tier requirements (without Security)
elif (self.overall_score >= 70 and
sum(1 for score in [doc_score, code_score, completeness_score, usability_score] if score >= 65) >= 3):
self.tier_recommendation = "STANDARD"
# BASIC tier (minimum viable quality)
# Falls through to BASIC if no other tier matched
def _generate_improvement_roadmap(self):
"""Generate prioritized improvement suggestions"""
@@ -209,10 +225,11 @@ class QualityReport:
class QualityScorer:
"""Main quality scoring engine"""
def __init__(self, skill_path: str, detailed: bool = False, verbose: bool = False):
def __init__(self, skill_path: str, detailed: bool = False, verbose: bool = False, include_security: bool = False):
self.skill_path = Path(skill_path).resolve()
self.detailed = detailed
self.verbose = verbose
self.include_security = include_security
self.report = QualityReport(str(self.skill_path))
def log_verbose(self, message: str):
@@ -229,12 +246,20 @@ class QualityScorer:
if not self.skill_path.exists():
raise ValueError(f"Skill path does not exist: {self.skill_path}")
# Score each dimension (20% weight each, 5 dimensions total)
self._score_documentation()
self._score_code_quality()
self._score_completeness()
self._score_security()
self._score_usability()
# Score each dimension
# Default: 4 dimensions at 25% each (backward compatible)
# With --include-security: 5 dimensions at 20% each
weight = 0.20 if self.include_security else 0.25
self._score_documentation(weight)
self._score_code_quality(weight)
self._score_completeness(weight)
if self.include_security:
self._score_security(0.20)
self._score_usability(0.20)
else:
self._score_usability(0.25)
# Calculate overall metrics
self.report.calculate_overall_score()
@@ -247,11 +272,11 @@ class QualityScorer:
return self.report
def _score_documentation(self):
"""Score documentation quality (25% weight)"""
def _score_documentation(self, weight: float = 0.25):
"""Score documentation quality"""
self.log_verbose("Scoring documentation quality...")
dimension = QualityDimension("Documentation", 0.20, "Quality of documentation and written materials")
dimension = QualityDimension("Documentation", weight, "Quality of documentation and written materials")
# Score SKILL.md
self._score_skill_md(dimension)
@@ -501,11 +526,11 @@ class QualityScorer:
dimension.add_score("examples_availability", score, 25,
f"Found {len(example_files)} example/sample files")
def _score_code_quality(self):
"""Score code quality (25% weight)"""
def _score_code_quality(self, weight: float = 0.25):
"""Score code quality"""
self.log_verbose("Scoring code quality...")
dimension = QualityDimension("Code Quality", 0.20, "Quality of Python scripts and implementation")
dimension = QualityDimension("Code Quality", weight, "Quality of Python scripts and implementation")
scripts_dir = self.skill_path / "scripts"
if not scripts_dir.exists():
@@ -688,11 +713,11 @@ class QualityScorer:
if avg_output_score < 15:
dimension.add_suggestion("Add support for both JSON and human-readable output formats")
def _score_completeness(self):
"""Score completeness (25% weight)"""
def _score_completeness(self, weight: float = 0.25):
"""Score completeness"""
self.log_verbose("Scoring completeness...")
dimension = QualityDimension("Completeness", 0.20, "Completeness of required components and assets")
dimension = QualityDimension("Completeness", weight, "Completeness of required components and assets")
# Score directory structure
self._score_directory_structure(dimension)
@@ -810,11 +835,11 @@ class QualityScorer:
if not test_files:
dimension.add_suggestion("Add test files or validation scripts")
def _score_usability(self):
"""Score usability (25% weight)"""
def _score_usability(self, weight: float = 0.25):
"""Score usability"""
self.log_verbose("Scoring usability...")
dimension = QualityDimension("Usability", 0.20, "Ease of use and user experience")
dimension = QualityDimension("Usability", weight, "Ease of use and user experience")
# Score installation simplicity
self._score_installation(dimension)
@@ -946,11 +971,11 @@ class QualityScorer:
dimension.add_score("practical_examples", score, 25,
f"Practical examples: {len(example_files)} files")
def _score_security(self):
"""Score security quality (20% weight)"""
def _score_security(self, weight: float = 0.20):
"""Score security quality"""
self.log_verbose("Scoring security quality...")
dimension = QualityDimension("Security", 0.20, "Security practices and vulnerability prevention")
dimension = QualityDimension("Security", weight, "Security practices and vulnerability prevention")
# Find Python scripts
python_files = list(self.skill_path.rglob("*.py"))
@@ -1078,13 +1103,17 @@ Examples:
python quality_scorer.py engineering/my-skill
python quality_scorer.py engineering/my-skill --detailed --json
python quality_scorer.py engineering/my-skill --minimum-score 75
python quality_scorer.py engineering/my-skill --include-security
Quality Dimensions (each 25%):
Quality Dimensions (default: 4 dimensions × 25%):
Documentation - SKILL.md quality, README, references, examples
Code Quality - Script complexity, error handling, structure, output
Completeness - Directory structure, assets, expected outputs, tests
Usability - Installation simplicity, usage clarity, help accessibility
With --include-security (5 dimensions × 20%):
Security - Sensitive data exposure, command injection, input validation
Letter Grades: A+ (95+), A (90+), A- (85+), B+ (80+), B (75+), B- (70+), C+ (65+), C (60+), C- (55+), D (50+), F (<50)
"""
)
@@ -1104,12 +1133,15 @@ Letter Grades: A+ (95+), A (90+), A- (85+), B+ (80+), B (75+), B- (70+), C+ (65+
parser.add_argument("--verbose",
action="store_true",
help="Enable verbose logging")
parser.add_argument("--include-security",
action="store_true",
help="Include Security dimension (switches to 5 dimensions × 20%% each)")
args = parser.parse_args()
try:
# Create scorer and assess quality
scorer = QualityScorer(args.skill_path, args.detailed, args.verbose)
scorer = QualityScorer(args.skill_path, args.detailed, args.verbose, args.include_security)
report = scorer.assess_quality()
# Format and output report