claude-skills-reference/engineering/skill-tester/scripts/quality_scorer.py

#!/usr/bin/env python3
"""
Quality Scorer - Scores skills across multiple quality dimensions

This script provides comprehensive quality assessment for skills in the claude-skills
ecosystem by evaluating documentation, code quality, completeness, security, and usability.
Generates letter grades, tier recommendations, and improvement roadmaps.

Usage:
    python quality_scorer.py <skill_path> [--detailed] [--minimum-score SCORE] [--json]

Author: Claude Skills Engineering Team
Version: 2.0.0
Dependencies: Python Standard Library Only
Changelog:
  v2.0.0 - Added Security dimension (20% weight), rebalanced all dimensions to 20%
  v1.0.0 - Initial release with 4 dimensions (25% each)
"""

import argparse
import ast
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple

# Import Security Scorer module
from security_scorer import SecurityScorer
try:
    import yaml
except ImportError:
    class _YamlStub:
        """Tiny stand-in used when PyYAML is not installed.

        Only flat ``key: value`` lines are understood, which is enough for
        simple frontmatter; nested structures, lists and quoting are not parsed.
        """

        class YAMLError(Exception):
            """Placeholder so ``except yaml.YAMLError`` resolves without PyYAML."""

        @staticmethod
        def safe_load(text):
            """Parse ``key: value`` lines into a dict; return None if none are found."""
            # Split each candidate line on its first colon only, so values may
            # themselves contain colons.
            pairs = (
                raw_line.partition(':')
                for raw_line in text.strip().splitlines()
                if ':' in raw_line
            )
            parsed = {name.strip(): text_value.strip() for name, _, text_value in pairs}
            return parsed or None

    yaml = _YamlStub()


class QualityDimension:
    """One scored axis of a quality assessment.

    Component scores are collected with ``add_score``; ``calculate_final_score``
    then folds them into a single percentage stored on ``score``.
    """

    def __init__(self, name: str, weight: float, description: str):
        """Create an empty dimension with the given name, weight and description."""
        self.name = name
        self.weight = weight
        self.description = description
        self.score = 0.0
        self.max_score = 100.0
        self.details = {}
        self.suggestions = []

    def add_score(self, component: str, score: float, max_score: float, details: str = ""):
        """Record (or overwrite) the score of one named component."""
        # A non-positive maximum cannot yield a meaningful percentage.
        percentage = 0
        if max_score > 0:
            percentage = score / max_score * 100

        self.details[component] = dict(
            score=score,
            max_score=max_score,
            percentage=percentage,
            details=details,
        )

    def calculate_final_score(self):
        """Set ``score`` to earned points over possible points, as a percentage."""
        earned = 0
        possible = 0
        for entry in self.details.values():
            earned += entry["score"]
            possible += entry["max_score"]

        # No components (or a zero maximum) leaves the dimension at 0.0.
        self.score = (earned / possible * 100) if possible > 0 else 0.0

    def add_suggestion(self, suggestion: str):
        """Append an improvement suggestion for this dimension."""
        self.suggestions.append(suggestion)


class QualityReport:
    """Container for quality assessment results"""

    def __init__(self, skill_path: str):
        self.skill_path = skill_path
        self.timestamp = datetime.utcnow().isoformat() + "Z"
        self.dimensions = {}
        self.overall_score = 0.0
        self.letter_grade = "F"
        self.tier_recommendation = "BASIC"
        self.improvement_roadmap = []
        self.summary_stats = {}

    def add_dimension(self, dimension: QualityDimension):
        """Add a quality dimension"""
        self.dimensions[dimension.name] = dimension

    def calculate_overall_score(self):
        """Calculate overall weighted score"""
        if not self.dimensions:
            return

        total_weighted_score = 0.0
        total_weight = 0.0

        for dimension in self.dimensions.values():
            total_weighted_score += dimension.score * dimension.weight
            total_weight += dimension.weight

        self.overall_score = total_weighted_score / total_weight if total_weight > 0 else 0.0

        # Calculate letter grade
        if self.overall_score >= 95:
            self.letter_grade = "A+"
        elif self.overall_score >= 90:
            self.letter_grade = "A"
        elif self.overall_score >= 85:
            self.letter_grade = "A-"
        elif self.overall_score >= 80:
            self.letter_grade = "B+"
        elif self.overall_score >= 75:
            self.letter_grade = "B"
        elif self.overall_score >= 70:
            self.letter_grade = "B-"
        elif self.overall_score >= 65:
            self.letter_grade = "C+"
        elif self.overall_score >= 60:
            self.letter_grade = "C"
        elif self.overall_score >= 55:
            self.letter_grade = "C-"
        elif self.overall_score >= 50:
            self.letter_grade = "D"
        else:
            self.letter_grade = "F"

        # Recommend tier based on overall score and specific criteria
        self._calculate_tier_recommendation()

        # Generate improvement roadmap
        self._generate_improvement_roadmap()

        # Calculate summary statistics
        self._calculate_summary_stats()

    def _calculate_tier_recommendation(self):
        """Calculate recommended tier based on quality scores"""
        doc_score = self.dimensions.get("Documentation", QualityDimension("", 0, "")).score
        code_score = self.dimensions.get("Code Quality", QualityDimension("", 0, "")).score
        completeness_score = self.dimensions.get("Completeness", QualityDimension("", 0, "")).score
        usability_score = self.dimensions.get("Usability", QualityDimension("", 0, "")).score
        security_score = self.dimensions.get("Security", QualityDimension("", 0, "")).score

        # POWERFUL tier requirements (all dimensions must be strong)
        if (self.overall_score >= 80 and
            all(score >= 75 for score in [doc_score, code_score, completeness_score, usability_score]) and
            security_score >= 70):
            self.tier_recommendation = "POWERFUL"

        # STANDARD tier requirements (most dimensions good)
        elif (self.overall_score >= 70 and
              sum(1 for score in [doc_score, code_score, completeness_score, usability_score, security_score] if score >= 65) >= 4 and
              security_score >= 50):
            self.tier_recommendation = "STANDARD"

        # BASIC tier (minimum viable quality)
        else:
            self.tier_recommendation = "BASIC"

    def _generate_improvement_roadmap(self):
        """Generate prioritized improvement suggestions"""
        all_suggestions = []

        # Collect suggestions from all dimensions with scores
        for dim_name, dimension in self.dimensions.items():
            for suggestion in dimension.suggestions:
                priority = "HIGH" if dimension.score < 60 else "MEDIUM" if dimension.score < 75 else "LOW"
                all_suggestions.append({
                    "priority": priority,
                    "dimension": dim_name,
                    "suggestion": suggestion,
                    "current_score": dimension.score
                })

        # Sort by priority and score
        priority_order = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
        all_suggestions.sort(key=lambda x: (priority_order[x["priority"]], x["current_score"]))

        self.improvement_roadmap = all_suggestions[:10]  # Top 10 suggestions

    def _calculate_summary_stats(self):
        """Calculate summary statistics"""
        scores = [dim.score for dim in self.dimensions.values()]

        self.summary_stats = {
            "highest_dimension": max(self.dimensions.items(), key=lambda x: x[1].score)[0] if scores else "None",
            "lowest_dimension": min(self.dimensions.items(), key=lambda x: x[1].score)[0] if scores else "None",
            "score_variance": sum((score - self.overall_score) ** 2 for score in scores) / len(scores) if scores else 0,
            "dimensions_above_70": sum(1 for score in scores if score >= 70),
            "dimensions_below_50": sum(1 for score in scores if score < 50)
        }


class QualityScorer:
    """Main quality scoring engine"""

    def __init__(self, skill_path: str, detailed: bool = False, verbose: bool = False):
        """Bind the scorer to a skill directory and start an empty report.

        Args:
            skill_path: Location of the skill; stored as a resolved ``Path``.
            detailed: Stored on the instance as ``self.detailed``.
            verbose: When true, ``log_verbose`` writes progress to stderr.
        """
        resolved_path = Path(skill_path).resolve()
        self.skill_path = resolved_path
        self.detailed = detailed
        self.verbose = verbose
        # The report records the resolved location as a plain string.
        self.report = QualityReport(str(resolved_path))

    def log_verbose(self, message: str):
        """Log verbose message if verbose mode enabled"""
        if self.verbose:
            print(f"[VERBOSE] {message}", file=sys.stderr)

    def assess_quality(self) -> QualityReport:
        """Main quality assessment entry point"""
        try:
            self.log_verbose(f"Starting quality assessment for {self.skill_path}")

            # Check if skill path exists
            if not self.skill_path.exists():
                raise ValueError(f"Skill path does not exist: {self.skill_path}")

            # Score each dimension (20% weight each, 5 dimensions total)
            self._score_documentation()
            self._score_code_quality()
            self._score_completeness()
            self._score_security()
            self._score_usability()

            # Calculate overall metrics
            self.report.calculate_overall_score()

            self.log_verbose(f"Quality assessment completed. Overall score: {self.report.overall_score:.1f}")

        except Exception as e:
            print(f"Quality assessment failed: {str(e)}", file=sys.stderr)
            raise

        return self.report

    def _score_documentation(self):
        """Build the "Documentation" dimension (weight 0.20) and add it to the report.

        Components are scored in this order: SKILL.md, README.md, the
        references directory, then example files.
        """
        self.log_verbose("Scoring documentation quality...")

        dimension = QualityDimension("Documentation", 0.20, "Quality of documentation and written materials")

        component_scorers = (
            self._score_skill_md,
            self._score_readme,
            self._score_references,
            self._score_examples,
        )
        for score_component in component_scorers:
            score_component(dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_skill_md(self, dimension: QualityDimension):
        """Score SKILL.md quality"""
        skill_md_path = self.skill_path / "SKILL.md"

        if not skill_md_path.exists():
            dimension.add_score("skill_md_existence", 0, 25, "SKILL.md does not exist")
            dimension.add_suggestion("Create comprehensive SKILL.md file")
            return

        try:
            content = skill_md_path.read_text(encoding='utf-8')
            lines = [line for line in content.split('\n') if line.strip()]

            # Score based on length and depth
            line_count = len(lines)
            if line_count >= 400:
                length_score = 25
            elif line_count >= 300:
                length_score = 20
            elif line_count >= 200:
                length_score = 15
            elif line_count >= 100:
                length_score = 10
            else:
                length_score = 5

            dimension.add_score("skill_md_length", length_score, 25,
                               f"SKILL.md has {line_count} lines")

            if line_count < 300:
                dimension.add_suggestion("Expand SKILL.md with more detailed sections")

            # Score frontmatter quality
            frontmatter_score = self._score_frontmatter(content)
            dimension.add_score("skill_md_frontmatter", frontmatter_score, 25,
                               "Frontmatter completeness and accuracy")

            # Score section completeness
            section_score = self._score_sections(content)
            dimension.add_score("skill_md_sections", section_score, 25,
                               "Required and recommended section coverage")

            # Score content depth
            depth_score = self._score_content_depth(content)
            dimension.add_score("skill_md_depth", depth_score, 25,
                               "Content depth and technical detail")

        except Exception as e:
            dimension.add_score("skill_md_readable", 0, 25, f"Error reading SKILL.md: {str(e)}")
            dimension.add_suggestion("Fix SKILL.md file encoding or format issues")

    def _score_frontmatter(self, content: str) -> float:
        """Score SKILL.md frontmatter quality"""
        required_fields = ["Name", "Tier", "Category", "Dependencies", "Author", "Version"]
        recommended_fields = ["Last Updated", "Description"]

        try:
            if not content.startswith('---'):
                return 5  # Partial credit for having some structure

            end_marker = content.find('---', 3)
            if end_marker == -1:
                return 5

            frontmatter_text = content[3:end_marker].strip()
            frontmatter = yaml.safe_load(frontmatter_text)

            if not isinstance(frontmatter, dict):
                return 5

            score = 0

            # Required fields (15 points)
            present_required = sum(1 for field in required_fields if field in frontmatter)
            score += (present_required / len(required_fields)) * 15

            # Recommended fields (5 points)
            present_recommended = sum(1 for field in recommended_fields if field in frontmatter)
            score += (present_recommended / len(recommended_fields)) * 5

            # Quality of field values (5 points)
            quality_bonus = 0
            for field, value in frontmatter.items():
                if isinstance(value, str) and len(value.strip()) > 3:
                    quality_bonus += 0.5

            score += min(quality_bonus, 5)

            return min(score, 25)

        except yaml.YAMLError:
            return 5  # Some credit for attempting frontmatter

    def _score_sections(self, content: str) -> float:
        """Score section completeness"""
        required_sections = ["Description", "Features", "Usage", "Examples"]
        recommended_sections = ["Architecture", "Installation", "Troubleshooting", "Contributing"]

        score = 0

        # Required sections (15 points)
        present_required = 0
        for section in required_sections:
            if re.search(rf'^#+\s*{re.escape(section)}\s*$', content, re.MULTILINE | re.IGNORECASE):
                present_required += 1

        score += (present_required / len(required_sections)) * 15

        # Recommended sections (10 points)
        present_recommended = 0
        for section in recommended_sections:
            if re.search(rf'^#+\s*{re.escape(section)}\s*$', content, re.MULTILINE | re.IGNORECASE):
                present_recommended += 1

        score += (present_recommended / len(recommended_sections)) * 10

        return score

    def _score_content_depth(self, content: str) -> float:
        """Score content depth and technical detail"""
        score = 0

        # Code examples (8 points)
        code_blocks = len(re.findall(r'```[\w]*\n.*?\n```', content, re.DOTALL))
        score += min(code_blocks * 2, 8)

        # Technical depth indicators (8 points)
        depth_indicators = ['API', 'algorithm', 'architecture', 'implementation', 'performance',
                           'scalability', 'security', 'integration', 'configuration', 'parameters']
        depth_score = sum(1 for indicator in depth_indicators if indicator.lower() in content.lower())
        score += min(depth_score * 0.8, 8)

        # Usage examples (9 points)
        example_patterns = [r'Example:', r'Usage:', r'```bash', r'```python', r'```yaml']
        example_count = sum(len(re.findall(pattern, content, re.IGNORECASE)) for pattern in example_patterns)
        score += min(example_count * 1.5, 9)

        return score

    def _score_readme(self, dimension: QualityDimension):
        """Score README.md quality"""
        readme_path = self.skill_path / "README.md"

        if not readme_path.exists():
            dimension.add_score("readme_existence", 10, 25, "README.md exists (partial credit)")
            dimension.add_suggestion("Create README.md with usage instructions")
            return

        try:
            content = readme_path.read_text(encoding='utf-8')

            # Length and substance
            if len(content.strip()) >= 1000:
                length_score = 25
            elif len(content.strip()) >= 500:
                length_score = 20
            elif len(content.strip()) >= 200:
                length_score = 15
            else:
                length_score = 10

            dimension.add_score("readme_quality", length_score, 25,
                               f"README.md content quality ({len(content)} characters)")

            if len(content.strip()) < 500:
                dimension.add_suggestion("Expand README.md with more detailed usage examples")

        except Exception:
            dimension.add_score("readme_readable", 5, 25, "README.md exists but has issues")

    def _score_references(self, dimension: QualityDimension):
        """Score reference documentation quality"""
        references_dir = self.skill_path / "references"

        if not references_dir.exists():
            dimension.add_score("references_existence", 0, 25, "No references directory")
            dimension.add_suggestion("Add references directory with documentation")
            return

        ref_files = list(references_dir.glob("*.md")) + list(references_dir.glob("*.txt"))

        if not ref_files:
            dimension.add_score("references_content", 5, 25, "References directory empty")
            dimension.add_suggestion("Add reference documentation files")
            return

        # Score based on number and quality of reference files
        score = min(len(ref_files) * 5, 20)  # Up to 20 points for multiple files

        # Bonus for substantial content
        total_content = 0
        for ref_file in ref_files:
            try:
                content = ref_file.read_text(encoding='utf-8')
                total_content += len(content.strip())
            except:
                continue

        if total_content >= 2000:
            score += 5  # Bonus for substantial reference content

        dimension.add_score("references_quality", score, 25,
                           f"References: {len(ref_files)} files, {total_content} chars")

    def _score_examples(self, dimension: QualityDimension):
        """Score examples and usage clarity"""
        score = 0

        # Look for example files in various locations
        example_locations = ["examples", "assets", "scripts"]
        example_files = []

        for location in example_locations:
            location_path = self.skill_path / location
            if location_path.exists():
                example_files.extend(location_path.glob("*example*"))
                example_files.extend(location_path.glob("*sample*"))
                example_files.extend(location_path.glob("*demo*"))

        # Score based on example availability
        if len(example_files) >= 3:
            score = 25
        elif len(example_files) >= 2:
            score = 20
        elif len(example_files) >= 1:
            score = 15
        else:
            score = 10
            dimension.add_suggestion("Add more usage examples and sample files")

        dimension.add_score("examples_availability", score, 25,
                           f"Found {len(example_files)} example/sample files")

    def _score_code_quality(self):
        """Build the "Code Quality" dimension (weight 0.20) and add it to the report.

        Without a ``scripts`` directory, or without any ``*.py`` file directly
        inside it, a single 0/100 component is recorded. Otherwise complexity,
        error handling, structure and output support are scored in that order.
        """
        self.log_verbose("Scoring code quality...")

        dimension = QualityDimension("Code Quality", 0.20, "Quality of Python scripts and implementation")

        scripts_dir = self.skill_path / "scripts"
        # None means "no scripts directory"; an empty list means "no scripts in it".
        python_files = list(scripts_dir.glob("*.py")) if scripts_dir.exists() else None

        if python_files is None:
            dimension.add_score("scripts_existence", 0, 100, "No scripts directory")
            dimension.add_suggestion("Create scripts directory with Python files")
        elif not python_files:
            dimension.add_score("python_scripts", 0, 100, "No Python scripts found")
            dimension.add_suggestion("Add Python scripts to scripts directory")
        else:
            script_scorers = (
                self._score_script_complexity,
                self._score_error_handling,
                self._score_code_structure,
                self._score_output_support,
            )
            for score_scripts in script_scorers:
                score_scripts(python_files, dimension)

        # Every path finishes the same way.
        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_script_complexity(self, python_files: List[Path], dimension: QualityDimension):
        """Score script complexity and sophistication"""
        total_complexity = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')

                # Count lines of code (excluding empty lines and comments)
                lines = content.split('\n')
                loc = len([line for line in lines if line.strip() and not line.strip().startswith('#')])

                # Score based on LOC
                if loc >= 800:
                    complexity_score = 25
                elif loc >= 500:
                    complexity_score = 20
                elif loc >= 300:
                    complexity_score = 15
                elif loc >= 100:
                    complexity_score = 10
                else:
                    complexity_score = 5

                total_complexity += complexity_score

            except Exception:
                continue

        avg_complexity = total_complexity / script_count if script_count > 0 else 0
        dimension.add_score("script_complexity", avg_complexity, 25,
                           f"Average script complexity across {script_count} scripts")

        if avg_complexity < 15:
            dimension.add_suggestion("Consider expanding scripts with more functionality")

    def _score_error_handling(self, python_files: List[Path], dimension: QualityDimension):
        """Score error handling quality"""
        total_error_score = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')
                error_score = 0

                # Check for try/except blocks
                try_count = content.count('try:')
                error_score += min(try_count * 5, 15)  # Up to 15 points for try/except

                # Check for specific exception handling
                exception_types = ['Exception', 'ValueError', 'FileNotFoundError', 'KeyError', 'TypeError']
                for exc_type in exception_types:
                    if exc_type in content:
                        error_score += 2  # 2 points per specific exception type

                # Check for logging or error reporting
                if any(indicator in content for indicator in ['print(', 'logging.', 'sys.stderr']):
                    error_score += 5  # 5 points for error reporting

                total_error_score += min(error_score, 25)  # Cap at 25 per script

            except Exception:
                continue

        avg_error_score = total_error_score / script_count if script_count > 0 else 0
        dimension.add_score("error_handling", avg_error_score, 25,
                           f"Error handling quality across {script_count} scripts")

        if avg_error_score < 15:
            dimension.add_suggestion("Improve error handling with try/except blocks and meaningful error messages")

    def _score_code_structure(self, python_files: List[Path], dimension: QualityDimension):
        """Score code structure and organization"""
        total_structure_score = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')
                structure_score = 0

                # Check for functions and classes
                function_count = content.count('def ')
                class_count = content.count('class ')

                structure_score += min(function_count * 2, 10)  # Up to 10 points for functions
                structure_score += min(class_count * 3, 9)     # Up to 9 points for classes

                # Check for docstrings
                docstring_patterns = ['"""', "'''", 'def.*:\n.*"""', 'class.*:\n.*"""']
                for pattern in docstring_patterns:
                    if re.search(pattern, content):
                        structure_score += 1  # 1 point per docstring indicator

                # Check for if __name__ == "__main__"
                if 'if __name__ == "__main__"' in content:
                    structure_score += 3

                # Check for imports organization
                if content.lstrip().startswith(('import ', 'from ')):
                    structure_score += 2  # Imports at top

                total_structure_score += min(structure_score, 25)

            except Exception:
                continue

        avg_structure_score = total_structure_score / script_count if script_count > 0 else 0
        dimension.add_score("code_structure", avg_structure_score, 25,
                           f"Code structure quality across {script_count} scripts")

        if avg_structure_score < 15:
            dimension.add_suggestion("Improve code structure with more functions, classes, and documentation")

    def _score_output_support(self, python_files: List[Path], dimension: QualityDimension):
        """Score output format support"""
        total_output_score = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')
                output_score = 0

                # Check for JSON support
                if any(indicator in content for indicator in ['json.dump', 'json.load', '--json']):
                    output_score += 12  # JSON support

                # Check for formatted output
                if any(indicator in content for indicator in ['print(f"', 'print("', '.format(', 'f"']):
                    output_score += 8  # Human-readable output

                # Check for argparse help
                if '--help' in content or 'add_help=' in content:
                    output_score += 5  # Help functionality

                total_output_score += min(output_score, 25)

            except Exception:
                continue

        avg_output_score = total_output_score / script_count if script_count > 0 else 0
        dimension.add_score("output_support", avg_output_score, 25,
                           f"Output format support across {script_count} scripts")

        if avg_output_score < 15:
            dimension.add_suggestion("Add support for both JSON and human-readable output formats")

    def _score_completeness(self):
        """Build the "Completeness" dimension (weight 0.20) and add it to the report.

        Components are scored in this order: directory structure, assets,
        expected outputs, then test coverage.
        """
        self.log_verbose("Scoring completeness...")

        dimension = QualityDimension("Completeness", 0.20, "Completeness of required components and assets")

        component_scorers = (
            self._score_directory_structure,
            self._score_assets,
            self._score_expected_outputs,
            self._score_test_coverage,
        )
        for score_component in component_scorers:
            score_component(dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_directory_structure(self, dimension: QualityDimension):
        """Score directory structure completeness"""
        required_dirs = ["scripts"]
        recommended_dirs = ["assets", "references", "expected_outputs"]

        score = 0

        # Required directories (15 points)
        for dir_name in required_dirs:
            if (self.skill_path / dir_name).exists():
                score += 15 / len(required_dirs)

        # Recommended directories (10 points)
        present_recommended = 0
        for dir_name in recommended_dirs:
            if (self.skill_path / dir_name).exists():
                present_recommended += 1

        score += (present_recommended / len(recommended_dirs)) * 10

        dimension.add_score("directory_structure", score, 25,
                           f"Directory structure completeness")

        missing_recommended = [d for d in recommended_dirs if not (self.skill_path / d).exists()]
        if missing_recommended:
            dimension.add_suggestion(f"Add recommended directories: {', '.join(missing_recommended)}")

    def _score_assets(self, dimension: QualityDimension):
        """Score asset availability and quality"""
        assets_dir = self.skill_path / "assets"

        if not assets_dir.exists():
            dimension.add_score("assets_existence", 5, 25, "Assets directory missing")
            dimension.add_suggestion("Create assets directory with sample data")
            return

        asset_files = [f for f in assets_dir.rglob("*") if f.is_file()]

        if not asset_files:
            dimension.add_score("assets_content", 10, 25, "Assets directory empty")
            dimension.add_suggestion("Add sample data files to assets directory")
            return

        # Score based on number and diversity of assets
        score = min(len(asset_files) * 3, 20)  # Up to 20 points for multiple assets

        # Bonus for diverse file types
        extensions = set(f.suffix.lower() for f in asset_files if f.suffix)
        if len(extensions) >= 3:
            score += 5  # Bonus for file type diversity

        dimension.add_score("assets_quality", score, 25,
                           f"Assets: {len(asset_files)} files, {len(extensions)} types")

    def _score_expected_outputs(self, dimension: QualityDimension):
        """Score expected outputs availability"""
        expected_dir = self.skill_path / "expected_outputs"

        if not expected_dir.exists():
            dimension.add_score("expected_outputs", 10, 25, "Expected outputs directory missing")
            dimension.add_suggestion("Add expected_outputs directory with sample results")
            return

        output_files = [f for f in expected_dir.rglob("*") if f.is_file()]

        if len(output_files) >= 3:
            score = 25
        elif len(output_files) >= 2:
            score = 20
        elif len(output_files) >= 1:
            score = 15
        else:
            score = 10
            dimension.add_suggestion("Add expected output files for testing")

        dimension.add_score("expected_outputs", score, 25,
                           f"Expected outputs: {len(output_files)} files")

    def _score_test_coverage(self, dimension: QualityDimension):
        """Score test coverage and validation"""
        # This is a simplified scoring - in a more sophisticated system,
        # this would integrate with actual test runners

        score = 15  # Base score for having a structure

        # Check for test-related files
        test_indicators = ["test", "spec", "check"]
        test_files = []

        for indicator in test_indicators:
            test_files.extend(self.skill_path.rglob(f"*{indicator}*"))

        if test_files:
            score += 10  # Bonus for test files

        dimension.add_score("test_coverage", score, 25,
                           f"Test coverage indicators: {len(test_files)} files")

        if not test_files:
            dimension.add_suggestion("Add test files or validation scripts")

    def _score_usability(self):
        """Build the "Usability" dimension (weight 0.20) and add it to the report.

        Components are scored in this order: installation simplicity, usage
        clarity, help accessibility, then practical examples.
        """
        self.log_verbose("Scoring usability...")

        dimension = QualityDimension("Usability", 0.20, "Ease of use and user experience")

        component_scorers = (
            self._score_installation,
            self._score_usage_clarity,
            self._score_help_accessibility,
            self._score_practical_examples,
        )
        for score_component in component_scorers:
            score_component(dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_installation(self, dimension: QualityDimension):
        """Rate how easy the skill is to install (max 25 points).

        A skill starts with the full 25 points and loses 5 when a
        ``requirements.txt`` is present (also adding a suggestion) and 3 when
        a ``setup.py`` is present. The recorded value never drops below 15.
        """
        has_requirements = (self.skill_path / "requirements.txt").exists()
        has_setup_script = (self.skill_path / "setup.py").exists()

        penalty = 0
        if has_requirements:
            # External dependencies make installation slightly harder.
            penalty += 5
            dimension.add_suggestion("Consider removing external dependencies for easier installation")
        if has_setup_script:
            # A setup script hints at a more involved install procedure.
            penalty += 3

        points = 25 - penalty
        if points < 15:
            points = 15

        dimension.add_score("installation_simplicity", points, 25,
                           "Installation complexity assessment")

    def _score_usage_clarity(self, dimension: QualityDimension):
        """Score how clearly the skill explains its usage (max 25 points).

        Points awarded:
            * 10 if README.md mentions "usage" or "how to" (case-insensitive)
            * 5 if README.md mentions "example" (case-insensitive)
            * 2 per file in ``scripts/*.py`` that contains both ``argparse``
              and ``help=``, capped at 10

        Files that cannot be read or decoded as UTF-8 earn nothing and are
        skipped. A suggestion is added when the total is below 15.

        Args:
            dimension: Dimension that receives the ``usage_clarity`` component.
        """
        score = 0

        # Check README for usage instructions
        readme_path = self.skill_path / "README.md"
        if readme_path.exists():
            try:
                content = readme_path.read_text(encoding='utf-8').lower()
            except (OSError, UnicodeError) as exc:
                # Best effort: an unreadable README simply earns no points.
                # Only I/O and decoding failures are tolerated so that
                # KeyboardInterrupt/SystemExit still propagate.
                self.log_verbose(f"Skipping unreadable README {readme_path}: {exc}")
            else:
                if 'usage' in content or 'how to' in content:
                    score += 10
                if 'example' in content:
                    score += 5

        # Check scripts for help text quality
        scripts_dir = self.skill_path / "scripts"
        if scripts_dir.exists():
            help_quality = 0

            for script_path in scripts_dir.glob("*.py"):
                try:
                    content = script_path.read_text(encoding='utf-8')
                except (OSError, UnicodeError) as exc:
                    self.log_verbose(f"Skipping unreadable script {script_path}: {exc}")
                    continue

                if 'argparse' in content and 'help=' in content:
                    help_quality += 2

            score += min(help_quality, 10)  # Up to 10 points for help text

        dimension.add_score("usage_clarity", score, 25, "Usage instructions and help quality")

        if score < 15:
            dimension.add_suggestion("Improve usage documentation and help text")

    def _score_help_accessibility(self, dimension: QualityDimension):
        """Score help and documentation accessibility (max 25 points).

        Points awarded:
            * 5 per file in ``scripts/*.py`` containing ``epilog=`` or
              ``description=``
            * 3 per file in ``scripts/*.py`` containing "examples:" or
              "example:" (case-insensitive)
            * 5 if the skill root holds at least two ``*.md`` files

        The recorded value is capped at 25. Scripts that cannot be read or
        decoded as UTF-8 are skipped. A suggestion is added when the total is
        below 15.

        Args:
            dimension: Dimension that receives the ``help_accessibility``
                component.
        """
        score = 0

        # Check for comprehensive help in scripts
        scripts_dir = self.skill_path / "scripts"
        if scripts_dir.exists():
            for script_path in scripts_dir.glob("*.py"):
                try:
                    content = script_path.read_text(encoding='utf-8')
                except (OSError, UnicodeError) as exc:
                    # Best effort: only I/O and decoding failures are
                    # tolerated so KeyboardInterrupt/SystemExit propagate.
                    self.log_verbose(f"Skipping unreadable script {script_path}: {exc}")
                    continue

                # Check for detailed help text
                if 'epilog=' in content or 'description=' in content:
                    score += 5  # Detailed help

                # Check for examples in help
                lowered = content.lower()
                if 'examples:' in lowered or 'example:' in lowered:
                    score += 3  # Examples in help

        # Check for documentation files
        doc_files = list(self.skill_path.glob("*.md"))
        if len(doc_files) >= 2:
            score += 5  # Multiple documentation files

        dimension.add_score("help_accessibility", min(score, 25), 25,
                           "Help and documentation accessibility")

        if score < 15:
            dimension.add_suggestion("Add more comprehensive help text and documentation")

    def _score_practical_examples(self, dimension: QualityDimension):
        """Score practical examples quality (max 25 points).

        Counts the distinct paths (files or directories) under the skill whose
        name contains "example", "sample", "demo" or "tutorial" and maps the
        count to a tier: 0 -> 5, 1 -> 10, 2 -> 15, 3-4 -> 20, 5+ -> 25. A
        suggestion is added when nothing is found.

        Args:
            dimension: Dimension that receives the ``practical_examples``
                component.
        """
        # Look for example files. A name such as "sample_example.json" matches
        # several patterns; a set keeps it from being counted more than once
        # and pushing the skill into a higher tier than it earned. Bytecode
        # caches are build artefacts, not examples, and are left out.
        example_files = set()
        for pattern in ("*example*", "*sample*", "*demo*", "*tutorial*"):
            example_files.update(
                path for path in self.skill_path.rglob(pattern)
                if "__pycache__" not in path.parts
            )

        example_count = len(example_files)

        # Score based on example availability
        if example_count >= 5:
            score = 25
        elif example_count >= 3:
            score = 20
        elif example_count >= 2:
            score = 15
        elif example_count >= 1:
            score = 10
        else:
            score = 5
            dimension.add_suggestion("Add more practical examples and sample files")

        dimension.add_score("practical_examples", score, 25,
                           f"Practical examples: {example_count} files")

    def _score_security(self):
        """Score security quality (20% weight).

        Hands every non-test Python file under the skill to ``SecurityScorer``
        and records its four component scores (25 points each) on a
        "Security" dimension. A skill without such files gets full marks; a
        failure inside the scorer is recorded as a zero-score
        ``security_error`` component instead of propagating.
        """
        self.log_verbose("Scoring security quality...")

        dimension = QualityDimension("Security", 0.20, "Security practices and vulnerability prevention")

        # Python sources, leaving out bytecode caches and test modules.
        python_files = [
            candidate
            for candidate in self.skill_path.rglob("*.py")
            if not ("__pycache__" in str(candidate) or "test_" in candidate.name)
        ]

        if python_files:
            # (result key / component name, description), in reporting order
            components = (
                ("sensitive_data_exposure",
                 "Detection and prevention of hardcoded credentials"),
                ("safe_file_operations",
                 "Prevention of path traversal vulnerabilities"),
                ("command_injection_prevention",
                 "Prevention of command injection vulnerabilities"),
                ("input_validation",
                 "Quality of input validation and error handling"),
            )

            try:
                security_scorer = SecurityScorer(python_files, verbose=self.verbose)
                result = security_scorer.get_overall_score()

                # Pull out every score before recording any of them, so a
                # malformed result leaves no partial components behind.
                extracted = [
                    (key, result.get(key, {}).get("score", 0), blurb)
                    for key, blurb in components
                ]
                for key, value, blurb in extracted:
                    dimension.add_score(key, value, 25, blurb)

                # Issues reported by the scorer become suggestions.
                for issue in result.get("issues", []):
                    dimension.add_suggestion(issue)

            except Exception as e:
                failure = f"Security scoring failed: {str(e)}"
                self.log_verbose(failure)
                dimension.add_score("security_error", 0, 100, failure)
                dimension.add_suggestion("Fix security scoring module integration")
        else:
            dimension.add_score("scripts_existence", 25, 25,
                               "No scripts directory - no script security concerns")

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)


class QualityReportFormatter:
    """Renders a quality report as JSON or as a plain-text summary."""

    @staticmethod
    def format_json(report: QualityReport) -> str:
        """Serialize *report* to a JSON string indented by two spaces.

        The overall score and each dimension score are rounded to one
        decimal place; all other fields are emitted as stored on the report.
        """
        payload = {
            "skill_path": report.skill_path,
            "timestamp": report.timestamp,
            "overall_score": round(report.overall_score, 1),
            "letter_grade": report.letter_grade,
            "tier_recommendation": report.tier_recommendation,
            "summary_stats": report.summary_stats,
        }

        dimensions_payload = {}
        for key, dim in report.dimensions.items():
            dimensions_payload[key] = {
                "name": dim.name,
                "weight": dim.weight,
                "score": round(dim.score, 1),
                "description": dim.description,
                "details": dim.details,
                "suggestions": dim.suggestions,
            }
        payload["dimensions"] = dimensions_payload
        payload["improvement_roadmap"] = report.improvement_roadmap

        return json.dumps(payload, indent=2)

    @staticmethod
    def format_human_readable(report: QualityReport, detailed: bool = False) -> str:
        """Render *report* as a multi-line text summary.

        The output has a header, one line per dimension (followed by its
        component breakdown when *detailed* is true), the summary statistics
        if present, and at most the first five roadmap items.
        """
        banner = "=" * 70
        lines = [
            banner,
            "SKILL QUALITY ASSESSMENT REPORT",
            banner,
            f"Skill: {report.skill_path}",
            f"Timestamp: {report.timestamp}",
            f"Overall Score: {report.overall_score:.1f}/100 ({report.letter_grade})",
            f"Recommended Tier: {report.tier_recommendation}",
            "",
            "QUALITY DIMENSIONS:",
        ]

        # One entry per dimension, each followed by a blank separator line.
        for name, dimension in report.dimensions.items():
            lines.append(f"  {name}: {dimension.score:.1f}/100 ({dimension.weight * 100:.0f}% weight)")
            if detailed and dimension.details:
                lines.extend(
                    f"    • {component}: {info['score']:.1f}/{info['max_score']} - {info['details']}"
                    for component, info in dimension.details.items()
                )
            lines.append("")

        stats = report.summary_stats
        if stats:
            lines += [
                "SUMMARY STATISTICS:",
                f"  Highest Dimension: {stats['highest_dimension']}",
                f"  Lowest Dimension: {stats['lowest_dimension']}",
                f"  Dimensions Above 70%: {stats['dimensions_above_70']}",
                f"  Dimensions Below 50%: {stats['dimensions_below_50']}",
                "",
            ]

        roadmap = report.improvement_roadmap
        if roadmap:
            lines.append("IMPROVEMENT ROADMAP:")
            # Only the first five items are shown.
            for position, item in enumerate(roadmap[:5], start=1):
                priority = item["priority"]
                if priority == "HIGH":
                    marker = "🔴"
                elif priority == "MEDIUM":
                    marker = "🟡"
                else:
                    marker = "🟢"
                lines.append(f"  {position}. {marker} [{item['dimension']}] {item['suggestion']}")
            lines.append("")

        return "\n".join(lines)


def main():
    """Command-line entry point: assess a skill and print its quality report.

    Parses the command line, runs ``QualityScorer`` on the given skill
    directory and prints the report as text or, with ``--json``, as JSON.

    Exit status:
        0   - letter grade between A+ and C-
        2   - letter grade D
        1   - any other grade (F), a score below ``--minimum-score``, or a
              failed assessment
        130 - interrupted by the user (Ctrl-C)
    """
    parser = argparse.ArgumentParser(
        description="Score skill quality across multiple dimensions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Keep this text in sync with the dimensions actually scored: since
        # v2.0.0 there are five dimensions weighted 20% each.
        epilog="""
Examples:
  python quality_scorer.py engineering/my-skill
  python quality_scorer.py engineering/my-skill --detailed --json
  python quality_scorer.py engineering/my-skill --minimum-score 75

Quality Dimensions (each 20%):
  Documentation - SKILL.md quality, README, references, examples
  Code Quality   - Script complexity, error handling, structure, output
  Completeness   - Directory structure, assets, expected outputs, tests
  Security       - Sensitive data exposure, file operations, command injection, input validation
  Usability      - Installation simplicity, usage clarity, help accessibility

Letter Grades: A+ (95+), A (90+), A- (85+), B+ (80+), B (75+), B- (70+), C+ (65+), C (60+), C- (55+), D (50+), F (<50)
        """
    )

    parser.add_argument("skill_path",
                       help="Path to the skill directory to assess")
    parser.add_argument("--detailed",
                       action="store_true",
                       help="Show detailed component scores")
    parser.add_argument("--minimum-score",
                       type=float,
                       default=0,
                       help="Minimum acceptable score (exit with error if below)")
    parser.add_argument("--json",
                       action="store_true",
                       help="Output results in JSON format")
    parser.add_argument("--verbose",
                       action="store_true",
                       help="Enable verbose logging")

    args = parser.parse_args()

    try:
        # Create scorer and assess quality
        scorer = QualityScorer(args.skill_path, args.detailed, args.verbose)
        report = scorer.assess_quality()

        # Format and output report
        if args.json:
            print(QualityReportFormatter.format_json(report))
        else:
            print(QualityReportFormatter.format_human_readable(report, args.detailed))

        # The minimum-score gate takes precedence over the grade-based status.
        if report.overall_score < args.minimum_score:
            print(f"\nERROR: Quality score {report.overall_score:.1f} is below minimum {args.minimum_score}", file=sys.stderr)
            sys.exit(1)

        # Map the letter grade to an exit status. sys.exit raises SystemExit,
        # which is not an Exception subclass, so the handler below lets it
        # through.
        passing_grades = ("A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-")
        if report.letter_grade in passing_grades:
            sys.exit(0)  # Excellent, good or acceptable
        elif report.letter_grade == "D":
            sys.exit(2)  # Needs improvement
        else:  # F
            sys.exit(1)  # Poor quality

    except KeyboardInterrupt:
        print("\nQuality assessment interrupted by user", file=sys.stderr)
        sys.exit(130)
    except Exception as e:
        print(f"Quality assessment failed: {str(e)}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()