- Fix Python 3.10+ syntax (float | None → Optional[float]) in 2 scripts - Add argparse CLI handling to 9 marketing scripts using raw sys.argv - Fix 10 scripts crashing at module level (wrap in __main__, add argparse) - Make yaml/prefect/mcp imports conditional with stdlib fallbacks (4 scripts) - Fix f-string backslash syntax in project_bootstrapper.py - Fix -h flag conflict in pr_analyzer.py - Fix tech-debt.md description (score → prioritize) All 237 scripts now pass python3 --help verification. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1088 lines
44 KiB
Python
1088 lines
44 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Quality Scorer - Scores skills across multiple quality dimensions
|
|
|
|
This script provides comprehensive quality assessment for skills in the claude-skills
|
|
ecosystem by evaluating documentation, code quality, completeness, and usability.
|
|
Generates letter grades, tier recommendations, and improvement roadmaps.
|
|
|
|
Usage:
|
|
python quality_scorer.py <skill_path> [--detailed] [--minimum-score SCORE] [--json]
|
|
|
|
Author: Claude Skills Engineering Team
|
|
Version: 1.0.0
|
|
Dependencies: Python Standard Library Only
|
|
"""
|
|
|
|
import argparse
import ast
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
|
|
try:
    import yaml
except ImportError:
    # PyYAML is unavailable: fall back to a minimal stand-in that can parse
    # the flat "key: value" frontmatter this tool actually needs.
    class _YamlStub:
        class YAMLError(Exception):
            """Raised nowhere by the stub; exists so callers can catch it."""
            pass

        @staticmethod
        def safe_load(text):
            """Parse flat ``key: value`` lines into a dict (None if empty)."""
            pairs = (line.partition(':') for line in text.strip().splitlines()
                     if ':' in line)
            parsed = {key.strip(): value.strip() for key, _, value in pairs}
            # Mirror PyYAML's behavior of returning None for empty documents.
            return parsed or None

    yaml = _YamlStub()
|
|
|
|
|
|
class QualityDimension:
    """One weighted quality dimension built up from component scores.

    Components are recorded via :meth:`add_score`; the dimension-level
    0-100 score is derived on demand by :meth:`calculate_final_score`.
    """

    def __init__(self, name: str, weight: float, description: str):
        self.name = name
        self.weight = weight              # fraction of the overall score
        self.description = description
        self.score = 0.0                  # final 0-100 score for this dimension
        self.max_score = 100.0
        self.details = {}                 # component name -> score breakdown
        self.suggestions = []             # improvement hints gathered while scoring

    def add_score(self, component: str, score: float, max_score: float, details: str = ""):
        """Record one component's earned score out of its maximum."""
        pct = score / max_score * 100 if max_score > 0 else 0
        self.details[component] = {
            "score": score,
            "max_score": max_score,
            "percentage": pct,
            "details": details,
        }

    def calculate_final_score(self):
        """Collapse all recorded components into a single 0-100 score."""
        if not self.details:
            self.score = 0.0
            return
        earned = sum(entry["score"] for entry in self.details.values())
        possible = sum(entry["max_score"] for entry in self.details.values())
        self.score = earned / possible * 100 if possible > 0 else 0.0

    def add_suggestion(self, suggestion: str):
        """Queue an improvement suggestion for the final report."""
        self.suggestions.append(suggestion)
|
|
|
|
|
|
class QualityReport:
    """Container for quality assessment results.

    Aggregates per-dimension scores into an overall weighted score, a
    letter grade, a tier recommendation, a prioritized improvement
    roadmap, and summary statistics.
    """

    # Score thresholds mapped to letter grades, checked highest-first.
    # Anything below the last band falls through to "F".
    _GRADE_BANDS = (
        (95, "A+"), (90, "A"), (85, "A-"),
        (80, "B+"), (75, "B"), (70, "B-"),
        (65, "C+"), (60, "C"), (55, "C-"),
        (50, "D"),
    )

    def __init__(self, skill_path: str):
        self.skill_path = skill_path
        # datetime.utcnow() is deprecated (Python 3.12+); use an aware UTC
        # "now" and render it in the same naive "...Z" format as before.
        self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        self.dimensions = {}              # dimension name -> QualityDimension
        self.overall_score = 0.0          # weighted 0-100 score
        self.letter_grade = "F"
        self.tier_recommendation = "BASIC"
        self.improvement_roadmap = []     # prioritized suggestion entries
        self.summary_stats = {}

    def add_dimension(self, dimension: "QualityDimension"):
        """Register a scored dimension under its name."""
        self.dimensions[dimension.name] = dimension

    def calculate_overall_score(self):
        """Calculate the overall weighted score and all derived metrics.

        No-op when no dimensions have been added (grade stays "F",
        tier stays "BASIC").
        """
        if not self.dimensions:
            return

        total_weight = sum(dim.weight for dim in self.dimensions.values())
        weighted_sum = sum(dim.score * dim.weight for dim in self.dimensions.values())
        self.overall_score = weighted_sum / total_weight if total_weight > 0 else 0.0

        # Letter grade: first band whose threshold the score meets.
        self.letter_grade = "F"
        for threshold, grade in self._GRADE_BANDS:
            if self.overall_score >= threshold:
                self.letter_grade = grade
                break

        # Derived outputs.
        self._calculate_tier_recommendation()
        self._generate_improvement_roadmap()
        self._calculate_summary_stats()

    def _dim_score(self, name: str) -> float:
        """Return a dimension's score, or 0.0 if that dimension is absent."""
        dimension = self.dimensions.get(name)
        return dimension.score if dimension is not None else 0.0

    def _calculate_tier_recommendation(self):
        """Recommend a tier from the overall score plus per-dimension floors."""
        scores = [self._dim_score(name) for name in
                  ("Documentation", "Code Quality", "Completeness", "Usability")]

        # POWERFUL: strong overall AND every dimension is strong.
        if self.overall_score >= 80 and all(score >= 75 for score in scores):
            self.tier_recommendation = "POWERFUL"
        # STANDARD: good overall AND at least 3 of 4 dimensions are decent.
        elif self.overall_score >= 70 and sum(1 for score in scores if score >= 65) >= 3:
            self.tier_recommendation = "STANDARD"
        # BASIC: everything else (minimum viable quality).
        else:
            self.tier_recommendation = "BASIC"

    def _generate_improvement_roadmap(self):
        """Collect every dimension's suggestions, prioritized, capped at 10."""
        priority_order = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
        all_suggestions = []

        for dim_name, dimension in self.dimensions.items():
            # Weaker dimensions produce higher-priority suggestions.
            if dimension.score < 60:
                priority = "HIGH"
            elif dimension.score < 75:
                priority = "MEDIUM"
            else:
                priority = "LOW"
            for suggestion in dimension.suggestions:
                all_suggestions.append({
                    "priority": priority,
                    "dimension": dim_name,
                    "suggestion": suggestion,
                    "current_score": dimension.score,
                })

        # HIGH before MEDIUM before LOW; within a priority, lowest score first.
        all_suggestions.sort(key=lambda s: (priority_order[s["priority"]], s["current_score"]))
        self.improvement_roadmap = all_suggestions[:10]

    def _calculate_summary_stats(self):
        """Calculate aggregate statistics across all dimension scores."""
        scores = [dim.score for dim in self.dimensions.values()]

        if not scores:
            self.summary_stats = {
                "highest_dimension": "None",
                "lowest_dimension": "None",
                "score_variance": 0,
                "dimensions_above_70": 0,
                "dimensions_below_50": 0,
            }
            return

        by_score = lambda item: item[1].score
        # NOTE: variance is measured around the *weighted* overall score,
        # not the plain mean of dimension scores — kept for compatibility.
        self.summary_stats = {
            "highest_dimension": max(self.dimensions.items(), key=by_score)[0],
            "lowest_dimension": min(self.dimensions.items(), key=by_score)[0],
            "score_variance": sum((score - self.overall_score) ** 2 for score in scores) / len(scores),
            "dimensions_above_70": sum(1 for score in scores if score >= 70),
            "dimensions_below_50": sum(1 for score in scores if score < 50),
        }
|
|
|
|
|
|
class QualityScorer:
    """Main quality scoring engine.

    Walks a skill directory on disk and scores four weighted dimensions
    (Documentation, Code Quality, Completeness, Usability — 25% each),
    accumulating the results into a QualityReport. Each dimension is
    built from components that are each worth up to 25 points.
    """

    def __init__(self, skill_path: str, detailed: bool = False, verbose: bool = False):
        # Resolve to an absolute path so the report unambiguously names
        # the skill that was assessed.
        self.skill_path = Path(skill_path).resolve()
        self.detailed = detailed
        self.verbose = verbose
        self.report = QualityReport(str(self.skill_path))

    def log_verbose(self, message: str):
        """Log verbose message if verbose mode enabled"""
        if self.verbose:
            # stderr so verbose chatter never pollutes JSON output on stdout.
            print(f"[VERBOSE] {message}", file=sys.stderr)

    def assess_quality(self) -> QualityReport:
        """Main quality assessment entry point.

        Scores all four dimensions, rolls them up into the overall score,
        and returns the populated QualityReport. Re-raises any failure
        after logging it to stderr.
        """
        try:
            self.log_verbose(f"Starting quality assessment for {self.skill_path}")

            # Check if skill path exists
            if not self.skill_path.exists():
                raise ValueError(f"Skill path does not exist: {self.skill_path}")

            # Score each dimension
            self._score_documentation()
            self._score_code_quality()
            self._score_completeness()
            self._score_usability()

            # Calculate overall metrics
            self.report.calculate_overall_score()

            self.log_verbose(f"Quality assessment completed. Overall score: {self.report.overall_score:.1f}")

        except Exception as e:
            print(f"Quality assessment failed: {str(e)}", file=sys.stderr)
            raise

        return self.report

    def _score_documentation(self):
        """Score documentation quality (25% weight).

        Four components, 25 points each: SKILL.md, README.md,
        reference docs, and example availability.
        """
        self.log_verbose("Scoring documentation quality...")

        dimension = QualityDimension("Documentation", 0.25, "Quality of documentation and written materials")

        # Score SKILL.md
        self._score_skill_md(dimension)

        # Score README.md
        self._score_readme(dimension)

        # Score reference documentation
        self._score_references(dimension)

        # Score examples and usage clarity
        self._score_examples(dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_skill_md(self, dimension: QualityDimension):
        """Score SKILL.md quality: length, frontmatter, sections, depth."""
        skill_md_path = self.skill_path / "SKILL.md"

        if not skill_md_path.exists():
            dimension.add_score("skill_md_existence", 0, 25, "SKILL.md does not exist")
            dimension.add_suggestion("Create comprehensive SKILL.md file")
            return

        try:
            content = skill_md_path.read_text(encoding='utf-8')
            # Non-empty lines only, so whitespace padding doesn't inflate length.
            lines = [line for line in content.split('\n') if line.strip()]

            # Score based on length and depth
            line_count = len(lines)
            if line_count >= 400:
                length_score = 25
            elif line_count >= 300:
                length_score = 20
            elif line_count >= 200:
                length_score = 15
            elif line_count >= 100:
                length_score = 10
            else:
                length_score = 5

            dimension.add_score("skill_md_length", length_score, 25,
                                f"SKILL.md has {line_count} lines")

            if line_count < 300:
                dimension.add_suggestion("Expand SKILL.md with more detailed sections")

            # Score frontmatter quality
            frontmatter_score = self._score_frontmatter(content)
            dimension.add_score("skill_md_frontmatter", frontmatter_score, 25,
                                "Frontmatter completeness and accuracy")

            # Score section completeness
            section_score = self._score_sections(content)
            dimension.add_score("skill_md_sections", section_score, 25,
                                "Required and recommended section coverage")

            # Score content depth
            depth_score = self._score_content_depth(content)
            dimension.add_score("skill_md_depth", depth_score, 25,
                                "Content depth and technical detail")

        except Exception as e:
            dimension.add_score("skill_md_readable", 0, 25, f"Error reading SKILL.md: {str(e)}")
            dimension.add_suggestion("Fix SKILL.md file encoding or format issues")

    def _score_frontmatter(self, content: str) -> float:
        """Score SKILL.md frontmatter quality (0-25 points).

        Budget: 15 for required fields, 5 for recommended fields,
        5 bonus for substantive field values.
        """
        required_fields = ["Name", "Tier", "Category", "Dependencies", "Author", "Version"]
        recommended_fields = ["Last Updated", "Description"]

        try:
            if not content.startswith('---'):
                return 5  # Partial credit for having some structure

            # Find the closing '---' of the YAML frontmatter block.
            end_marker = content.find('---', 3)
            if end_marker == -1:
                return 5

            frontmatter_text = content[3:end_marker].strip()
            frontmatter = yaml.safe_load(frontmatter_text)

            if not isinstance(frontmatter, dict):
                return 5

            score = 0

            # Required fields (15 points)
            present_required = sum(1 for field in required_fields if field in frontmatter)
            score += (present_required / len(required_fields)) * 15

            # Recommended fields (5 points)
            present_recommended = sum(1 for field in recommended_fields if field in frontmatter)
            score += (present_recommended / len(recommended_fields)) * 5

            # Quality of field values (5 points)
            quality_bonus = 0
            for field, value in frontmatter.items():
                if isinstance(value, str) and len(value.strip()) > 3:
                    quality_bonus += 0.5

            score += min(quality_bonus, 5)

            return min(score, 25)

        except yaml.YAMLError:
            return 5  # Some credit for attempting frontmatter

    def _score_sections(self, content: str) -> float:
        """Score markdown section completeness (0-25 points)."""
        required_sections = ["Description", "Features", "Usage", "Examples"]
        recommended_sections = ["Architecture", "Installation", "Troubleshooting", "Contributing"]

        score = 0

        # Required sections (15 points) — matched as whole heading lines
        # at any heading level, case-insensitively.
        present_required = 0
        for section in required_sections:
            if re.search(rf'^#+\s*{re.escape(section)}\s*$', content, re.MULTILINE | re.IGNORECASE):
                present_required += 1

        score += (present_required / len(required_sections)) * 15

        # Recommended sections (10 points)
        present_recommended = 0
        for section in recommended_sections:
            if re.search(rf'^#+\s*{re.escape(section)}\s*$', content, re.MULTILINE | re.IGNORECASE):
                present_recommended += 1

        score += (present_recommended / len(recommended_sections)) * 10

        return score

    def _score_content_depth(self, content: str) -> float:
        """Score content depth and technical detail (0-25 points)."""
        score = 0

        # Code examples (8 points) — fenced code blocks, 2 points each.
        code_blocks = len(re.findall(r'```[\w]*\n.*?\n```', content, re.DOTALL))
        score += min(code_blocks * 2, 8)

        # Technical depth indicators (8 points) — keyword presence only,
        # 0.8 points per distinct keyword found.
        depth_indicators = ['API', 'algorithm', 'architecture', 'implementation', 'performance',
                            'scalability', 'security', 'integration', 'configuration', 'parameters']
        depth_score = sum(1 for indicator in depth_indicators if indicator.lower() in content.lower())
        score += min(depth_score * 0.8, 8)

        # Usage examples (9 points) — every occurrence counts, 1.5 each.
        example_patterns = [r'Example:', r'Usage:', r'```bash', r'```python', r'```yaml']
        example_count = sum(len(re.findall(pattern, content, re.IGNORECASE)) for pattern in example_patterns)
        score += min(example_count * 1.5, 9)

        return score

    def _score_readme(self, dimension: QualityDimension):
        """Score README.md quality (length-based, 0-25 points)."""
        readme_path = self.skill_path / "README.md"

        if not readme_path.exists():
            # NOTE(review): the detail text says "exists" but this branch
            # runs when README.md is MISSING — message looks inverted.
            dimension.add_score("readme_existence", 10, 25, "README.md exists (partial credit)")
            dimension.add_suggestion("Create README.md with usage instructions")
            return

        try:
            content = readme_path.read_text(encoding='utf-8')

            # Length and substance
            if len(content.strip()) >= 1000:
                length_score = 25
            elif len(content.strip()) >= 500:
                length_score = 20
            elif len(content.strip()) >= 200:
                length_score = 15
            else:
                length_score = 10

            dimension.add_score("readme_quality", length_score, 25,
                                f"README.md content quality ({len(content)} characters)")

            if len(content.strip()) < 500:
                dimension.add_suggestion("Expand README.md with more detailed usage examples")

        except Exception:
            dimension.add_score("readme_readable", 5, 25, "README.md exists but has issues")

    def _score_references(self, dimension: QualityDimension):
        """Score reference documentation quality (0-25 points)."""
        references_dir = self.skill_path / "references"

        if not references_dir.exists():
            dimension.add_score("references_existence", 0, 25, "No references directory")
            dimension.add_suggestion("Add references directory with documentation")
            return

        ref_files = list(references_dir.glob("*.md")) + list(references_dir.glob("*.txt"))

        if not ref_files:
            dimension.add_score("references_content", 5, 25, "References directory empty")
            dimension.add_suggestion("Add reference documentation files")
            return

        # Score based on number and quality of reference files
        score = min(len(ref_files) * 5, 20)  # Up to 20 points for multiple files

        # Bonus for substantial content
        total_content = 0
        for ref_file in ref_files:
            try:
                content = ref_file.read_text(encoding='utf-8')
                total_content += len(content.strip())
            except:
                # NOTE(review): bare except silently skips unreadable files.
                continue

        if total_content >= 2000:
            score += 5  # Bonus for substantial reference content

        dimension.add_score("references_quality", score, 25,
                            f"References: {len(ref_files)} files, {total_content} chars")

    def _score_examples(self, dimension: QualityDimension):
        """Score examples and usage clarity (0-25 points)."""
        score = 0

        # Look for example files in various locations
        example_locations = ["examples", "assets", "scripts"]
        example_files = []

        for location in example_locations:
            location_path = self.skill_path / location
            if location_path.exists():
                # A file matching several patterns may be counted more than once.
                example_files.extend(location_path.glob("*example*"))
                example_files.extend(location_path.glob("*sample*"))
                example_files.extend(location_path.glob("*demo*"))

        # Score based on example availability
        if len(example_files) >= 3:
            score = 25
        elif len(example_files) >= 2:
            score = 20
        elif len(example_files) >= 1:
            score = 15
        else:
            score = 10
            dimension.add_suggestion("Add more usage examples and sample files")

        dimension.add_score("examples_availability", score, 25,
                            f"Found {len(example_files)} example/sample files")

    def _score_code_quality(self):
        """Score code quality (25% weight).

        Four components over the scripts/ directory: complexity,
        error handling, structure, and output-format support.
        """
        self.log_verbose("Scoring code quality...")

        dimension = QualityDimension("Code Quality", 0.25, "Quality of Python scripts and implementation")

        scripts_dir = self.skill_path / "scripts"
        if not scripts_dir.exists():
            # Single zero-out-of-100 component so the dimension scores 0.
            dimension.add_score("scripts_existence", 0, 100, "No scripts directory")
            dimension.add_suggestion("Create scripts directory with Python files")
            dimension.calculate_final_score()
            self.report.add_dimension(dimension)
            return

        python_files = list(scripts_dir.glob("*.py"))
        if not python_files:
            dimension.add_score("python_scripts", 0, 100, "No Python scripts found")
            dimension.add_suggestion("Add Python scripts to scripts directory")
            dimension.calculate_final_score()
            self.report.add_dimension(dimension)
            return

        # Score script complexity and quality
        self._score_script_complexity(python_files, dimension)

        # Score error handling
        self._score_error_handling(python_files, dimension)

        # Score code structure
        self._score_code_structure(python_files, dimension)

        # Score output format support
        self._score_output_support(python_files, dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_script_complexity(self, python_files: List[Path], dimension: QualityDimension):
        """Score script complexity (0-25): average per-script LOC band."""
        total_complexity = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')

                # Count lines of code (excluding empty lines and comments)
                lines = content.split('\n')
                loc = len([line for line in lines if line.strip() and not line.strip().startswith('#')])

                # Score based on LOC
                if loc >= 800:
                    complexity_score = 25
                elif loc >= 500:
                    complexity_score = 20
                elif loc >= 300:
                    complexity_score = 15
                elif loc >= 100:
                    complexity_score = 10
                else:
                    complexity_score = 5

                total_complexity += complexity_score

            except Exception:
                # Unreadable scripts contribute 0 to the average's numerator
                # but still count in the denominator.
                continue

        avg_complexity = total_complexity / script_count if script_count > 0 else 0
        dimension.add_score("script_complexity", avg_complexity, 25,
                            f"Average script complexity across {script_count} scripts")

        if avg_complexity < 15:
            dimension.add_suggestion("Consider expanding scripts with more functionality")

    def _score_error_handling(self, python_files: List[Path], dimension: QualityDimension):
        """Score error handling (0-25): text heuristics over each script."""
        total_error_score = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')
                error_score = 0

                # Check for try/except blocks
                try_count = content.count('try:')
                error_score += min(try_count * 5, 15)  # Up to 15 points for try/except

                # Check for specific exception handling
                exception_types = ['Exception', 'ValueError', 'FileNotFoundError', 'KeyError', 'TypeError']
                for exc_type in exception_types:
                    if exc_type in content:
                        error_score += 2  # 2 points per specific exception type

                # Check for logging or error reporting
                if any(indicator in content for indicator in ['print(', 'logging.', 'sys.stderr']):
                    error_score += 5  # 5 points for error reporting

                total_error_score += min(error_score, 25)  # Cap at 25 per script

            except Exception:
                continue

        avg_error_score = total_error_score / script_count if script_count > 0 else 0
        dimension.add_score("error_handling", avg_error_score, 25,
                            f"Error handling quality across {script_count} scripts")

        if avg_error_score < 15:
            dimension.add_suggestion("Improve error handling with try/except blocks and meaningful error messages")

    def _score_code_structure(self, python_files: List[Path], dimension: QualityDimension):
        """Score code structure (0-25): functions, classes, docstrings, main guard."""
        total_structure_score = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')
                structure_score = 0

                # Check for functions and classes (substring counts, so
                # occurrences in strings/comments also count).
                function_count = content.count('def ')
                class_count = content.count('class ')

                structure_score += min(function_count * 2, 10)  # Up to 10 points for functions
                structure_score += min(class_count * 3, 9)  # Up to 9 points for classes

                # Check for docstrings
                docstring_patterns = ['"""', "'''", 'def.*:\n.*"""', 'class.*:\n.*"""']
                for pattern in docstring_patterns:
                    if re.search(pattern, content):
                        structure_score += 1  # 1 point per docstring indicator

                # Check for if __name__ == "__main__"
                if 'if __name__ == "__main__"' in content:
                    structure_score += 3

                # Check for imports organization
                if content.lstrip().startswith(('import ', 'from ')):
                    structure_score += 2  # Imports at top

                total_structure_score += min(structure_score, 25)

            except Exception:
                continue

        avg_structure_score = total_structure_score / script_count if script_count > 0 else 0
        dimension.add_score("code_structure", avg_structure_score, 25,
                            f"Code structure quality across {script_count} scripts")

        if avg_structure_score < 15:
            dimension.add_suggestion("Improve code structure with more functions, classes, and documentation")

    def _score_output_support(self, python_files: List[Path], dimension: QualityDimension):
        """Score output format support (0-25): JSON, formatted text, --help."""
        total_output_score = 0
        script_count = len(python_files)

        for script_path in python_files:
            try:
                content = script_path.read_text(encoding='utf-8')
                output_score = 0

                # Check for JSON support
                if any(indicator in content for indicator in ['json.dump', 'json.load', '--json']):
                    output_score += 12  # JSON support

                # Check for formatted output
                if any(indicator in content for indicator in ['print(f"', 'print("', '.format(', 'f"']):
                    output_score += 8  # Human-readable output

                # Check for argparse help
                if '--help' in content or 'add_help=' in content:
                    output_score += 5  # Help functionality

                total_output_score += min(output_score, 25)

            except Exception:
                continue

        avg_output_score = total_output_score / script_count if script_count > 0 else 0
        dimension.add_score("output_support", avg_output_score, 25,
                            f"Output format support across {script_count} scripts")

        if avg_output_score < 15:
            dimension.add_suggestion("Add support for both JSON and human-readable output formats")

    def _score_completeness(self):
        """Score completeness (25% weight).

        Four components: directory structure, assets, expected outputs,
        and test-coverage indicators.
        """
        self.log_verbose("Scoring completeness...")

        dimension = QualityDimension("Completeness", 0.25, "Completeness of required components and assets")

        # Score directory structure
        self._score_directory_structure(dimension)

        # Score asset availability
        self._score_assets(dimension)

        # Score expected outputs
        self._score_expected_outputs(dimension)

        # Score test coverage
        self._score_test_coverage(dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_directory_structure(self, dimension: QualityDimension):
        """Score directory structure completeness (0-25 points)."""
        required_dirs = ["scripts"]
        recommended_dirs = ["assets", "references", "expected_outputs"]

        score = 0

        # Required directories (15 points)
        for dir_name in required_dirs:
            if (self.skill_path / dir_name).exists():
                score += 15 / len(required_dirs)

        # Recommended directories (10 points)
        present_recommended = 0
        for dir_name in recommended_dirs:
            if (self.skill_path / dir_name).exists():
                present_recommended += 1

        score += (present_recommended / len(recommended_dirs)) * 10

        dimension.add_score("directory_structure", score, 25,
                            f"Directory structure completeness")

        missing_recommended = [d for d in recommended_dirs if not (self.skill_path / d).exists()]
        if missing_recommended:
            dimension.add_suggestion(f"Add recommended directories: {', '.join(missing_recommended)}")

    def _score_assets(self, dimension: QualityDimension):
        """Score asset availability and diversity (0-25 points)."""
        assets_dir = self.skill_path / "assets"

        if not assets_dir.exists():
            dimension.add_score("assets_existence", 5, 25, "Assets directory missing")
            dimension.add_suggestion("Create assets directory with sample data")
            return

        # Recursive: counts files in nested subdirectories too.
        asset_files = [f for f in assets_dir.rglob("*") if f.is_file()]

        if not asset_files:
            dimension.add_score("assets_content", 10, 25, "Assets directory empty")
            dimension.add_suggestion("Add sample data files to assets directory")
            return

        # Score based on number and diversity of assets
        score = min(len(asset_files) * 3, 20)  # Up to 20 points for multiple assets

        # Bonus for diverse file types
        extensions = set(f.suffix.lower() for f in asset_files if f.suffix)
        if len(extensions) >= 3:
            score += 5  # Bonus for file type diversity

        dimension.add_score("assets_quality", score, 25,
                            f"Assets: {len(asset_files)} files, {len(extensions)} types")

    def _score_expected_outputs(self, dimension: QualityDimension):
        """Score expected-output fixtures availability (0-25 points)."""
        expected_dir = self.skill_path / "expected_outputs"

        if not expected_dir.exists():
            dimension.add_score("expected_outputs", 10, 25, "Expected outputs directory missing")
            dimension.add_suggestion("Add expected_outputs directory with sample results")
            return

        output_files = [f for f in expected_dir.rglob("*") if f.is_file()]

        if len(output_files) >= 3:
            score = 25
        elif len(output_files) >= 2:
            score = 20
        elif len(output_files) >= 1:
            score = 15
        else:
            score = 10
            dimension.add_suggestion("Add expected output files for testing")

        dimension.add_score("expected_outputs", score, 25,
                            f"Expected outputs: {len(output_files)} files")

    def _score_test_coverage(self, dimension: QualityDimension):
        """Score test coverage indicators (0-25 points, name-based only)."""
        # This is a simplified scoring - in a more sophisticated system,
        # this would integrate with actual test runners

        score = 15  # Base score for having a structure

        # Check for test-related files — any path anywhere in the skill
        # whose name contains one of these substrings.
        test_indicators = ["test", "spec", "check"]
        test_files = []

        for indicator in test_indicators:
            test_files.extend(self.skill_path.rglob(f"*{indicator}*"))

        if test_files:
            score += 10  # Bonus for test files

        dimension.add_score("test_coverage", score, 25,
                            f"Test coverage indicators: {len(test_files)} files")

        if not test_files:
            dimension.add_suggestion("Add test files or validation scripts")

    def _score_usability(self):
        """Score usability (25% weight).

        Four components: installation simplicity, usage clarity,
        help accessibility, and practical examples.
        """
        self.log_verbose("Scoring usability...")

        dimension = QualityDimension("Usability", 0.25, "Ease of use and user experience")

        # Score installation simplicity
        self._score_installation(dimension)

        # Score usage clarity
        self._score_usage_clarity(dimension)

        # Score help and documentation accessibility
        self._score_help_accessibility(dimension)

        # Score practical examples
        self._score_practical_examples(dimension)

        dimension.calculate_final_score()
        self.report.add_dimension(dimension)

    def _score_installation(self, dimension: QualityDimension):
        """Score installation simplicity (15-25 points; floor of 15)."""
        # Check for installation complexity indicators
        score = 25  # Start with full points for standard library only approach

        # Check for requirements.txt or setup.py (would reduce score)
        if (self.skill_path / "requirements.txt").exists():
            score -= 5  # Minor penalty for external dependencies
            dimension.add_suggestion("Consider removing external dependencies for easier installation")

        if (self.skill_path / "setup.py").exists():
            score -= 3  # Minor penalty for complex setup

        dimension.add_score("installation_simplicity", max(score, 15), 25,
                            "Installation complexity assessment")

    def _score_usage_clarity(self, dimension: QualityDimension):
        """Score usage clarity (0-25): README keywords + argparse help text."""
        score = 0

        # Check README for usage instructions
        readme_path = self.skill_path / "README.md"
        if readme_path.exists():
            try:
                content = readme_path.read_text(encoding='utf-8').lower()
                if 'usage' in content or 'how to' in content:
                    score += 10
                if 'example' in content:
                    score += 5
            except:
                # NOTE(review): bare except — unreadable README scores 0 here.
                pass

        # Check scripts for help text quality
        scripts_dir = self.skill_path / "scripts"
        if scripts_dir.exists():
            python_files = list(scripts_dir.glob("*.py"))
            help_quality = 0

            for script_path in python_files:
                try:
                    content = script_path.read_text(encoding='utf-8')
                    if 'argparse' in content and 'help=' in content:
                        help_quality += 2
                except:
                    continue

            score += min(help_quality, 10)  # Up to 10 points for help text

        dimension.add_score("usage_clarity", score, 25, "Usage instructions and help quality")

        if score < 15:
            dimension.add_suggestion("Improve usage documentation and help text")

    def _score_help_accessibility(self, dimension: QualityDimension):
        """Score help/documentation accessibility (0-25, capped)."""
        score = 0

        # Check for comprehensive help in scripts
        scripts_dir = self.skill_path / "scripts"
        if scripts_dir.exists():
            python_files = list(scripts_dir.glob("*.py"))

            for script_path in python_files:
                try:
                    content = script_path.read_text(encoding='utf-8')

                    # Check for detailed help text
                    if 'epilog=' in content or 'description=' in content:
                        score += 5  # Detailed help

                    # Check for examples in help
                    if 'examples:' in content.lower() or 'example:' in content.lower():
                        score += 3  # Examples in help

                except:
                    continue

        # Check for documentation files (top-level .md files only)
        doc_files = list(self.skill_path.glob("*.md"))
        if len(doc_files) >= 2:
            score += 5  # Multiple documentation files

        dimension.add_score("help_accessibility", min(score, 25), 25,
                            "Help and documentation accessibility")

        if score < 15:
            dimension.add_suggestion("Add more comprehensive help text and documentation")

    def _score_practical_examples(self, dimension: QualityDimension):
        """Score practical examples availability (5-25 points)."""
        score = 0

        # Look for example files anywhere under the skill directory.
        example_patterns = ["*example*", "*sample*", "*demo*", "*tutorial*"]
        example_files = []

        for pattern in example_patterns:
            example_files.extend(self.skill_path.rglob(pattern))

        # Score based on example availability and quality
        if len(example_files) >= 5:
            score = 25
        elif len(example_files) >= 3:
            score = 20
        elif len(example_files) >= 2:
            score = 15
        elif len(example_files) >= 1:
            score = 10
        else:
            score = 5
            dimension.add_suggestion("Add more practical examples and sample files")

        dimension.add_score("practical_examples", score, 25,
                            f"Practical examples: {len(example_files)} files")
|
|
|
|
|
|
class QualityReportFormatter:
    """Formats quality reports for output"""

    @staticmethod
    def format_json(report: QualityReport) -> str:
        """Serialize *report* to a pretty-printed JSON document."""
        # Flatten each dimension into a plain JSON-friendly mapping.
        dimension_payload = {}
        for key, dim in report.dimensions.items():
            dimension_payload[key] = {
                "name": dim.name,
                "weight": dim.weight,
                "score": round(dim.score, 1),
                "description": dim.description,
                "details": dim.details,
                "suggestions": dim.suggestions,
            }

        payload = {
            "skill_path": report.skill_path,
            "timestamp": report.timestamp,
            "overall_score": round(report.overall_score, 1),
            "letter_grade": report.letter_grade,
            "tier_recommendation": report.tier_recommendation,
            "summary_stats": report.summary_stats,
            "dimensions": dimension_payload,
            "improvement_roadmap": report.improvement_roadmap,
        }
        return json.dumps(payload, indent=2)

    @staticmethod
    def format_human_readable(report: QualityReport, detailed: bool = False) -> str:
        """Render *report* as a plain-text summary.

        When *detailed* is true, each dimension also lists its
        per-component scores.
        """
        out = []
        banner = "=" * 70

        # Header block with the headline numbers.
        out.append(banner)
        out.append("SKILL QUALITY ASSESSMENT REPORT")
        out.append(banner)
        out.append(f"Skill: {report.skill_path}")
        out.append(f"Timestamp: {report.timestamp}")
        out.append(f"Overall Score: {report.overall_score:.1f}/100 ({report.letter_grade})")
        out.append(f"Recommended Tier: {report.tier_recommendation}")
        out.append("")

        # Dimension scores
        out.append("QUALITY DIMENSIONS:")
        for dim_name, dim in report.dimensions.items():
            out.append(f"  {dim_name}: {dim.score:.1f}/100 ({dim.weight * 100:.0f}% weight)")
            if detailed and dim.details:
                for component, details in dim.details.items():
                    out.append(f"    • {component}: {details['score']:.1f}/{details['max_score']} - {details['details']}")
        out.append("")

        # Summary statistics
        if report.summary_stats:
            stats = report.summary_stats
            out.append("SUMMARY STATISTICS:")
            out.append(f"  Highest Dimension: {stats['highest_dimension']}")
            out.append(f"  Lowest Dimension: {stats['lowest_dimension']}")
            out.append(f"  Dimensions Above 70%: {stats['dimensions_above_70']}")
            out.append(f"  Dimensions Below 50%: {stats['dimensions_below_50']}")
            out.append("")

        # Improvement roadmap (top five items only)
        if report.improvement_roadmap:
            out.append("IMPROVEMENT ROADMAP:")
            priority_symbols = {"HIGH": "🔴", "MEDIUM": "🟡"}
            for idx, item in enumerate(report.improvement_roadmap[:5], 1):
                marker = priority_symbols.get(item["priority"], "🟢")
                out.append(f"  {idx}. {marker} [{item['dimension']}] {item['suggestion']}")
            out.append("")

        return "\n".join(out)
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses arguments, runs the quality assessment, prints the report in
    the requested format, and exits with a status derived from the
    grade: 0 for C- and above, 2 for D, 1 for F or any failure, and 130
    on Ctrl-C. Also exits 1 when --minimum-score is not met.
    """
    parser = argparse.ArgumentParser(
        description="Score skill quality across multiple dimensions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python quality_scorer.py engineering/my-skill
  python quality_scorer.py engineering/my-skill --detailed --json
  python quality_scorer.py engineering/my-skill --minimum-score 75

Quality Dimensions (each 25%):
  Documentation - SKILL.md quality, README, references, examples
  Code Quality - Script complexity, error handling, structure, output
  Completeness - Directory structure, assets, expected outputs, tests
  Usability - Installation simplicity, usage clarity, help accessibility

Letter Grades: A+ (95+), A (90+), A- (85+), B+ (80+), B (75+), B- (70+), C+ (65+), C (60+), C- (55+), D (50+), F (<50)
"""
    )

    parser.add_argument("skill_path", help="Path to the skill directory to assess")
    parser.add_argument("--detailed", action="store_true", help="Show detailed component scores")
    parser.add_argument("--minimum-score", type=float, default=0,
                        help="Minimum acceptable score (exit with error if below)")
    parser.add_argument("--json", action="store_true", help="Output results in JSON format")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    try:
        # Run the assessment over the requested skill directory.
        report = QualityScorer(args.skill_path, args.detailed, args.verbose).assess_quality()

        # Render the report in the requested format.
        if args.json:
            rendered = QualityReportFormatter.format_json(report)
        else:
            rendered = QualityReportFormatter.format_human_readable(report, args.detailed)
        print(rendered)

        # Enforce the optional quality gate.
        if report.overall_score < args.minimum_score:
            print(f"\nERROR: Quality score {report.overall_score:.1f} is below minimum {args.minimum_score}", file=sys.stderr)
            sys.exit(1)

        # Grade-based exit status: D needs improvement, any A/B/C tier
        # is acceptable, anything else (F) is poor quality.
        if report.letter_grade == "D":
            sys.exit(2)
        if report.letter_grade.startswith(("A", "B", "C")):
            sys.exit(0)
        sys.exit(1)

    except KeyboardInterrupt:
        print("\nQuality assessment interrupted by user", file=sys.stderr)
        sys.exit(130)
    except Exception as e:
        print(f"Quality assessment failed: {str(e)}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when executed as a script; importing this module
# for its classes has no side effects.
if __name__ == "__main__":
    main()