feat(C2.5): Add inline comment extraction for Python/JS/C++

- Added comment extraction methods to code_analyzer.py - Supports Python (# style), JavaScript (// and /* */), C++ (// and /* */) - Extracts comment text, line numbers, and type (inline vs block) - Skips Python shebang and encoding declarations - Preserves TODO/FIXME/NOTE markers for developer notes Implementation: - _extract_python_comments(): Extract # comments with line tracking - _extract_js_comments(): Extract // and /* */ comments - _extract_cpp_comments(): Reuses JS logic (same syntax) - Integrated into _analyze_python(), _analyze_javascript(), _analyze_cpp() Output Format: { 'classes': [...], 'functions': [...], 'comments': [ {'line': 5, 'text': 'TODO: Optimize', 'type': 'inline'}, {'line': 12, 'text': 'Block comment\nwith lines', 'type': 'block'} ] } Tests: - Added 8 comprehensive tests to test_code_analyzer.py - Total: 30 tests passing ✅ - Python: Comment extraction, line numbers, shebang skip - JavaScript: Inline comments, block comments, mixed - C++: Comment extraction (uses JS logic) - TODO/FIXME detection test Related Issues: - Closes #67 (C2.5 Extract inline comments as notes) - Part of C2 Local Codebase Scraping roadmap (TIER 3) Files Modified: - src/skill_seekers/cli/code_analyzer.py (+67 lines) - tests/test_code_analyzer.py (+194 lines)
2026-01-01 23:02:34 +03:00
parent 43063dc0d2
commit 33d8500c44
2 changed files with 274 additions and 3 deletions
--- a/src/skill_seekers/cli/code_analyzer.py
+++ b/src/skill_seekers/cli/code_analyzer.py
@@ -131,9 +131,13 @@ class CodeAnalyzer:
                    func_sig = self._extract_python_function(node)
                    functions.append(asdict(func_sig))

+        # Extract comments
+        comments = self._extract_python_comments(content)
+
        return {
            'classes': classes,
-            'functions': functions
+            'functions': functions,
+            'comments': comments
        }

    def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature:
@@ -298,9 +302,13 @@ class CodeAnalyzer:
                'decorators': []
            })

+        # Extract comments
+        comments = self._extract_js_comments(content)
+
        return {
            'classes': classes,
-            'functions': functions
+            'functions': functions,
+            'comments': comments
        }

    def _extract_js_methods(self, class_body: str) -> List[Dict]:
@@ -419,9 +427,13 @@ class CodeAnalyzer:
                'decorators': []
            })

+        # Extract comments
+        comments = self._extract_cpp_comments(content)
+
        return {
            'classes': classes,
-            'functions': functions
+            'functions': functions,
+            'comments': comments
        }

    def _parse_cpp_parameters(self, params_str: str) -> List[Dict]:
@@ -463,6 +475,73 @@ class CodeAnalyzer:

        return params

+    def _extract_python_comments(self, content: str) -> List[Dict]:
+        """
+        Extract full-line Python comments (# style).
+
+        Only lines whose first non-blank character is '#' are collected;
+        a comment that trails code on the same line is not picked up.
+        Shebang lines ('#!') and PEP 263 encoding declarations are skipped.
+
+        Args:
+            content: Python source text.
+
+        Returns:
+            List of comment dictionaries with 'line' (1-based line number),
+            'text' (comment body without the leading '#', stripped) and
+            'type' (always 'inline').
+        """
+        # PEP 263 shape of an encoding declaration ("coding:" / "coding="
+        # followed by a codec name). A plain substring test for "coding"
+        # also threw away ordinary notes such as "# fix the encoding bug".
+        # The line position is deliberately not checked, so snippets that
+        # start with blank lines keep having their declaration skipped.
+        encoding_decl = re.compile(r'^#.*?coding[:=][ \t]*[-_.a-zA-Z0-9]+')
+
+        comments = []
+
+        for line_no, line in enumerate(content.splitlines(), 1):
+            stripped = line.strip()
+
+            # Not a full-line comment
+            if not stripped.startswith('#'):
+                continue
+
+            # Skip shebang and encoding declarations
+            if stripped.startswith('#!') or encoding_decl.match(stripped):
+                continue
+
+            comments.append({
+                'line': line_no,
+                'text': stripped[1:].strip(),
+                'type': 'inline'
+            })
+
+        return comments
+
+    def _extract_js_comments(self, content: str) -> List[Dict]:
+        """
+        Extract JavaScript/TypeScript comments (// and /* */ styles).
+
+        The source is scanned once from left to right. String literals are
+        matched and discarded, so comment markers inside them (for example
+        the '//' in "http://host") are not reported, and a '//' inside a
+        block comment is not reported as a separate inline comment.
+
+        Known limits of this regex scan: regular-expression literals and
+        C++ raw strings are not recognised, and a quoted literal must close
+        on the line it starts on to be treated as a string.
+
+        Args:
+            content: Source text to scan.
+
+        Returns:
+            List of comment dictionaries with 'line' (1-based line where the
+            comment starts), 'text' (stripped comment body) and 'type'
+            ('inline' for //, 'block' for /* */). All inline comments come
+            first, followed by all block comments, each in source order.
+        """
+        token = re.compile(
+            r'"(?:\\.|[^"\\\n])*"'        # double-quoted string
+            r"|'(?:\\.|[^'\\\n])*'"       # single-quoted string / char literal
+            r'|`(?:\\.|[^`\\])*`'         # template literal (may span lines)
+            r'|//(?P<inline>[^\n]*)'      # single-line comment
+            r'|/\*(?P<block>.*?)\*/',     # multi-line comment
+            re.DOTALL
+        )
+
+        inline_comments = []
+        block_comments = []
+
+        # Running line counter: only the text between consecutive matches is
+        # counted, instead of re-counting from the start of the file each time.
+        line_no = 1
+        scanned = 0
+
+        for match in token.finditer(content):
+            line_no += content.count('\n', scanned, match.start())
+            scanned = match.start()
+
+            # lastgroup is None when a string literal matched
+            kind = match.lastgroup
+            if kind is None:
+                continue
+
+            # Bare '//' or '/**/' carries no comment text
+            raw = match.group(kind)
+            if not raw:
+                continue
+
+            target = inline_comments if kind == 'inline' else block_comments
+            target.append({
+                'line': line_no,
+                'text': raw.strip(),
+                'type': kind
+            })
+
+        return inline_comments + block_comments
+
+    def _extract_cpp_comments(self, content: str) -> List[Dict]:
+        """
+        Collect // and /* */ comments from C++ source text.
+
+        Delegates to the JavaScript extractor and returns its result
+        unchanged: a list of comment dictionaries with line number, text,
+        and type.
+        """
+        # Comment delimiters are shared with JavaScript, so no separate
+        # C++ scanner is needed.
+        extract = self._extract_js_comments
+        return extract(content)
+

 if __name__ == '__main__':
    # Test the analyzer