release: add scrapling-skill and fix script compatibility
- add scrapling-skill with validated CLI workflow, diagnostics, packaging, and docs integration
- fix skill-creator package_skill.py so direct script invocation works from repo root
- fix continue-claude-work extract_resume_context.py typing compatibility for local python3
- bump marketplace to 1.39.0 and update skill versions
This commit is contained in:
@@ -5,8 +5,8 @@
|
||||
"email": "daymadev89@gmail.com"
|
||||
},
|
||||
"metadata": {
|
||||
"description": "Professional Claude Code skills for GitHub operations, document conversion, diagram generation, statusline customization, Teams communication, repomix utilities, skill creation, CLI demo generation, LLM icon access, Cloudflare troubleshooting, UI design system extraction, professional presentation creation, YouTube video downloading, secure repomix packaging, ASR transcription correction, video comparison quality analysis, comprehensive QA testing infrastructure, prompt optimization with EARS methodology, session history recovery, local Claude session continuation from `.claude` artifacts, documentation cleanup, format-controlled deep research report generation with evidence tracking, PDF generation with Chinese font support, CLAUDE.md progressive disclosure optimization, CCPM skill registry search and management, Promptfoo LLM evaluation framework, iOS app development with XcodeGen and SwiftUI, fact-checking with automated corrections, Twitter/X content fetching, intelligent macOS disk space recovery, skill quality review and improvement, GitHub contribution strategy, complete internationalization/localization setup, plugin/skill troubleshooting with diagnostic tools, evidence-based competitor analysis with source citations, Windows Remote Desktop (AVD/W365) connection quality diagnosis with transport protocol analysis and log parsing, Tailscale+proxy conflict diagnosis with SSH tunnel SOP for remote development, multi-path parallel product analysis with cross-model test-time compute scaling, real financial data collection for US equities with validation and yfinance pitfall handling, advanced Excel automation for formatted workbook generation and complex xlsm parsing, and macOS programmatic window screenshot capture workflows",
|
||||
"version": "1.38.0",
|
||||
"description": "Professional Claude Code skills for GitHub operations, document conversion, diagram generation, statusline customization, Teams communication, repomix utilities, skill creation, CLI demo generation, LLM icon access, Cloudflare troubleshooting, UI design system extraction, professional presentation creation, YouTube video downloading, secure repomix packaging, ASR transcription correction, video comparison quality analysis, comprehensive QA testing infrastructure, prompt optimization with EARS methodology, session history recovery, local Claude session continuation from `.claude` artifacts, documentation cleanup, format-controlled deep research report generation with evidence tracking, PDF generation with Chinese font support, CLAUDE.md progressive disclosure optimization, CCPM skill registry search and management, Promptfoo LLM evaluation framework, iOS app development with XcodeGen and SwiftUI, fact-checking with automated corrections, Twitter/X content fetching, intelligent macOS disk space recovery, skill quality review and improvement, GitHub contribution strategy, complete internationalization/localization setup, plugin/skill troubleshooting with diagnostic tools, evidence-based competitor analysis with source citations, Windows Remote Desktop (AVD/W365) connection quality diagnosis with transport protocol analysis and log parsing, Tailscale+proxy conflict diagnosis with SSH tunnel SOP for remote development, multi-path parallel product analysis with cross-model test-time compute scaling, real financial data collection for US equities with validation and yfinance pitfall handling, advanced Excel automation for formatted workbook generation and complex xlsm parsing, macOS programmatic window screenshot capture workflows, and verified Scrapling CLI installation and web extraction workflows",
|
||||
"version": "1.39.0",
|
||||
"homepage": "https://github.com/daymade/claude-code-skills"
|
||||
},
|
||||
"plugins": [
|
||||
@@ -15,7 +15,7 @@
|
||||
"description": "Essential meta-skill for creating effective Claude Code skills with initialization scripts, validation, packaging, marketplace registration, and privacy best practices",
|
||||
"source": "./",
|
||||
"strict": false,
|
||||
"version": "1.5.0",
|
||||
"version": "1.5.1",
|
||||
"category": "developer-tools",
|
||||
"keywords": [
|
||||
"skill-creation",
|
||||
@@ -500,7 +500,7 @@
|
||||
"description": "Develops iOS applications with XcodeGen, SwiftUI, and SPM. Use when configuring XcodeGen project.yml, resolving SPM dependency issues, deploying to devices, handling code signing, debugging camera/AVFoundation, iOS version compatibility issues, or fixing Library not loaded @rpath framework errors. Includes state machine testing patterns for @MainActor classes",
|
||||
"source": "./",
|
||||
"strict": false,
|
||||
"version": "1.1.1",
|
||||
"version": "1.1.0",
|
||||
"category": "developer-tools",
|
||||
"keywords": [
|
||||
"ios",
|
||||
@@ -565,7 +565,7 @@
|
||||
"description": "Intelligent macOS disk space analysis and cleanup with safety-first philosophy. Use when users report disk space issues, need to clean their Mac, or want to understand storage consumption. Analyzes system caches, application remnants, large files, and development environments (Docker, Homebrew, npm, pip) with risk categorization (Safe/Caution/Keep) and requires explicit user confirmation before any deletions. Includes Mole visual tool integration for hybrid workflow",
|
||||
"source": "./",
|
||||
"strict": false,
|
||||
"version": "1.1.1",
|
||||
"version": "1.1.0",
|
||||
"category": "utilities",
|
||||
"keywords": [
|
||||
"macos",
|
||||
@@ -882,7 +882,7 @@
|
||||
"description": "Recover actionable context from local `.claude` session artifacts and continue interrupted work without running `claude --resume`. Extracts compact boundary summaries, subagent workflow state, session end reason, and workspace drift via bundled Python script. Use when a user provides a Claude session ID, asks to continue prior work from local history, or wants to inspect `.claude` files before resuming implementation",
|
||||
"source": "./",
|
||||
"strict": false,
|
||||
"version": "1.1.0",
|
||||
"version": "1.1.1",
|
||||
"category": "developer-tools",
|
||||
"keywords": [
|
||||
"claude-code",
|
||||
@@ -898,6 +898,27 @@
|
||||
"skills": [
|
||||
"./continue-claude-work"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "scrapling-skill",
|
||||
"description": "Install, troubleshoot, and use Scrapling CLI for extracting HTML, Markdown, or text from webpages. Diagnoses missing extras, Playwright browser runtime issues, TLS verification failures, and WeChat public article extraction patterns. Use when users mention Scrapling, `scrapling extract`, `uv tool install scrapling`, or need to decide between static and browser-backed fetching",
|
||||
"source": "./",
|
||||
"strict": false,
|
||||
"version": "1.0.0",
|
||||
"category": "developer-tools",
|
||||
"keywords": [
|
||||
"scrapling",
|
||||
"web-scraping",
|
||||
"html",
|
||||
"markdown",
|
||||
"playwright",
|
||||
"wechat",
|
||||
"extraction",
|
||||
"cli"
|
||||
],
|
||||
"skills": [
|
||||
"./scrapling-skill"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
19
CHANGELOG.md
19
CHANGELOG.md
@@ -10,6 +10,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Added
|
||||
- None
|
||||
|
||||
## [1.39.0] - 2026-03-18
|
||||
|
||||
### Added
|
||||
- **New Skill**: scrapling-skill v1.0.0 - Reliable Scrapling CLI installation, troubleshooting, and extraction workflows for HTML, Markdown, and text output
|
||||
- Bundled `diagnose_scrapling.py` script to verify CLI health, detect missing extras, inspect Playwright browser runtime, and run real smoke tests
|
||||
- Static-first workflow for choosing between `extract get`, `extract fetch`, and `stealthy-fetch`
|
||||
- Verified WeChat public article extraction pattern using `#js_content`
|
||||
- Verified recovery path for local TLS trust-store failures via `--no-verify`
|
||||
- Bundled troubleshooting reference covering extras, browser runtime, and output validation
|
||||
|
||||
### Changed
|
||||
- **skill-creator** v1.5.0 → v1.5.1: Fixed `scripts/package_skill.py` so it works when invoked directly from the repository root instead of only via `python -m`
|
||||
- **continue-claude-work** v1.1.0 → v1.1.1: Replaced newer Python-only type syntax in `extract_resume_context.py` so the script runs under the local `python3` environment
|
||||
- Updated marketplace skills/plugins count from 42 to 43
|
||||
- Updated marketplace version from 1.38.0 to 1.39.0
|
||||
- Updated marketplace metadata description to include Scrapling CLI extraction workflows
|
||||
- Updated README.md and README.zh-CN.md badges, installation commands, skill listings, use cases, quick links, and requirements
|
||||
- Updated CLAUDE.md counts, version reference, and Available Skills list (added #43)
|
||||
|
||||
## [1.38.0] - 2026-03-07
|
||||
|
||||
### Added
|
||||
|
||||
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
||||
|
||||
## Repository Overview
|
||||
|
||||
This is a Claude Code skills marketplace containing 42 production-ready skills organized in a plugin marketplace structure. Each skill is a self-contained package that extends Claude's capabilities with specialized knowledge, workflows, and bundled resources.
|
||||
This is a Claude Code skills marketplace containing 43 production-ready skills organized in a plugin marketplace structure. Each skill is a self-contained package that extends Claude's capabilities with specialized knowledge, workflows, and bundled resources.
|
||||
|
||||
**Essential Skill**: `skill-creator` is the most important skill in this marketplace - it's a meta-skill that enables users to create their own skills. Always recommend it first for users interested in extending Claude Code.
|
||||
|
||||
@@ -134,7 +134,7 @@ Skills for public distribution must NOT contain:
|
||||
## Marketplace Configuration
|
||||
|
||||
The marketplace is configured in `.claude-plugin/marketplace.json`:
|
||||
- Contains 42 plugins, each mapping to one skill
|
||||
- Contains 43 plugins, each mapping to one skill
|
||||
- Each plugin has: name, description, version, category, keywords, skills array
|
||||
- Marketplace metadata: name, owner, version, homepage
|
||||
|
||||
@@ -144,7 +144,7 @@ The marketplace is configured in `.claude-plugin/marketplace.json`:
|
||||
|
||||
1. **Marketplace Version** (`.claude-plugin/marketplace.json` → `metadata.version`)
|
||||
- Tracks the marketplace catalog as a whole
|
||||
- Current: v1.38.0
|
||||
- Current: v1.39.0
|
||||
- Bump when: Adding/removing skills, major marketplace restructuring
|
||||
- Semantic versioning: MAJOR.MINOR.PATCH
|
||||
|
||||
@@ -219,6 +219,7 @@ This applies when you change ANY file under a skill directory:
|
||||
40. **excel-automation** - Create formatted Excel files, parse complex xlsm models, and control Excel windows on macOS via AppleScript
|
||||
41. **capture-screen** - Programmatically capture macOS application windows using Swift window ID discovery and screencapture workflows
|
||||
42. **continue-claude-work** - Recover local `.claude` session context via compact-boundary extraction, subagent workflow recovery, and session end reason detection, then continue interrupted work without `claude --resume`
|
||||
43. **scrapling-skill** - Install, troubleshoot, and use Scrapling CLI for static/dynamic web extraction, WeChat article capture, and verified output validation
|
||||
|
||||
**Recommendation**: Always suggest `skill-creator` first for users interested in creating skills or extending Claude Code.
|
||||
|
||||
|
||||
52
README.md
52
README.md
@@ -6,15 +6,15 @@
|
||||
[](./README.zh-CN.md)
|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://claude.com/code)
|
||||
[](./CONTRIBUTING.md)
|
||||
[](https://github.com/daymade/claude-code-skills/graphs/commit-activity)
|
||||
|
||||
</div>
|
||||
|
||||
Professional Claude Code skills marketplace featuring 42 production-ready skills for enhanced development workflows.
|
||||
Professional Claude Code skills marketplace featuring 43 production-ready skills for enhanced development workflows.
|
||||
|
||||
## 📑 Table of Contents
|
||||
|
||||
@@ -240,6 +240,9 @@ claude plugin install capture-screen@daymade-skills
|
||||
|
||||
# Resume interrupted Claude work from local session artifacts
|
||||
claude plugin install continue-claude-work@daymade-skills
|
||||
|
||||
# Scrapling CLI extraction and troubleshooting
|
||||
claude plugin install scrapling-skill@daymade-skills
|
||||
```
|
||||
|
||||
Each skill can be installed independently - choose only what you need!
|
||||
@@ -1787,6 +1790,44 @@ claude plugin install continue-claude-work@daymade-skills
|
||||
|
||||
---
|
||||
|
||||
### 43. **scrapling-skill** - Reliable Scrapling CLI Workflows
|
||||
|
||||
Install, troubleshoot, and use Scrapling CLI with a verified static-first workflow for extracting HTML, Markdown, or text from webpages. Includes a diagnostic script for broken extras installs, Playwright browser runtime checks, and smoke tests against real URLs.
|
||||
|
||||
**When to use:**
|
||||
- Users mention Scrapling, `uv tool install scrapling`, or `scrapling extract`
|
||||
- You need to choose between static and browser-backed fetching
|
||||
- You need to extract article bodies from WeChat public pages (`mp.weixin.qq.com`)
|
||||
- A Scrapling install works partially but fails on missing extras, browser runtime, or TLS verification
|
||||
|
||||
**Key features:**
|
||||
- Bundled `diagnose_scrapling.py` script for CLI, browser runtime, and live URL smoke tests
|
||||
- Verified default path: start with `extract get`, escalate to `extract fetch` only when needed
|
||||
- WeChat extraction pattern using `#js_content` for clean article Markdown
|
||||
- Troubleshooting guidance for missing `click`, Playwright runtime setup, and `curl: (60)` trust-store failures
|
||||
- Output validation workflow using file size and content checks instead of exit-code assumptions
|
||||
|
||||
**Example usage:**
|
||||
```bash
|
||||
# Install the skill
|
||||
claude plugin install scrapling-skill@daymade-skills
|
||||
|
||||
# Then ask Claude to work through Scrapling for you
|
||||
"Install Scrapling CLI and verify the setup"
|
||||
"Extract this WeChat article into Markdown with Scrapling"
|
||||
"Decide whether this page needs static or browser-backed fetching"
|
||||
```
|
||||
|
||||
**🎬 Live Demo**
|
||||
|
||||
*Coming soon*
|
||||
|
||||
📚 **Documentation**: See [scrapling-skill/SKILL.md](./scrapling-skill/SKILL.md) and [scrapling-skill/references/troubleshooting.md](./scrapling-skill/references/troubleshooting.md).
|
||||
|
||||
**Requirements**: Python 3.6+, `uv`, Scrapling CLI, and Playwright browser runtime for browser-backed fetches.
|
||||
|
||||
---
|
||||
|
||||
## 🎬 Interactive Demo Gallery
|
||||
|
||||
Want to see all demos in one place with click-to-enlarge functionality? Check out our [interactive demo gallery](./demos/index.html) or browse the [demos directory](./demos/).
|
||||
@@ -1853,6 +1894,9 @@ Use **claude-code-history-files-finder** to recover deleted files from previous
|
||||
### For Resuming Interrupted Claude Sessions
|
||||
Use **continue-claude-work** to recover the last actionable request from local `~/.claude` artifacts and continue implementation without reopening the original session. Combine with **claude-code-history-files-finder** when you need broader cross-session search, statistics, or deleted-file recovery.
|
||||
|
||||
### For Web Extraction & WeChat Articles
|
||||
Use **scrapling-skill** to install and validate Scrapling CLI, choose between static and browser-backed fetching, and extract clean Markdown from sites like `mp.weixin.qq.com`. Combine with **deep-research** to turn extracted sources into structured reports or with **docs-cleaner** to normalize captured article content.
|
||||
|
||||
### For Documentation Maintenance
|
||||
Use **docs-cleaner** to consolidate redundant documentation while preserving valuable content. Perfect for cleaning up documentation sprawl after rapid development phases or merging overlapping docs into authoritative sources.
|
||||
|
||||
@@ -1941,6 +1985,7 @@ Each skill includes:
|
||||
- **excel-automation**: See `excel-automation/SKILL.md` for create/parse/control workflows and `excel-automation/references/formatting-reference.md` for formatting standards
|
||||
- **capture-screen**: See `capture-screen/SKILL.md` for CGWindowID-based screenshot workflows on macOS
|
||||
- **continue-claude-work**: See `continue-claude-work/SKILL.md` for local artifact recovery, drift checks, and resume workflow
|
||||
- **scrapling-skill**: See `scrapling-skill/SKILL.md` for the CLI workflow and `scrapling-skill/references/troubleshooting.md` for verified Scrapling failure modes
|
||||
|
||||
## 🛠️ Requirements
|
||||
|
||||
@@ -1967,6 +2012,7 @@ Each skill includes:
|
||||
- **uv + openpyxl** (for excel-automation): `uv run --with openpyxl ...`
|
||||
- **macOS** (for capture-screen and excel-automation AppleScript control workflows)
|
||||
- **Python 3.8+** (for continue-claude-work): bundled script for session extraction (no external dependencies)
|
||||
- **uv + Scrapling CLI** (for scrapling-skill): `uv tool install 'scrapling[shell]'` and `scrapling install` for browser-backed fetches
|
||||
|
||||
## ❓ FAQ
|
||||
|
||||
|
||||
@@ -6,15 +6,15 @@
|
||||
[](./README.zh-CN.md)
|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://github.com/daymade/claude-code-skills)
|
||||
[](https://claude.com/code)
|
||||
[](./CONTRIBUTING.md)
|
||||
[](https://github.com/daymade/claude-code-skills/graphs/commit-activity)
|
||||
|
||||
</div>
|
||||
|
||||
专业的 Claude Code 技能市场,提供 42 个生产就绪的技能,用于增强开发工作流。
|
||||
专业的 Claude Code 技能市场,提供 43 个生产就绪的技能,用于增强开发工作流。
|
||||
|
||||
## 📑 目录
|
||||
|
||||
@@ -243,6 +243,9 @@ claude plugin install capture-screen@daymade-skills
|
||||
|
||||
# 基于本地会话产物续做中断的 Claude 工作
|
||||
claude plugin install continue-claude-work@daymade-skills
|
||||
|
||||
# Scrapling CLI 抽取与故障排查
|
||||
claude plugin install scrapling-skill@daymade-skills
|
||||
```
|
||||
|
||||
每个技能都可以独立安装 - 只选择你需要的!
|
||||
@@ -1829,6 +1832,44 @@ claude plugin install continue-claude-work@daymade-skills
|
||||
|
||||
---
|
||||
|
||||
### 43. **scrapling-skill** - 可靠的 Scrapling CLI 工作流
|
||||
|
||||
围绕 Scrapling CLI 提供经过验证的安装、排障与网页抽取工作流,用于从网页输出 HTML、Markdown 或纯文本。内置诊断脚本,可检查 extras 安装问题、Playwright 浏览器运行时,以及真实 URL 的烟测结果。
|
||||
|
||||
**使用场景:**
|
||||
- 用户提到 Scrapling、`uv tool install scrapling` 或 `scrapling extract`
|
||||
- 需要判断应该使用静态抓取还是浏览器抓取
|
||||
- 需要从微信公众号页面(`mp.weixin.qq.com`)提取正文
|
||||
- Scrapling 安装看似成功,但在 extras、浏览器运行时或 TLS 校验上失败
|
||||
|
||||
**主要功能:**
|
||||
- 内置 `diagnose_scrapling.py`,检查 CLI、浏览器运行时与真实 URL 烟测
|
||||
- 经过验证的默认路径:先用 `extract get`,只有必要时再升级到 `extract fetch`
|
||||
- 针对微信公众号文章的 `#js_content` 提取模式
|
||||
- 覆盖缺少 `click`、Playwright 运行时缺失、`curl: (60)` 证书问题等真实故障
|
||||
- 用文件大小和内容验证结果,而不是只看退出码
|
||||
|
||||
**示例用法:**
|
||||
```bash
|
||||
# 安装技能
|
||||
claude plugin install scrapling-skill@daymade-skills
|
||||
|
||||
# 然后让 Claude 代你跑 Scrapling
|
||||
"安装 Scrapling CLI 并验证配置"
|
||||
"用 Scrapling 把这篇微信公众号文章提取成 Markdown"
|
||||
"判断这个页面应不应该走浏览器抓取"
|
||||
```
|
||||
|
||||
**🎬 实时演示**
|
||||
|
||||
*即将推出*
|
||||
|
||||
📚 **文档**:参见 [scrapling-skill/SKILL.md](./scrapling-skill/SKILL.md) 和 [scrapling-skill/references/troubleshooting.md](./scrapling-skill/references/troubleshooting.md)。
|
||||
|
||||
**要求**:Python 3.6+、`uv`、Scrapling CLI;如需浏览器抓取,还需要 Playwright 浏览器运行时。
|
||||
|
||||
---
|
||||
|
||||
## 🎬 交互式演示画廊
|
||||
|
||||
想要在一个地方查看所有演示并具有点击放大功能?访问我们的[交互式演示画廊](./demos/index.html)或浏览[演示目录](./demos/)。
|
||||
@@ -1895,6 +1936,9 @@ claude plugin install continue-claude-work@daymade-skills
|
||||
### 续做中断的 Claude 会话
|
||||
使用 **continue-claude-work** 从本地 `~/.claude` 产物中恢复最后一个可执行请求,并在不重新打开原始会话的情况下继续实现。若还需要跨会话搜索、统计分析或恢复已删除文件,可与 **claude-code-history-files-finder** 配合使用。
|
||||
|
||||
### 网页提取与微信公众号文章
|
||||
使用 **scrapling-skill** 安装并验证 Scrapling CLI,判断应使用静态抓取还是浏览器抓取,并从 `mp.weixin.qq.com` 等页面提取干净的 Markdown。可与 **deep-research** 配合,将抓取内容整理为结构化报告,或与 **docs-cleaner** 配合清理抽取后的文章内容。
|
||||
|
||||
### 文档维护
|
||||
使用 **docs-cleaner** 在保留有价值内容的同时整合冗余文档。非常适合在快速开发阶段后清理文档扩散或将重叠的文档合并为权威来源。
|
||||
|
||||
@@ -1983,6 +2027,7 @@ claude plugin install continue-claude-work@daymade-skills
|
||||
- **excel-automation**:参见 `excel-automation/SKILL.md` 了解创建/解析/控制工作流,参见 `excel-automation/references/formatting-reference.md` 了解格式规范
|
||||
- **capture-screen**:参见 `capture-screen/SKILL.md` 了解基于 CGWindowID 的 macOS 截图流程
|
||||
- **continue-claude-work**:参见 `continue-claude-work/SKILL.md` 了解本地会话产物恢复、漂移检查与续做流程
|
||||
- **scrapling-skill**:参见 `scrapling-skill/SKILL.md` 了解 CLI 工作流,参见 `scrapling-skill/references/troubleshooting.md` 了解已验证的 Scrapling 故障模式
|
||||
|
||||
## 🛠️ 系统要求
|
||||
|
||||
@@ -2006,6 +2051,7 @@ claude plugin install continue-claude-work@daymade-skills
|
||||
- **uv + openpyxl**(用于 excel-automation):`uv run --with openpyxl ...`
|
||||
- **macOS**(用于 capture-screen 与 excel-automation 的 AppleScript 控制流程)
|
||||
- **Python 3.8+**(用于 continue-claude-work):内置脚本进行会话提取(无外部依赖)
|
||||
- **uv + Scrapling CLI**(用于 scrapling-skill):`uv tool install 'scrapling[shell]'`,浏览器抓取前运行 `scrapling install`
|
||||
|
||||
## ❓ 常见问题
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
Security scan passed
|
||||
Scanned at: 2026-03-07T14:27:12.638956
|
||||
Scanned at: 2026-03-18T23:02:18.627209
|
||||
Tool: gitleaks + pattern-based validation
|
||||
Content hash: c464aa735e8b7832c2c77e4cea22fff9c7e6117ecee4f6769f5eb62cced8a11a
|
||||
Content hash: 62e456422cabfe74e5757a802044dac45d1662341b2e90dc943685db81d8f659
|
||||
|
||||
@@ -37,6 +37,7 @@ import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
CLAUDE_DIR = Path.home() / ".claude"
|
||||
PROJECTS_DIR = CLAUDE_DIR / "projects"
|
||||
@@ -59,7 +60,7 @@ def normalize_path(project_path: str) -> str:
|
||||
return project_path.replace("/", "-")
|
||||
|
||||
|
||||
def find_project_dir(project_path: str) -> Path | None:
|
||||
def find_project_dir(project_path: str) -> Optional[Path]:
|
||||
"""Find the Claude projects directory for a given project path."""
|
||||
abs_path = os.path.abspath(project_path)
|
||||
|
||||
@@ -85,7 +86,7 @@ def find_project_dir(project_path: str) -> Path | None:
|
||||
return None
|
||||
|
||||
|
||||
def load_sessions_index(project_dir: Path) -> list[dict]:
|
||||
def load_sessions_index(project_dir: Path) -> List[Dict]:
|
||||
"""Load and parse sessions-index.json, sorted by modified desc."""
|
||||
index_file = project_dir / "sessions-index.json"
|
||||
if not index_file.exists():
|
||||
@@ -97,7 +98,7 @@ def load_sessions_index(project_dir: Path) -> list[dict]:
|
||||
return entries
|
||||
|
||||
|
||||
def search_sessions(entries: list[dict], query: str) -> list[dict]:
|
||||
def search_sessions(entries: List[Dict], query: str) -> List[Dict]:
|
||||
"""Search sessions by keyword in firstPrompt and summary."""
|
||||
query_lower = query.lower()
|
||||
results = []
|
||||
@@ -109,7 +110,7 @@ def search_sessions(entries: list[dict], query: str) -> list[dict]:
|
||||
return results
|
||||
|
||||
|
||||
def format_session_entry(entry: dict, file_exists: bool = True) -> str:
|
||||
def format_session_entry(entry: Dict, file_exists: bool = True) -> str:
|
||||
"""Format a session index entry for display."""
|
||||
sid = entry.get("sessionId", "?")
|
||||
modified = entry.get("modified", "?")
|
||||
@@ -123,7 +124,7 @@ def format_session_entry(entry: dict, file_exists: bool = True) -> str:
|
||||
# ── Session file parsing ────────────────────────────────────────────
|
||||
|
||||
|
||||
def parse_session_structure(session_file: Path) -> dict:
|
||||
def parse_session_structure(session_file: Path) -> Dict:
|
||||
"""Parse a session JSONL file and return structured data."""
|
||||
file_size = session_file.stat().st_size
|
||||
total_lines = 0
|
||||
@@ -276,8 +277,8 @@ def parse_session_structure(session_file: Path) -> dict:
|
||||
|
||||
|
||||
def _detect_end_reason(
|
||||
last_role: str | None,
|
||||
unresolved: dict,
|
||||
last_role: Optional[str],
|
||||
unresolved: Dict,
|
||||
error_count: int,
|
||||
) -> str:
|
||||
"""Detect why the session ended."""
|
||||
@@ -300,7 +301,7 @@ def _is_noise_user_text(text: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def extract_user_text(messages: list[dict], limit: int = 5) -> list[str]:
|
||||
def extract_user_text(messages: List[Dict], limit: int = 5) -> List[str]:
|
||||
"""Extract the last N user text messages (not tool results or system noise)."""
|
||||
user_texts = []
|
||||
for msg_obj in reversed(messages):
|
||||
@@ -329,7 +330,7 @@ def extract_user_text(messages: list[dict], limit: int = 5) -> list[str]:
|
||||
return user_texts
|
||||
|
||||
|
||||
def extract_assistant_text(messages: list[dict], limit: int = 3) -> list[str]:
|
||||
def extract_assistant_text(messages: List[Dict], limit: int = 3) -> List[str]:
|
||||
"""Extract the last N assistant text responses (no thinking/tool_use)."""
|
||||
assistant_texts = []
|
||||
for msg_obj in reversed(messages):
|
||||
@@ -355,7 +356,7 @@ def extract_assistant_text(messages: list[dict], limit: int = 3) -> list[str]:
|
||||
# ── Subagent extraction ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_subagent_context(session_file: Path) -> list[dict]:
|
||||
def extract_subagent_context(session_file: Path) -> List[Dict]:
|
||||
"""Extract subagent summaries from session subdirectories.
|
||||
|
||||
Returns list of {name, type, status, last_text, is_interrupted}.
|
||||
@@ -457,7 +458,8 @@ def get_git_state(project_path: str) -> str:
|
||||
try:
|
||||
branch = subprocess.run(
|
||||
["git", "branch", "--show-current"],
|
||||
capture_output=True, text=True, cwd=project_path, timeout=5,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
||||
universal_newlines=True, cwd=project_path, timeout=5,
|
||||
)
|
||||
if branch.stdout.strip():
|
||||
parts.append(f"**Current branch**: `{branch.stdout.strip()}`")
|
||||
@@ -467,7 +469,8 @@ def get_git_state(project_path: str) -> str:
|
||||
try:
|
||||
status = subprocess.run(
|
||||
["git", "status", "--short"],
|
||||
capture_output=True, text=True, cwd=project_path, timeout=10,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
||||
universal_newlines=True, cwd=project_path, timeout=10,
|
||||
)
|
||||
if status.stdout.strip():
|
||||
parts.append(f"### git status\n```\n{status.stdout.strip()}\n```")
|
||||
@@ -479,7 +482,8 @@ def get_git_state(project_path: str) -> str:
|
||||
try:
|
||||
log = subprocess.run(
|
||||
["git", "log", "--oneline", "-5"],
|
||||
capture_output=True, text=True, cwd=project_path, timeout=10,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
||||
universal_newlines=True, cwd=project_path, timeout=10,
|
||||
)
|
||||
if log.stdout.strip():
|
||||
parts.append(f"### git log (last 5)\n```\n{log.stdout.strip()}\n```")
|
||||
@@ -489,7 +493,7 @@ def get_git_state(project_path: str) -> str:
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def get_memory_md(project_dir: Path) -> str | None:
|
||||
def get_memory_md(project_dir: Path) -> Optional[str]:
|
||||
"""Read MEMORY.md if it exists in the project's memory directory."""
|
||||
memory_dir = project_dir / "memory"
|
||||
memory_file = memory_dir / "MEMORY.md"
|
||||
@@ -500,7 +504,7 @@ def get_memory_md(project_dir: Path) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def get_session_memory(session_file: Path) -> str | None:
|
||||
def get_session_memory(session_file: Path) -> Optional[str]:
|
||||
"""Read session-memory/summary.md if it exists (newer CC versions)."""
|
||||
session_dir = session_file.parent / session_file.stem
|
||||
summary = session_dir / "session-memory" / "summary.md"
|
||||
@@ -524,8 +528,8 @@ END_REASON_LABELS = {
|
||||
|
||||
|
||||
def build_briefing(
|
||||
session_entry: dict | None,
|
||||
parsed: dict,
|
||||
session_entry: Optional[Dict],
|
||||
parsed: Dict,
|
||||
project_path: str,
|
||||
project_dir: Path,
|
||||
session_file: Path,
|
||||
@@ -658,7 +662,7 @@ def build_briefing(
|
||||
# ── CLI ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _check_session_files(entries: list[dict], project_dir: Path) -> dict[str, bool]:
|
||||
def _check_session_files(entries: List[Dict], project_dir: Path) -> Dict[str, bool]:
|
||||
"""Check which index entries have actual files on disk."""
|
||||
status = {}
|
||||
for entry in entries:
|
||||
|
||||
BIN
scrapling-skill.skill
Normal file
BIN
scrapling-skill.skill
Normal file
Binary file not shown.
4
scrapling-skill/.security-scan-passed
Normal file
4
scrapling-skill/.security-scan-passed
Normal file
@@ -0,0 +1,4 @@
|
||||
Security scan passed
|
||||
Scanned at: 2026-03-18T22:52:43.734452
|
||||
Tool: gitleaks + pattern-based validation
|
||||
Content hash: 06351e5794510c584fdf29351eb5161f4b12e213f512c3148212c82c357d124a
|
||||
183
scrapling-skill/SKILL.md
Normal file
183
scrapling-skill/SKILL.md
Normal file
@@ -0,0 +1,183 @@
|
||||
---
|
||||
name: scrapling-skill
|
||||
description: Install, troubleshoot, and use Scrapling CLI to extract HTML, Markdown, or text from webpages. Use this skill whenever the user mentions Scrapling, `uv tool install scrapling`, `scrapling extract`, WeChat/mp.weixin articles, browser-backed page fetching, or needs help deciding between static and dynamic extraction.
|
||||
---
|
||||
|
||||
# Scrapling Skill
|
||||
|
||||
## Overview
|
||||
|
||||
Use Scrapling through its CLI as the default path. Start with the smallest working command, validate the saved output, and only escalate to browser-backed fetching when the static fetch does not contain the real page content.
|
||||
|
||||
Do not assume the user's Scrapling install is healthy. Verify it first.
|
||||
|
||||
## Default Workflow
|
||||
|
||||
Copy this checklist and keep it updated while working:
|
||||
|
||||
```text
|
||||
Scrapling Progress:
|
||||
- [ ] Step 1: Diagnose the local Scrapling install
|
||||
- [ ] Step 2: Fix CLI extras or browser runtime if needed
|
||||
- [ ] Step 3: Choose static or dynamic fetch
|
||||
- [ ] Step 4: Save output to a file
|
||||
- [ ] Step 5: Validate file size and extracted content
|
||||
- [ ] Step 6: Escalate only if the previous path failed
|
||||
```
|
||||
|
||||
## Step 1: Diagnose the Install
|
||||
|
||||
Run the bundled diagnostic script first:
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py
|
||||
```
|
||||
|
||||
Use the result as the source of truth for the next step.
|
||||
|
||||
## Step 2: Fix the Install
|
||||
|
||||
### If the CLI was installed without extras
|
||||
|
||||
If `scrapling --help` fails with missing `click` or a message about installing Scrapling with extras, reinstall it with the CLI extra:
|
||||
|
||||
```bash
|
||||
uv tool uninstall scrapling
|
||||
uv tool install 'scrapling[shell]'
|
||||
```
|
||||
|
||||
Do not default to `scrapling[all]` unless the user explicitly needs the broader feature set.
|
||||
|
||||
### If browser-backed fetchers are needed
|
||||
|
||||
Install the Playwright runtime:
|
||||
|
||||
```bash
|
||||
scrapling install
|
||||
```
|
||||
|
||||
If the install looks slow or opaque, read `references/troubleshooting.md` before guessing. Do not claim success until either:
|
||||
- `scrapling install` reports that dependencies are already installed, or
|
||||
- the diagnostic script confirms both Chromium and Chrome Headless Shell are present.
|
||||
|
||||
## Step 3: Choose the Fetcher
|
||||
|
||||
Use this decision rule:
|
||||
|
||||
- Start with `extract get` for normal pages, article pages, and most WeChat public articles.
|
||||
- Use `extract fetch` when the static HTML does not contain the real content or the page depends on JavaScript rendering.
|
||||
- Use `extract stealthy-fetch` only after `fetch` still fails because of anti-bot or challenge behavior. Do not make it the default.
|
||||
|
||||
## Step 4: Run the Smallest Useful Command
|
||||
|
||||
Always quote URLs in shell commands. This is mandatory in `zsh` when the URL contains `?`, `&`, or other special characters.
|
||||
|
||||
### Full page to HTML
|
||||
|
||||
```bash
|
||||
scrapling extract get 'https://example.com' page.html
|
||||
```
|
||||
|
||||
### Main content to Markdown
|
||||
|
||||
```bash
|
||||
scrapling extract get 'https://example.com' article.md -s 'main'
|
||||
```
|
||||
|
||||
### JS-rendered page with browser automation
|
||||
|
||||
```bash
|
||||
scrapling extract fetch 'https://example.com' page.html --timeout 20000
|
||||
```
|
||||
|
||||
### WeChat public article body
|
||||
|
||||
Use `#js_content` first. This is the default selector for article body extraction on `mp.weixin.qq.com` pages.
|
||||
|
||||
```bash
|
||||
scrapling extract get 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' article.md -s '#js_content'
|
||||
```
|
||||
|
||||
## Step 5: Validate the Output
|
||||
|
||||
After every extraction, verify the file instead of assuming success:
|
||||
|
||||
```bash
|
||||
wc -c article.md
|
||||
sed -n '1,40p' article.md
|
||||
```
|
||||
|
||||
For HTML output, check that the expected title, container, or selector target is actually present:
|
||||
|
||||
```bash
|
||||
rg -n '<title>|js_content|rich_media_title|main' page.html
|
||||
```
|
||||
|
||||
If the file is tiny, empty, or missing the expected container, the extraction did not succeed. Go back to Step 3 and switch fetchers or selectors.
|
||||
|
||||
## Step 6: Handle Known Failure Modes
|
||||
|
||||
### Local TLS trust store problem
|
||||
|
||||
If `extract get` fails with `curl: (60) SSL certificate problem`, treat it as a local trust-store problem first, not a Scrapling content failure.
|
||||
|
||||
Retry the same command with:
|
||||
|
||||
```bash
|
||||
--no-verify
|
||||
```
|
||||
|
||||
Only do this after confirming the failure matches the local certificate verification error pattern. Do not silently disable verification by default.
|
||||
|
||||
### WeChat article pages
|
||||
|
||||
For `mp.weixin.qq.com`:
|
||||
- Try `extract get` before `extract fetch`
|
||||
- Use `-s '#js_content'` for the article body
|
||||
- Validate the saved Markdown or HTML immediately
|
||||
|
||||
### Browser-backed fetch failures
|
||||
|
||||
If `extract fetch` fails:
|
||||
1. Re-check the install with `python3 scripts/diagnose_scrapling.py`
|
||||
2. Confirm Chromium and Chrome Headless Shell are present
|
||||
3. Retry with a slightly longer timeout
|
||||
4. Escalate to `stealthy-fetch` only if the site behavior justifies it
|
||||
|
||||
## Command Patterns
|
||||
|
||||
### Diagnose and smoke test a URL
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py --url 'https://example.com'
|
||||
```
|
||||
|
||||
### Diagnose and smoke test a WeChat article body
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py \
|
||||
--url 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' \
|
||||
--selector '#js_content' \
|
||||
--no-verify
|
||||
```
|
||||
|
||||
### Diagnose and smoke test a browser-backed fetch
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py \
|
||||
--url 'https://example.com' \
|
||||
--dynamic
|
||||
```
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Do not tell the user to reinstall blindly. Verify first.
|
||||
- Do not default to the Python library API when the user is clearly asking about the CLI.
|
||||
- Do not jump to browser-backed fetching unless the static result is missing the real content.
|
||||
- Do not claim success from exit code alone. Inspect the saved file.
|
||||
- Do not hardcode user-specific absolute paths into outputs or docs.
|
||||
|
||||
## Resources
|
||||
|
||||
- Installation and smoke test helper: `scripts/diagnose_scrapling.py`
|
||||
- Verified failure modes and recovery paths: `references/troubleshooting.md`
|
||||
164
scrapling-skill/references/troubleshooting.md
Normal file
164
scrapling-skill/references/troubleshooting.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# Scrapling Troubleshooting
|
||||
|
||||
## Contents
|
||||
|
||||
- Installation modes
|
||||
- Verified failure modes
|
||||
- Static vs dynamic fetch choice
|
||||
- WeChat extraction pattern
|
||||
- Smoke test commands
|
||||
|
||||
## Installation Modes
|
||||
|
||||
Use the CLI path as the default:
|
||||
|
||||
```bash
|
||||
uv tool install 'scrapling[shell]'
|
||||
```
|
||||
|
||||
Do not assume `uv tool install scrapling` is enough for CLI usage. The base package may install the executable wrapper without the optional CLI dependencies.
|
||||
|
||||
## Verified Failure Modes
|
||||
|
||||
### 1. CLI installed without extras
|
||||
|
||||
Symptom:
|
||||
|
||||
- `scrapling --help` fails
|
||||
- Output mentions missing `click`
|
||||
- Output says Scrapling must be installed with extras
|
||||
|
||||
Recovery:
|
||||
|
||||
```bash
|
||||
uv tool uninstall scrapling
|
||||
uv tool install 'scrapling[shell]'
|
||||
```
|
||||
|
||||
### 2. Browser-backed fetchers not ready
|
||||
|
||||
Symptom:
|
||||
|
||||
- `extract fetch` or `extract stealthy-fetch` fails because the Playwright runtime is not installed
|
||||
- Scrapling has not downloaded Chromium or Chrome Headless Shell
|
||||
|
||||
Recovery:
|
||||
|
||||
```bash
|
||||
scrapling install
|
||||
```
|
||||
|
||||
Success signals:
|
||||
|
||||
- `scrapling install` later reports `The dependencies are already installed`
|
||||
- Browser caches contain both:
|
||||
- `chromium-*`
|
||||
- `chromium_headless_shell-*`
|
||||
|
||||
Typical cache roots:
|
||||
|
||||
- `~/Library/Caches/ms-playwright/`
|
||||
- `~/.cache/ms-playwright/`
|
||||
|
||||
### 3. Static fetch TLS trust-store failure
|
||||
|
||||
Symptom:
|
||||
|
||||
- `extract get` fails with `curl: (60) SSL certificate problem`
|
||||
|
||||
Interpretation:
|
||||
|
||||
- Treat this as a local certificate verification problem first
|
||||
- Do not assume the target URL or Scrapling itself is broken
|
||||
|
||||
Recovery:
|
||||
|
||||
Retry the same static command with:
|
||||
|
||||
```bash
|
||||
--no-verify
|
||||
```
|
||||
|
||||
Do not make `--no-verify` the default. Use it only after the failure matches this certificate-verification pattern.
|
||||
|
||||
## Static vs Dynamic Fetch Choice
|
||||
|
||||
Use this order:
|
||||
|
||||
1. `extract get`
|
||||
2. `extract fetch`
|
||||
3. `extract stealthy-fetch`
|
||||
|
||||
Use `extract get` when:
|
||||
|
||||
- The page is mostly server-rendered
|
||||
- The content is likely already present in raw HTML
|
||||
- The target is an article page with a stable content container
|
||||
|
||||
Use `extract fetch` when:
|
||||
|
||||
- Static HTML does not contain the real content
|
||||
- The site depends on JavaScript rendering
|
||||
- The page content appears only after runtime hydration
|
||||
|
||||
Use `extract stealthy-fetch` when:
|
||||
|
||||
- `fetch` still fails
|
||||
- The target site shows challenge or anti-bot behavior
|
||||
|
||||
## WeChat Extraction Pattern
|
||||
|
||||
For `mp.weixin.qq.com` public article pages:
|
||||
|
||||
- Start with `extract get`
|
||||
- Use the selector `#js_content`
|
||||
- Validate the saved file immediately
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
scrapling extract get 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' article.md -s '#js_content'
|
||||
```
|
||||
|
||||
Observed behavior:
|
||||
|
||||
- The static fetch can already contain the real article body
|
||||
- Browser-backed fetch is often unnecessary for article extraction
|
||||
|
||||
## Smoke Test Commands
|
||||
|
||||
### Basic diagnosis
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py
|
||||
```
|
||||
|
||||
### Static extraction smoke test
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py --url 'https://example.com'
|
||||
```
|
||||
|
||||
### WeChat article smoke test
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py \
|
||||
--url 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' \
|
||||
--selector '#js_content'
|
||||
```
|
||||
|
||||
### Dynamic extraction smoke test
|
||||
|
||||
```bash
|
||||
python3 scripts/diagnose_scrapling.py \
|
||||
--url 'https://example.com' \
|
||||
--dynamic
|
||||
```
|
||||
|
||||
### Validate saved output
|
||||
|
||||
```bash
|
||||
wc -c article.md
|
||||
sed -n '1,40p' article.md
|
||||
rg -n '<title>|js_content|main|rich_media_title' page.html
|
||||
```
|
||||
191
scrapling-skill/scripts/diagnose_scrapling.py
Executable file
191
scrapling-skill/scripts/diagnose_scrapling.py
Executable file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Diagnose a local Scrapling CLI installation and optionally run a smoke test.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Tuple
|
||||
|
||||
|
||||
def run_command(cmd: List[str]) -> Tuple[int, str, str]:
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
universal_newlines=True,
|
||||
check=False,
|
||||
)
|
||||
return result.returncode, result.stdout, result.stderr
|
||||
|
||||
|
||||
def print_section(title: str) -> None:
|
||||
print("")
|
||||
print(title)
|
||||
print("-" * len(title))
|
||||
|
||||
|
||||
def existing_dirs(paths: Iterable[Path]) -> List[str]:
|
||||
return [str(path) for path in paths if path.exists()]
|
||||
|
||||
|
||||
def detect_browser_cache() -> Tuple[List[str], List[str]]:
|
||||
roots = [
|
||||
Path.home() / "Library" / "Caches" / "ms-playwright",
|
||||
Path.home() / ".cache" / "ms-playwright",
|
||||
]
|
||||
chromium = []
|
||||
headless_shell = []
|
||||
for root in roots:
|
||||
if not root.exists():
|
||||
continue
|
||||
chromium.extend(existing_dirs(sorted(root.glob("chromium-*"))))
|
||||
headless_shell.extend(existing_dirs(sorted(root.glob("chromium_headless_shell-*"))))
|
||||
return chromium, headless_shell
|
||||
|
||||
|
||||
def diagnose_cli() -> bool:
|
||||
print_section("CLI")
|
||||
scrapling_path = shutil.which("scrapling")
|
||||
if not scrapling_path:
|
||||
print("status: missing")
|
||||
print("fix: install with `uv tool install 'scrapling[shell]'`")
|
||||
return False
|
||||
|
||||
print("path: {0}".format(scrapling_path))
|
||||
code, stdout, stderr = run_command(["scrapling", "--help"])
|
||||
output = (stdout + "\n" + stderr).strip()
|
||||
|
||||
if code == 0:
|
||||
print("status: working")
|
||||
return True
|
||||
|
||||
print("status: broken")
|
||||
if "install scrapling with any of the extras" in output.lower() or "no module named 'click'" in output.lower():
|
||||
print("cause: installed without CLI extras")
|
||||
print("fix: `uv tool uninstall scrapling` then `uv tool install 'scrapling[shell]'`")
|
||||
else:
|
||||
print("cause: unknown")
|
||||
|
||||
if output:
|
||||
print("details:")
|
||||
print(output[:1200])
|
||||
return False
|
||||
|
||||
|
||||
def diagnose_browsers() -> None:
|
||||
print_section("Browser Runtime")
|
||||
chromium, headless_shell = detect_browser_cache()
|
||||
print("chromium: {0}".format("present" if chromium else "missing"))
|
||||
for path in chromium:
|
||||
print(" - {0}".format(path))
|
||||
print("chrome-headless-shell: {0}".format("present" if headless_shell else "missing"))
|
||||
for path in headless_shell:
|
||||
print(" - {0}".format(path))
|
||||
if not chromium or not headless_shell:
|
||||
print("hint: run `scrapling install` before browser-backed fetches")
|
||||
|
||||
|
||||
def preview_file(path: Path, preview_lines: int) -> None:
|
||||
print_section("Smoke Test Output")
|
||||
if not path.exists():
|
||||
print("status: missing output file")
|
||||
return
|
||||
|
||||
size = path.stat().st_size
|
||||
print("path: {0}".format(path))
|
||||
print("bytes: {0}".format(size))
|
||||
if size == 0:
|
||||
print("status: empty")
|
||||
return
|
||||
|
||||
if path.suffix in (".md", ".txt"):
|
||||
print("preview:")
|
||||
with path.open("r", encoding="utf-8", errors="replace") as handle:
|
||||
for index, line in enumerate(handle):
|
||||
if index >= preview_lines:
|
||||
break
|
||||
print(line.rstrip())
|
||||
|
||||
|
||||
def run_smoke_test(args: argparse.Namespace) -> int:
|
||||
print_section("Smoke Test")
|
||||
|
||||
suffix = ".html"
|
||||
if args.selector:
|
||||
suffix = ".md"
|
||||
|
||||
output_path = Path(tempfile.gettempdir()) / ("scrapling-smoke" + suffix)
|
||||
if output_path.exists():
|
||||
output_path.unlink()
|
||||
|
||||
cmd = ["scrapling", "extract", "fetch" if args.dynamic else "get", args.url, str(output_path)]
|
||||
if args.selector:
|
||||
cmd.extend(["-s", args.selector])
|
||||
if args.dynamic:
|
||||
cmd.extend(["--timeout", str(args.timeout)])
|
||||
elif args.no_verify:
|
||||
cmd.append("--no-verify")
|
||||
|
||||
print("command: {0}".format(" ".join(cmd)))
|
||||
code, stdout, stderr = run_command(cmd)
|
||||
if stdout.strip():
|
||||
print(stdout.strip())
|
||||
if stderr.strip():
|
||||
print(stderr.strip())
|
||||
|
||||
preview_file(output_path, args.preview_lines)
|
||||
return code
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(description="Diagnose Scrapling and run an optional smoke test.")
|
||||
parser.add_argument("--url", help="Optional URL for a smoke test")
|
||||
parser.add_argument("--selector", help="Optional CSS selector for the smoke test")
|
||||
parser.add_argument(
|
||||
"--dynamic",
|
||||
action="store_true",
|
||||
help="Use `scrapling extract fetch` instead of `scrapling extract get`",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-verify",
|
||||
action="store_true",
|
||||
help="Pass `--no-verify` to static smoke tests",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=20000,
|
||||
help="Timeout in milliseconds for dynamic smoke tests",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preview-lines",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Number of preview lines for markdown/text output",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = build_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
cli_ok = diagnose_cli()
|
||||
diagnose_browsers()
|
||||
|
||||
if not cli_ok:
|
||||
return 1
|
||||
|
||||
if not args.url:
|
||||
return 0
|
||||
|
||||
return run_smoke_test(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -15,6 +15,13 @@ import re
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PACKAGE_ROOT = SCRIPT_DIR.parent
|
||||
if str(PACKAGE_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PACKAGE_ROOT))
|
||||
|
||||
from scripts.quick_validate import validate_skill
|
||||
from scripts.security_scan import calculate_skill_hash
|
||||
|
||||
@@ -41,7 +48,7 @@ def should_exclude(rel_path: Path) -> bool:
|
||||
return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS)
|
||||
|
||||
|
||||
def validate_security_marker(skill_path: Path) -> tuple[bool, str]:
|
||||
def validate_security_marker(skill_path: Path) -> Tuple[bool, str]:
|
||||
"""
|
||||
Validate security marker file exists and hash matches current content
|
||||
|
||||
|
||||
Reference in New Issue
Block a user