release: add scrapling-skill and fix script compatibility

- add scrapling-skill with validated CLI workflow, diagnostics, packaging, and docs integration
- fix skill-creator package_skill.py so direct script invocation works from repo root
- fix continue-claude-work extract_resume_context.py typing compatibility for local python3
- bump marketplace to 1.39.0 and update skill versions
This commit is contained in:
daymade
2026-03-18 23:08:55 +08:00
parent d8a7d45e53
commit 2192458ef7
13 changed files with 722 additions and 36 deletions

View File

@@ -5,8 +5,8 @@
"email": "daymadev89@gmail.com" "email": "daymadev89@gmail.com"
}, },
"metadata": { "metadata": {
"description": "Professional Claude Code skills for GitHub operations, document conversion, diagram generation, statusline customization, Teams communication, repomix utilities, skill creation, CLI demo generation, LLM icon access, Cloudflare troubleshooting, UI design system extraction, professional presentation creation, YouTube video downloading, secure repomix packaging, ASR transcription correction, video comparison quality analysis, comprehensive QA testing infrastructure, prompt optimization with EARS methodology, session history recovery, local Claude session continuation from `.claude` artifacts, documentation cleanup, format-controlled deep research report generation with evidence tracking, PDF generation with Chinese font support, CLAUDE.md progressive disclosure optimization, CCPM skill registry search and management, Promptfoo LLM evaluation framework, iOS app development with XcodeGen and SwiftUI, fact-checking with automated corrections, Twitter/X content fetching, intelligent macOS disk space recovery, skill quality review and improvement, GitHub contribution strategy, complete internationalization/localization setup, plugin/skill troubleshooting with diagnostic tools, evidence-based competitor analysis with source citations, Windows Remote Desktop (AVD/W365) connection quality diagnosis with transport protocol analysis and log parsing, Tailscale+proxy conflict diagnosis with SSH tunnel SOP for remote development, multi-path parallel product analysis with cross-model test-time compute scaling, real financial data collection for US equities with validation and yfinance pitfall handling, advanced Excel automation for formatted workbook generation and complex xlsm parsing, and macOS programmatic window screenshot capture workflows", "description": "Professional Claude Code skills for GitHub operations, document conversion, diagram generation, statusline customization, Teams communication, repomix utilities, skill creation, CLI demo generation, LLM icon 
access, Cloudflare troubleshooting, UI design system extraction, professional presentation creation, YouTube video downloading, secure repomix packaging, ASR transcription correction, video comparison quality analysis, comprehensive QA testing infrastructure, prompt optimization with EARS methodology, session history recovery, local Claude session continuation from `.claude` artifacts, documentation cleanup, format-controlled deep research report generation with evidence tracking, PDF generation with Chinese font support, CLAUDE.md progressive disclosure optimization, CCPM skill registry search and management, Promptfoo LLM evaluation framework, iOS app development with XcodeGen and SwiftUI, fact-checking with automated corrections, Twitter/X content fetching, intelligent macOS disk space recovery, skill quality review and improvement, GitHub contribution strategy, complete internationalization/localization setup, plugin/skill troubleshooting with diagnostic tools, evidence-based competitor analysis with source citations, Windows Remote Desktop (AVD/W365) connection quality diagnosis with transport protocol analysis and log parsing, Tailscale+proxy conflict diagnosis with SSH tunnel SOP for remote development, multi-path parallel product analysis with cross-model test-time compute scaling, real financial data collection for US equities with validation and yfinance pitfall handling, advanced Excel automation for formatted workbook generation and complex xlsm parsing, macOS programmatic window screenshot capture workflows, and verified Scrapling CLI installation and web extraction workflows",
"version": "1.38.0", "version": "1.39.0",
"homepage": "https://github.com/daymade/claude-code-skills" "homepage": "https://github.com/daymade/claude-code-skills"
}, },
"plugins": [ "plugins": [
@@ -15,7 +15,7 @@
"description": "Essential meta-skill for creating effective Claude Code skills with initialization scripts, validation, packaging, marketplace registration, and privacy best practices", "description": "Essential meta-skill for creating effective Claude Code skills with initialization scripts, validation, packaging, marketplace registration, and privacy best practices",
"source": "./", "source": "./",
"strict": false, "strict": false,
"version": "1.5.0", "version": "1.5.1",
"category": "developer-tools", "category": "developer-tools",
"keywords": [ "keywords": [
"skill-creation", "skill-creation",
@@ -500,7 +500,7 @@
"description": "Develops iOS applications with XcodeGen, SwiftUI, and SPM. Use when configuring XcodeGen project.yml, resolving SPM dependency issues, deploying to devices, handling code signing, debugging camera/AVFoundation, iOS version compatibility issues, or fixing Library not loaded @rpath framework errors. Includes state machine testing patterns for @MainActor classes", "description": "Develops iOS applications with XcodeGen, SwiftUI, and SPM. Use when configuring XcodeGen project.yml, resolving SPM dependency issues, deploying to devices, handling code signing, debugging camera/AVFoundation, iOS version compatibility issues, or fixing Library not loaded @rpath framework errors. Includes state machine testing patterns for @MainActor classes",
"source": "./", "source": "./",
"strict": false, "strict": false,
"version": "1.1.1", "version": "1.1.0",
"category": "developer-tools", "category": "developer-tools",
"keywords": [ "keywords": [
"ios", "ios",
@@ -565,7 +565,7 @@
"description": "Intelligent macOS disk space analysis and cleanup with safety-first philosophy. Use when users report disk space issues, need to clean their Mac, or want to understand storage consumption. Analyzes system caches, application remnants, large files, and development environments (Docker, Homebrew, npm, pip) with risk categorization (Safe/Caution/Keep) and requires explicit user confirmation before any deletions. Includes Mole visual tool integration for hybrid workflow", "description": "Intelligent macOS disk space analysis and cleanup with safety-first philosophy. Use when users report disk space issues, need to clean their Mac, or want to understand storage consumption. Analyzes system caches, application remnants, large files, and development environments (Docker, Homebrew, npm, pip) with risk categorization (Safe/Caution/Keep) and requires explicit user confirmation before any deletions. Includes Mole visual tool integration for hybrid workflow",
"source": "./", "source": "./",
"strict": false, "strict": false,
"version": "1.1.1", "version": "1.1.0",
"category": "utilities", "category": "utilities",
"keywords": [ "keywords": [
"macos", "macos",
@@ -882,7 +882,7 @@
"description": "Recover actionable context from local `.claude` session artifacts and continue interrupted work without running `claude --resume`. Extracts compact boundary summaries, subagent workflow state, session end reason, and workspace drift via bundled Python script. Use when a user provides a Claude session ID, asks to continue prior work from local history, or wants to inspect `.claude` files before resuming implementation", "description": "Recover actionable context from local `.claude` session artifacts and continue interrupted work without running `claude --resume`. Extracts compact boundary summaries, subagent workflow state, session end reason, and workspace drift via bundled Python script. Use when a user provides a Claude session ID, asks to continue prior work from local history, or wants to inspect `.claude` files before resuming implementation",
"source": "./", "source": "./",
"strict": false, "strict": false,
"version": "1.1.0", "version": "1.1.1",
"category": "developer-tools", "category": "developer-tools",
"keywords": [ "keywords": [
"claude-code", "claude-code",
@@ -898,6 +898,27 @@
"skills": [ "skills": [
"./continue-claude-work" "./continue-claude-work"
] ]
},
{
"name": "scrapling-skill",
"description": "Install, troubleshoot, and use Scrapling CLI for extracting HTML, Markdown, or text from webpages. Diagnoses missing extras, Playwright browser runtime issues, TLS verification failures, and WeChat public article extraction patterns. Use when users mention Scrapling, `scrapling extract`, `uv tool install scrapling`, or need to decide between static and browser-backed fetching",
"source": "./",
"strict": false,
"version": "1.0.0",
"category": "developer-tools",
"keywords": [
"scrapling",
"web-scraping",
"html",
"markdown",
"playwright",
"wechat",
"extraction",
"cli"
],
"skills": [
"./scrapling-skill"
]
} }
] ]
} }

View File

@@ -10,6 +10,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ### Added
- None - None
## [1.39.0] - 2026-03-18
### Added
- **New Skill**: scrapling-skill v1.0.0 - Reliable Scrapling CLI installation, troubleshooting, and extraction workflows for HTML, Markdown, and text output
- Bundled `diagnose_scrapling.py` script to verify CLI health, detect missing extras, inspect Playwright browser runtime, and run real smoke tests
- Static-first workflow for choosing between `extract get`, `extract fetch`, and `stealthy-fetch`
- Verified WeChat public article extraction pattern using `#js_content`
- Verified recovery path for local TLS trust-store failures via `--no-verify`
- Bundled troubleshooting reference covering extras, browser runtime, and output validation
### Changed
- **skill-creator** v1.5.0 → v1.5.1: Fixed `scripts/package_skill.py` so it works when invoked directly from the repository root instead of only via `python -m`
- **continue-claude-work** v1.1.0 → v1.1.1: Replaced newer Python-only type syntax in `extract_resume_context.py` so the script runs under the local `python3` environment
- Updated marketplace skills/plugins count from 42 to 43
- Updated marketplace version from 1.38.0 to 1.39.0
- Updated marketplace metadata description to include Scrapling CLI extraction workflows
- Updated README.md and README.zh-CN.md badges, installation commands, skill listings, use cases, quick links, and requirements
- Updated CLAUDE.md counts, version reference, and Available Skills list (added #43)
## [1.38.0] - 2026-03-07 ## [1.38.0] - 2026-03-07
### Added ### Added

View File

@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
## Repository Overview ## Repository Overview
This is a Claude Code skills marketplace containing 42 production-ready skills organized in a plugin marketplace structure. Each skill is a self-contained package that extends Claude's capabilities with specialized knowledge, workflows, and bundled resources. This is a Claude Code skills marketplace containing 43 production-ready skills organized in a plugin marketplace structure. Each skill is a self-contained package that extends Claude's capabilities with specialized knowledge, workflows, and bundled resources.
**Essential Skill**: `skill-creator` is the most important skill in this marketplace - it's a meta-skill that enables users to create their own skills. Always recommend it first for users interested in extending Claude Code. **Essential Skill**: `skill-creator` is the most important skill in this marketplace - it's a meta-skill that enables users to create their own skills. Always recommend it first for users interested in extending Claude Code.
@@ -134,7 +134,7 @@ Skills for public distribution must NOT contain:
## Marketplace Configuration ## Marketplace Configuration
The marketplace is configured in `.claude-plugin/marketplace.json`: The marketplace is configured in `.claude-plugin/marketplace.json`:
- Contains 42 plugins, each mapping to one skill - Contains 43 plugins, each mapping to one skill
- Each plugin has: name, description, version, category, keywords, skills array - Each plugin has: name, description, version, category, keywords, skills array
- Marketplace metadata: name, owner, version, homepage - Marketplace metadata: name, owner, version, homepage
@@ -144,7 +144,7 @@ The marketplace is configured in `.claude-plugin/marketplace.json`:
1. **Marketplace Version** (`.claude-plugin/marketplace.json` → `metadata.version`) 1. **Marketplace Version** (`.claude-plugin/marketplace.json` → `metadata.version`)
- Tracks the marketplace catalog as a whole - Tracks the marketplace catalog as a whole
- Current: v1.38.0 - Current: v1.39.0
- Bump when: Adding/removing skills, major marketplace restructuring - Bump when: Adding/removing skills, major marketplace restructuring
- Semantic versioning: MAJOR.MINOR.PATCH - Semantic versioning: MAJOR.MINOR.PATCH
@@ -219,6 +219,7 @@ This applies when you change ANY file under a skill directory:
40. **excel-automation** - Create formatted Excel files, parse complex xlsm models, and control Excel windows on macOS via AppleScript 40. **excel-automation** - Create formatted Excel files, parse complex xlsm models, and control Excel windows on macOS via AppleScript
41. **capture-screen** - Programmatically capture macOS application windows using Swift window ID discovery and screencapture workflows 41. **capture-screen** - Programmatically capture macOS application windows using Swift window ID discovery and screencapture workflows
42. **continue-claude-work** - Recover local `.claude` session context via compact-boundary extraction, subagent workflow recovery, and session end reason detection, then continue interrupted work without `claude --resume` 42. **continue-claude-work** - Recover local `.claude` session context via compact-boundary extraction, subagent workflow recovery, and session end reason detection, then continue interrupted work without `claude --resume`
43. **scrapling-skill** - Install, troubleshoot, and use Scrapling CLI for static/dynamic web extraction, WeChat article capture, and verified output validation
**Recommendation**: Always suggest `skill-creator` first for users interested in creating skills or extending Claude Code. **Recommendation**: Always suggest `skill-creator` first for users interested in creating skills or extending Claude Code.

View File

@@ -6,15 +6,15 @@
[![简体中文](https://img.shields.io/badge/语言-简体中文-red)](./README.zh-CN.md) [![简体中文](https://img.shields.io/badge/语言-简体中文-red)](./README.zh-CN.md)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Skills](https://img.shields.io/badge/skills-42-blue.svg)](https://github.com/daymade/claude-code-skills) [![Skills](https://img.shields.io/badge/skills-43-blue.svg)](https://github.com/daymade/claude-code-skills)
[![Version](https://img.shields.io/badge/version-1.38.0-green.svg)](https://github.com/daymade/claude-code-skills) [![Version](https://img.shields.io/badge/version-1.39.0-green.svg)](https://github.com/daymade/claude-code-skills)
[![Claude Code](https://img.shields.io/badge/Claude%20Code-2.0.13+-purple.svg)](https://claude.com/code) [![Claude Code](https://img.shields.io/badge/Claude%20Code-2.0.13+-purple.svg)](https://claude.com/code)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](./CONTRIBUTING.md) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](./CONTRIBUTING.md)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/daymade/claude-code-skills/graphs/commit-activity) [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/daymade/claude-code-skills/graphs/commit-activity)
</div> </div>
Professional Claude Code skills marketplace featuring 42 production-ready skills for enhanced development workflows. Professional Claude Code skills marketplace featuring 43 production-ready skills for enhanced development workflows.
## 📑 Table of Contents ## 📑 Table of Contents
@@ -240,6 +240,9 @@ claude plugin install capture-screen@daymade-skills
# Resume interrupted Claude work from local session artifacts # Resume interrupted Claude work from local session artifacts
claude plugin install continue-claude-work@daymade-skills claude plugin install continue-claude-work@daymade-skills
# Scrapling CLI extraction and troubleshooting
claude plugin install scrapling-skill@daymade-skills
``` ```
Each skill can be installed independently - choose only what you need! Each skill can be installed independently - choose only what you need!
@@ -1787,6 +1790,44 @@ claude plugin install continue-claude-work@daymade-skills
--- ---
### 43. **scrapling-skill** - Reliable Scrapling CLI Workflows
Install, troubleshoot, and use Scrapling CLI with a verified static-first workflow for extracting HTML, Markdown, or text from webpages. Includes a diagnostic script for broken extras installs, Playwright browser runtime checks, and smoke tests against real URLs.
**When to use:**
- Users mention Scrapling, `uv tool install scrapling`, or `scrapling extract`
- You need to choose between static and browser-backed fetching
- You need to extract article bodies from WeChat public pages (`mp.weixin.qq.com`)
- A Scrapling install works partially but fails on missing extras, browser runtime, or TLS verification
**Key features:**
- Bundled `diagnose_scrapling.py` script for CLI, browser runtime, and live URL smoke tests
- Verified default path: start with `extract get`, escalate to `extract fetch` only when needed
- WeChat extraction pattern using `#js_content` for clean article Markdown
- Troubleshooting guidance for missing `click`, Playwright runtime setup, and `curl: (60)` trust-store failures
- Output validation workflow using file size and content checks instead of exit-code assumptions
**Example usage:**
```bash
# Install the skill
claude plugin install scrapling-skill@daymade-skills
# Then ask Claude to work through Scrapling for you
"Install Scrapling CLI and verify the setup"
"Extract this WeChat article into Markdown with Scrapling"
"Decide whether this page needs static or browser-backed fetching"
```
**🎬 Live Demo**
*Coming soon*
📚 **Documentation**: See [scrapling-skill/SKILL.md](./scrapling-skill/SKILL.md) and [scrapling-skill/references/troubleshooting.md](./scrapling-skill/references/troubleshooting.md).
**Requirements**: Python 3.6+, `uv`, Scrapling CLI, and Playwright browser runtime for browser-backed fetches.
---
## 🎬 Interactive Demo Gallery ## 🎬 Interactive Demo Gallery
Want to see all demos in one place with click-to-enlarge functionality? Check out our [interactive demo gallery](./demos/index.html) or browse the [demos directory](./demos/). Want to see all demos in one place with click-to-enlarge functionality? Check out our [interactive demo gallery](./demos/index.html) or browse the [demos directory](./demos/).
@@ -1853,6 +1894,9 @@ Use **claude-code-history-files-finder** to recover deleted files from previous
### For Resuming Interrupted Claude Sessions ### For Resuming Interrupted Claude Sessions
Use **continue-claude-work** to recover the last actionable request from local `~/.claude` artifacts and continue implementation without reopening the original session. Combine with **claude-code-history-files-finder** when you need broader cross-session search, statistics, or deleted-file recovery. Use **continue-claude-work** to recover the last actionable request from local `~/.claude` artifacts and continue implementation without reopening the original session. Combine with **claude-code-history-files-finder** when you need broader cross-session search, statistics, or deleted-file recovery.
### For Web Extraction & WeChat Articles
Use **scrapling-skill** to install and validate Scrapling CLI, choose between static and browser-backed fetching, and extract clean Markdown from sites like `mp.weixin.qq.com`. Combine with **deep-research** to turn extracted sources into structured reports or with **docs-cleaner** to normalize captured article content.
### For Documentation Maintenance ### For Documentation Maintenance
Use **docs-cleaner** to consolidate redundant documentation while preserving valuable content. Perfect for cleaning up documentation sprawl after rapid development phases or merging overlapping docs into authoritative sources. Use **docs-cleaner** to consolidate redundant documentation while preserving valuable content. Perfect for cleaning up documentation sprawl after rapid development phases or merging overlapping docs into authoritative sources.
@@ -1941,6 +1985,7 @@ Each skill includes:
- **excel-automation**: See `excel-automation/SKILL.md` for create/parse/control workflows and `excel-automation/references/formatting-reference.md` for formatting standards - **excel-automation**: See `excel-automation/SKILL.md` for create/parse/control workflows and `excel-automation/references/formatting-reference.md` for formatting standards
- **capture-screen**: See `capture-screen/SKILL.md` for CGWindowID-based screenshot workflows on macOS - **capture-screen**: See `capture-screen/SKILL.md` for CGWindowID-based screenshot workflows on macOS
- **continue-claude-work**: See `continue-claude-work/SKILL.md` for local artifact recovery, drift checks, and resume workflow - **continue-claude-work**: See `continue-claude-work/SKILL.md` for local artifact recovery, drift checks, and resume workflow
- **scrapling-skill**: See `scrapling-skill/SKILL.md` for the CLI workflow and `scrapling-skill/references/troubleshooting.md` for verified Scrapling failure modes
## 🛠️ Requirements ## 🛠️ Requirements
@@ -1967,6 +2012,7 @@ Each skill includes:
- **uv + openpyxl** (for excel-automation): `uv run --with openpyxl ...` - **uv + openpyxl** (for excel-automation): `uv run --with openpyxl ...`
- **macOS** (for capture-screen and excel-automation AppleScript control workflows) - **macOS** (for capture-screen and excel-automation AppleScript control workflows)
- **Python 3.8+** (for continue-claude-work): bundled script for session extraction (no external dependencies) - **Python 3.8+** (for continue-claude-work): bundled script for session extraction (no external dependencies)
- **uv + Scrapling CLI** (for scrapling-skill): `uv tool install 'scrapling[shell]'` and `scrapling install` for browser-backed fetches
## ❓ FAQ ## ❓ FAQ

View File

@@ -6,15 +6,15 @@
[![简体中文](https://img.shields.io/badge/语言-简体中文-red)](./README.zh-CN.md) [![简体中文](https://img.shields.io/badge/语言-简体中文-red)](./README.zh-CN.md)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Skills](https://img.shields.io/badge/skills-42-blue.svg)](https://github.com/daymade/claude-code-skills) [![Skills](https://img.shields.io/badge/skills-43-blue.svg)](https://github.com/daymade/claude-code-skills)
[![Version](https://img.shields.io/badge/version-1.38.0-green.svg)](https://github.com/daymade/claude-code-skills) [![Version](https://img.shields.io/badge/version-1.39.0-green.svg)](https://github.com/daymade/claude-code-skills)
[![Claude Code](https://img.shields.io/badge/Claude%20Code-2.0.13+-purple.svg)](https://claude.com/code) [![Claude Code](https://img.shields.io/badge/Claude%20Code-2.0.13+-purple.svg)](https://claude.com/code)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](./CONTRIBUTING.md) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](./CONTRIBUTING.md)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/daymade/claude-code-skills/graphs/commit-activity) [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/daymade/claude-code-skills/graphs/commit-activity)
</div> </div>
专业的 Claude Code 技能市场,提供 42 个生产就绪的技能,用于增强开发工作流。 专业的 Claude Code 技能市场,提供 43 个生产就绪的技能,用于增强开发工作流。
## 📑 目录 ## 📑 目录
@@ -243,6 +243,9 @@ claude plugin install capture-screen@daymade-skills
# 基于本地会话产物续做中断的 Claude 工作 # 基于本地会话产物续做中断的 Claude 工作
claude plugin install continue-claude-work@daymade-skills claude plugin install continue-claude-work@daymade-skills
# Scrapling CLI 抽取与故障排查
claude plugin install scrapling-skill@daymade-skills
``` ```
每个技能都可以独立安装 - 只选择你需要的! 每个技能都可以独立安装 - 只选择你需要的!
@@ -1829,6 +1832,44 @@ claude plugin install continue-claude-work@daymade-skills
--- ---
### 43. **scrapling-skill** - 可靠的 Scrapling CLI 工作流
围绕 Scrapling CLI 提供经过验证的安装、排障与网页抽取工作流,用于从网页输出 HTML、Markdown 或纯文本。内置诊断脚本,可检查 extras 安装问题、Playwright 浏览器运行时,以及真实 URL 的烟测结果。
**使用场景:**
- 用户提到 Scrapling、`uv tool install scrapling` 或 `scrapling extract`
- 需要判断应该使用静态抓取还是浏览器抓取
- 需要从微信公众号页面(`mp.weixin.qq.com`)提取正文
- Scrapling 安装看似成功,但在 extras、浏览器运行时或 TLS 校验上失败
**主要功能:**
- 内置 `diagnose_scrapling.py`,检查 CLI、浏览器运行时与真实 URL 烟测
- 经过验证的默认路径:先用 `extract get`,只有必要时再升级到 `extract fetch`
- 针对微信公众号文章的 `#js_content` 提取模式
- 覆盖缺少 `click`、Playwright 运行时缺失、`curl: (60)` 证书问题等真实故障
- 用文件大小和内容验证结果,而不是只看退出码
**示例用法:**
```bash
# 安装技能
claude plugin install scrapling-skill@daymade-skills
# 然后让 Claude 代你跑 Scrapling
"安装 Scrapling CLI 并验证配置"
"用 Scrapling 把这篇微信公众号文章提取成 Markdown"
"判断这个页面应不应该走浏览器抓取"
```
**🎬 实时演示**
*即将推出*
📚 **文档**:参见 [scrapling-skill/SKILL.md](./scrapling-skill/SKILL.md) 和 [scrapling-skill/references/troubleshooting.md](./scrapling-skill/references/troubleshooting.md)。
**要求**Python 3.6+、`uv`、Scrapling CLI如需浏览器抓取还需要 Playwright 浏览器运行时。
---
## 🎬 交互式演示画廊 ## 🎬 交互式演示画廊
想要在一个地方查看所有演示并具有点击放大功能?访问我们的[交互式演示画廊](./demos/index.html)或浏览[演示目录](./demos/)。 想要在一个地方查看所有演示并具有点击放大功能?访问我们的[交互式演示画廊](./demos/index.html)或浏览[演示目录](./demos/)。
@@ -1895,6 +1936,9 @@ claude plugin install continue-claude-work@daymade-skills
### 续做中断的 Claude 会话 ### 续做中断的 Claude 会话
使用 **continue-claude-work** 从本地 `~/.claude` 产物中恢复最后一个可执行请求,并在不重新打开原始会话的情况下继续实现。若还需要跨会话搜索、统计分析或恢复已删除文件,可与 **claude-code-history-files-finder** 配合使用。 使用 **continue-claude-work** 从本地 `~/.claude` 产物中恢复最后一个可执行请求,并在不重新打开原始会话的情况下继续实现。若还需要跨会话搜索、统计分析或恢复已删除文件,可与 **claude-code-history-files-finder** 配合使用。
### 网页提取与微信公众号文章
使用 **scrapling-skill** 安装并验证 Scrapling CLI判断应使用静态抓取还是浏览器抓取并从 `mp.weixin.qq.com` 等页面提取干净的 Markdown。可与 **deep-research** 配合,将抓取内容整理为结构化报告,或与 **docs-cleaner** 配合清理抽取后的文章内容。
### 文档维护 ### 文档维护
使用 **docs-cleaner** 在保留有价值内容的同时整合冗余文档。非常适合在快速开发阶段后清理文档扩散或将重叠的文档合并为权威来源。 使用 **docs-cleaner** 在保留有价值内容的同时整合冗余文档。非常适合在快速开发阶段后清理文档扩散或将重叠的文档合并为权威来源。
@@ -1983,6 +2027,7 @@ claude plugin install continue-claude-work@daymade-skills
- **excel-automation**:参见 `excel-automation/SKILL.md` 了解创建/解析/控制工作流,参见 `excel-automation/references/formatting-reference.md` 了解格式规范 - **excel-automation**:参见 `excel-automation/SKILL.md` 了解创建/解析/控制工作流,参见 `excel-automation/references/formatting-reference.md` 了解格式规范
- **capture-screen**:参见 `capture-screen/SKILL.md` 了解基于 CGWindowID 的 macOS 截图流程 - **capture-screen**:参见 `capture-screen/SKILL.md` 了解基于 CGWindowID 的 macOS 截图流程
- **continue-claude-work**:参见 `continue-claude-work/SKILL.md` 了解本地会话产物恢复、漂移检查与续做流程 - **continue-claude-work**:参见 `continue-claude-work/SKILL.md` 了解本地会话产物恢复、漂移检查与续做流程
- **scrapling-skill**:参见 `scrapling-skill/SKILL.md` 了解 CLI 工作流,参见 `scrapling-skill/references/troubleshooting.md` 了解已验证的 Scrapling 故障模式
## 🛠️ 系统要求 ## 🛠️ 系统要求
@@ -2006,6 +2051,7 @@ claude plugin install continue-claude-work@daymade-skills
- **uv + openpyxl**(用于 excel-automation`uv run --with openpyxl ...` - **uv + openpyxl**(用于 excel-automation`uv run --with openpyxl ...`
- **macOS**(用于 capture-screen 与 excel-automation 的 AppleScript 控制流程) - **macOS**(用于 capture-screen 与 excel-automation 的 AppleScript 控制流程)
- **Python 3.8+**(用于 continue-claude-work内置脚本进行会话提取无外部依赖 - **Python 3.8+**(用于 continue-claude-work内置脚本进行会话提取无外部依赖
- **uv + Scrapling CLI**(用于 scrapling-skill`uv tool install 'scrapling[shell]'`,浏览器抓取前运行 `scrapling install`
## ❓ 常见问题 ## ❓ 常见问题

View File

@@ -1,4 +1,4 @@
Security scan passed Security scan passed
Scanned at: 2026-03-07T14:27:12.638956 Scanned at: 2026-03-18T23:02:18.627209
Tool: gitleaks + pattern-based validation Tool: gitleaks + pattern-based validation
Content hash: c464aa735e8b7832c2c77e4cea22fff9c7e6117ecee4f6769f5eb62cced8a11a Content hash: 62e456422cabfe74e5757a802044dac45d1662341b2e90dc943685db81d8f659

View File

@@ -37,6 +37,7 @@ import subprocess
import sys import sys
import time import time
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional
CLAUDE_DIR = Path.home() / ".claude" CLAUDE_DIR = Path.home() / ".claude"
PROJECTS_DIR = CLAUDE_DIR / "projects" PROJECTS_DIR = CLAUDE_DIR / "projects"
@@ -59,7 +60,7 @@ def normalize_path(project_path: str) -> str:
return project_path.replace("/", "-") return project_path.replace("/", "-")
def find_project_dir(project_path: str) -> Path | None: def find_project_dir(project_path: str) -> Optional[Path]:
"""Find the Claude projects directory for a given project path.""" """Find the Claude projects directory for a given project path."""
abs_path = os.path.abspath(project_path) abs_path = os.path.abspath(project_path)
@@ -85,7 +86,7 @@ def find_project_dir(project_path: str) -> Path | None:
return None return None
def load_sessions_index(project_dir: Path) -> list[dict]: def load_sessions_index(project_dir: Path) -> List[Dict]:
"""Load and parse sessions-index.json, sorted by modified desc.""" """Load and parse sessions-index.json, sorted by modified desc."""
index_file = project_dir / "sessions-index.json" index_file = project_dir / "sessions-index.json"
if not index_file.exists(): if not index_file.exists():
@@ -97,7 +98,7 @@ def load_sessions_index(project_dir: Path) -> list[dict]:
return entries return entries
def search_sessions(entries: list[dict], query: str) -> list[dict]: def search_sessions(entries: List[Dict], query: str) -> List[Dict]:
"""Search sessions by keyword in firstPrompt and summary.""" """Search sessions by keyword in firstPrompt and summary."""
query_lower = query.lower() query_lower = query.lower()
results = [] results = []
@@ -109,7 +110,7 @@ def search_sessions(entries: list[dict], query: str) -> list[dict]:
return results return results
def format_session_entry(entry: dict, file_exists: bool = True) -> str: def format_session_entry(entry: Dict, file_exists: bool = True) -> str:
"""Format a session index entry for display.""" """Format a session index entry for display."""
sid = entry.get("sessionId", "?") sid = entry.get("sessionId", "?")
modified = entry.get("modified", "?") modified = entry.get("modified", "?")
@@ -123,7 +124,7 @@ def format_session_entry(entry: dict, file_exists: bool = True) -> str:
# ── Session file parsing ──────────────────────────────────────────── # ── Session file parsing ────────────────────────────────────────────
def parse_session_structure(session_file: Path) -> dict: def parse_session_structure(session_file: Path) -> Dict:
"""Parse a session JSONL file and return structured data.""" """Parse a session JSONL file and return structured data."""
file_size = session_file.stat().st_size file_size = session_file.stat().st_size
total_lines = 0 total_lines = 0
@@ -276,8 +277,8 @@ def parse_session_structure(session_file: Path) -> dict:
def _detect_end_reason( def _detect_end_reason(
last_role: str | None, last_role: Optional[str],
unresolved: dict, unresolved: Dict,
error_count: int, error_count: int,
) -> str: ) -> str:
"""Detect why the session ended.""" """Detect why the session ended."""
@@ -300,7 +301,7 @@ def _is_noise_user_text(text: str) -> bool:
return False return False
def extract_user_text(messages: list[dict], limit: int = 5) -> list[str]: def extract_user_text(messages: List[Dict], limit: int = 5) -> List[str]:
"""Extract the last N user text messages (not tool results or system noise).""" """Extract the last N user text messages (not tool results or system noise)."""
user_texts = [] user_texts = []
for msg_obj in reversed(messages): for msg_obj in reversed(messages):
@@ -329,7 +330,7 @@ def extract_user_text(messages: list[dict], limit: int = 5) -> list[str]:
return user_texts return user_texts
def extract_assistant_text(messages: list[dict], limit: int = 3) -> list[str]: def extract_assistant_text(messages: List[Dict], limit: int = 3) -> List[str]:
"""Extract the last N assistant text responses (no thinking/tool_use).""" """Extract the last N assistant text responses (no thinking/tool_use)."""
assistant_texts = [] assistant_texts = []
for msg_obj in reversed(messages): for msg_obj in reversed(messages):
@@ -355,7 +356,7 @@ def extract_assistant_text(messages: list[dict], limit: int = 3) -> list[str]:
# ── Subagent extraction ────────────────────────────────────────────── # ── Subagent extraction ──────────────────────────────────────────────
def extract_subagent_context(session_file: Path) -> list[dict]: def extract_subagent_context(session_file: Path) -> List[Dict]:
"""Extract subagent summaries from session subdirectories. """Extract subagent summaries from session subdirectories.
Returns list of {name, type, status, last_text, is_interrupted}. Returns list of {name, type, status, last_text, is_interrupted}.
@@ -457,7 +458,8 @@ def get_git_state(project_path: str) -> str:
try: try:
branch = subprocess.run( branch = subprocess.run(
["git", "branch", "--show-current"], ["git", "branch", "--show-current"],
capture_output=True, text=True, cwd=project_path, timeout=5, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True, cwd=project_path, timeout=5,
) )
if branch.stdout.strip(): if branch.stdout.strip():
parts.append(f"**Current branch**: `{branch.stdout.strip()}`") parts.append(f"**Current branch**: `{branch.stdout.strip()}`")
@@ -467,7 +469,8 @@ def get_git_state(project_path: str) -> str:
try: try:
status = subprocess.run( status = subprocess.run(
["git", "status", "--short"], ["git", "status", "--short"],
capture_output=True, text=True, cwd=project_path, timeout=10, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True, cwd=project_path, timeout=10,
) )
if status.stdout.strip(): if status.stdout.strip():
parts.append(f"### git status\n```\n{status.stdout.strip()}\n```") parts.append(f"### git status\n```\n{status.stdout.strip()}\n```")
@@ -479,7 +482,8 @@ def get_git_state(project_path: str) -> str:
try: try:
log = subprocess.run( log = subprocess.run(
["git", "log", "--oneline", "-5"], ["git", "log", "--oneline", "-5"],
capture_output=True, text=True, cwd=project_path, timeout=10, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True, cwd=project_path, timeout=10,
) )
if log.stdout.strip(): if log.stdout.strip():
parts.append(f"### git log (last 5)\n```\n{log.stdout.strip()}\n```") parts.append(f"### git log (last 5)\n```\n{log.stdout.strip()}\n```")
@@ -489,7 +493,7 @@ def get_git_state(project_path: str) -> str:
return "\n\n".join(parts) return "\n\n".join(parts)
def get_memory_md(project_dir: Path) -> str | None: def get_memory_md(project_dir: Path) -> Optional[str]:
"""Read MEMORY.md if it exists in the project's memory directory.""" """Read MEMORY.md if it exists in the project's memory directory."""
memory_dir = project_dir / "memory" memory_dir = project_dir / "memory"
memory_file = memory_dir / "MEMORY.md" memory_file = memory_dir / "MEMORY.md"
@@ -500,7 +504,7 @@ def get_memory_md(project_dir: Path) -> str | None:
return None return None
def get_session_memory(session_file: Path) -> str | None: def get_session_memory(session_file: Path) -> Optional[str]:
"""Read session-memory/summary.md if it exists (newer CC versions).""" """Read session-memory/summary.md if it exists (newer CC versions)."""
session_dir = session_file.parent / session_file.stem session_dir = session_file.parent / session_file.stem
summary = session_dir / "session-memory" / "summary.md" summary = session_dir / "session-memory" / "summary.md"
@@ -524,8 +528,8 @@ END_REASON_LABELS = {
def build_briefing( def build_briefing(
session_entry: dict | None, session_entry: Optional[Dict],
parsed: dict, parsed: Dict,
project_path: str, project_path: str,
project_dir: Path, project_dir: Path,
session_file: Path, session_file: Path,
@@ -658,7 +662,7 @@ def build_briefing(
# ── CLI ────────────────────────────────────────────────────────────── # ── CLI ──────────────────────────────────────────────────────────────
def _check_session_files(entries: list[dict], project_dir: Path) -> dict[str, bool]: def _check_session_files(entries: List[Dict], project_dir: Path) -> Dict[str, bool]:
"""Check which index entries have actual files on disk.""" """Check which index entries have actual files on disk."""
status = {} status = {}
for entry in entries: for entry in entries:

BIN
scrapling-skill.skill Normal file

Binary file not shown.

View File

@@ -0,0 +1,4 @@
Security scan passed
Scanned at: 2026-03-18T22:52:43.734452
Tool: gitleaks + pattern-based validation
Content hash: 06351e5794510c584fdf29351eb5161f4b12e213f512c3148212c82c357d124a

183
scrapling-skill/SKILL.md Normal file
View File

@@ -0,0 +1,183 @@
---
name: scrapling-skill
description: Install, troubleshoot, and use Scrapling CLI to extract HTML, Markdown, or text from webpages. Use this skill whenever the user mentions Scrapling, `uv tool install scrapling`, `scrapling extract`, WeChat/mp.weixin articles, browser-backed page fetching, or needs help deciding between static and dynamic extraction.
---
# Scrapling Skill
## Overview
Use Scrapling through its CLI as the default path. Start with the smallest working command, validate the saved output, and only escalate to browser-backed fetching when the static fetch does not contain the real page content.
Do not assume the user's Scrapling install is healthy. Verify it first.
## Default Workflow
Copy this checklist and keep it updated while working:
```text
Scrapling Progress:
- [ ] Step 1: Diagnose the local Scrapling install
- [ ] Step 2: Fix CLI extras or browser runtime if needed
- [ ] Step 3: Choose static or dynamic fetch
- [ ] Step 4: Save output to a file
- [ ] Step 5: Validate file size and extracted content
- [ ] Step 6: Escalate only if the previous path failed
```
## Step 1: Diagnose the Install
Run the bundled diagnostic script first:
```bash
python3 scripts/diagnose_scrapling.py
```
Use the result as the source of truth for the next step.
## Step 2: Fix the Install
### If the CLI was installed without extras
If `scrapling --help` fails with missing `click` or a message about installing Scrapling with extras, reinstall it with the CLI extra:
```bash
uv tool uninstall scrapling
uv tool install 'scrapling[shell]'
```
Do not default to `scrapling[all]` unless the user explicitly needs the broader feature set.
### If browser-backed fetchers are needed
Install the Playwright runtime:
```bash
scrapling install
```
If the install looks slow or opaque, read `references/troubleshooting.md` before guessing. Do not claim success until either:
- `scrapling install` reports that dependencies are already installed, or
- the diagnostic script confirms both Chromium and Chrome Headless Shell are present.
## Step 3: Choose the Fetcher
Use this decision rule:
- Start with `extract get` for normal pages, article pages, and most WeChat public articles.
- Use `extract fetch` when the static HTML does not contain the real content or the page depends on JavaScript rendering.
- Use `extract stealthy-fetch` only when `fetch` has already failed because of anti-bot or challenge behavior. Do not make it the default.
## Step 4: Run the Smallest Useful Command
Always quote URLs in shell commands. This is mandatory in `zsh` when the URL contains `?`, `&`, or other special characters.
### Full page to HTML
```bash
scrapling extract get 'https://example.com' page.html
```
### Main content to Markdown
```bash
scrapling extract get 'https://example.com' article.md -s 'main'
```
### JS-rendered page with browser automation
```bash
scrapling extract fetch 'https://example.com' page.html --timeout 20000
```
### WeChat public article body
Use `#js_content` first. This is the default selector for article body extraction on `mp.weixin.qq.com` pages.
```bash
scrapling extract get 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' article.md -s '#js_content'
```
## Step 5: Validate the Output
After every extraction, verify the file instead of assuming success:
```bash
wc -c article.md
sed -n '1,40p' article.md
```
For HTML output, check that the expected title, container, or selector target is actually present:
```bash
rg -n '<title>|js_content|rich_media_title|main' page.html
```
If the file is tiny, empty, or missing the expected container, the extraction did not succeed. Go back to Step 3 and switch fetchers or selectors.
## Step 6: Handle Known Failure Modes
### Local TLS trust store problem
If `extract get` fails with `curl: (60) SSL certificate problem`, treat it as a local trust-store problem first, not a Scrapling content failure.
Retry the same command with:
```bash
--no-verify
```
Only do this after confirming the failure matches the local certificate verification error pattern. Do not silently disable verification by default.
### WeChat article pages
For `mp.weixin.qq.com`:
- Try `extract get` before `extract fetch`
- Use `-s '#js_content'` for the article body
- Validate the saved Markdown or HTML immediately
### Browser-backed fetch failures
If `extract fetch` fails:
1. Re-check the install with `python3 scripts/diagnose_scrapling.py`
2. Confirm Chromium and Chrome Headless Shell are present
3. Retry with a slightly longer timeout
4. Escalate to `stealthy-fetch` only if the site behavior justifies it
## Command Patterns
### Diagnose and smoke test a URL
```bash
python3 scripts/diagnose_scrapling.py --url 'https://example.com'
```
### Diagnose and smoke test a WeChat article body
```bash
python3 scripts/diagnose_scrapling.py \
--url 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' \
--selector '#js_content' \
--no-verify
```
### Diagnose and smoke test a browser-backed fetch
```bash
python3 scripts/diagnose_scrapling.py \
--url 'https://example.com' \
--dynamic
```
## Guardrails
- Do not tell the user to reinstall blindly. Verify first.
- Do not default to the Python library API when the user is clearly asking about the CLI.
- Do not jump to browser-backed fetching unless the static result is missing the real content.
- Do not claim success from exit code alone. Inspect the saved file.
- Do not hardcode user-specific absolute paths into outputs or docs.
## Resources
- Installation and smoke test helper: `scripts/diagnose_scrapling.py`
- Verified failure modes and recovery paths: `references/troubleshooting.md`

View File

@@ -0,0 +1,164 @@
# Scrapling Troubleshooting
## Contents
- Installation modes
- Verified failure modes
- Static vs dynamic fetch choice
- WeChat extraction pattern
- Smoke test commands
## Installation Modes
Use the CLI path as the default:
```bash
uv tool install 'scrapling[shell]'
```
Do not assume `uv tool install scrapling` is enough for CLI usage. The base package may install the executable wrapper without the optional CLI dependencies.
## Verified Failure Modes
### 1. CLI installed without extras
Symptom:
- `scrapling --help` fails
- Output mentions missing `click`
- Output says Scrapling must be installed with extras
Recovery:
```bash
uv tool uninstall scrapling
uv tool install 'scrapling[shell]'
```
### 2. Browser-backed fetchers not ready
Symptom:
- `extract fetch` or `extract stealthy-fetch` fails because the Playwright runtime is not installed
- Scrapling has not downloaded Chromium or Chrome Headless Shell
Recovery:
```bash
scrapling install
```
Success signals:
- `scrapling install` later reports `The dependencies are already installed`
- Browser caches contain both:
- `chromium-*`
- `chromium_headless_shell-*`
Typical cache roots:
- `~/Library/Caches/ms-playwright/`
- `~/.cache/ms-playwright/`
### 3. Static fetch TLS trust-store failure
Symptom:
- `extract get` fails with `curl: (60) SSL certificate problem`
Interpretation:
- Treat this as a local certificate verification problem first
- Do not assume the target URL or Scrapling itself is broken
Recovery:
Retry the same static command with:
```bash
--no-verify
```
Do not make `--no-verify` the default. Use it only after the failure matches this certificate-verification pattern.
## Static vs Dynamic Fetch Choice
Use this order:
1. `extract get`
2. `extract fetch`
3. `extract stealthy-fetch`
Use `extract get` when:
- The page is mostly server-rendered
- The content is likely already present in raw HTML
- The target is an article page with a stable content container
Use `extract fetch` when:
- Static HTML does not contain the real content
- The site depends on JavaScript rendering
- The page content appears only after runtime hydration
Use `extract stealthy-fetch` when:
- `fetch` still fails
- The target site shows challenge or anti-bot behavior
## WeChat Extraction Pattern
For `mp.weixin.qq.com` public article pages:
- Start with `extract get`
- Use the selector `#js_content`
- Validate the saved file immediately
Example:
```bash
scrapling extract get 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' article.md -s '#js_content'
```
Observed behavior:
- The static fetch can already contain the real article body
- Browser-backed fetch is often unnecessary for article extraction
## Smoke Test Commands
### Basic diagnosis
```bash
python3 scripts/diagnose_scrapling.py
```
### Static extraction smoke test
```bash
python3 scripts/diagnose_scrapling.py --url 'https://example.com'
```
### WeChat article smoke test
```bash
python3 scripts/diagnose_scrapling.py \
--url 'https://mp.weixin.qq.com/s/ARTICLE_ID?scene=1' \
--selector '#js_content'
```
### Dynamic extraction smoke test
```bash
python3 scripts/diagnose_scrapling.py \
--url 'https://example.com' \
--dynamic
```
### Validate saved output
```bash
wc -c article.md
sed -n '1,40p' article.md
rg -n '<title>|js_content|main|rich_media_title' page.html
```

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Diagnose a local Scrapling CLI installation and optionally run a smoke test.
"""
import argparse
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Iterable, List, Tuple
def run_command(cmd: List[str]) -> Tuple[int, str, str]:
    """Execute *cmd* and return ``(exit_code, stdout, stderr)`` as text.

    Uses ``universal_newlines`` (the py3.6-compatible spelling of
    ``text=True``) and never raises on a nonzero exit status.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        check=False,
    )
    return (completed.returncode, completed.stdout, completed.stderr)
def print_section(title: str) -> None:
    """Print *title* as a section header: blank line, title, dashed underline."""
    underline = "-" * len(title)
    print("\n{0}\n{1}".format(title, underline))
def existing_dirs(paths: Iterable[Path]) -> List[str]:
    """Return the string form of every path in *paths* that exists on disk.

    Order is preserved; paths that do not exist are silently dropped.
    """
    found = []
    for candidate in paths:
        if candidate.exists():
            found.append(str(candidate))
    return found
def detect_browser_cache() -> Tuple[List[str], List[str]]:
    """Scan the known Playwright cache roots for downloaded browsers.

    Returns ``(chromium_dirs, headless_shell_dirs)`` as lists of path
    strings. Both lists are empty when no cache root exists.
    """
    cache_roots = (
        Path.home() / "Library" / "Caches" / "ms-playwright",  # macOS
        Path.home() / ".cache" / "ms-playwright",  # Linux
    )
    chromium_dirs = []  # type: List[str]
    shell_dirs = []  # type: List[str]
    for root in cache_roots:
        if root.exists():
            chromium_dirs.extend(existing_dirs(sorted(root.glob("chromium-*"))))
            shell_dirs.extend(existing_dirs(sorted(root.glob("chromium_headless_shell-*"))))
    return chromium_dirs, shell_dirs
def diagnose_cli() -> bool:
    """Check that the ``scrapling`` executable exists and responds to --help.

    Prints a short status report and returns True only when the CLI runs
    successfully. On failure, tries to identify the missing-extras case and
    suggests the matching fix.
    """
    print_section("CLI")
    cli_path = shutil.which("scrapling")
    if cli_path is None:
        print("status: missing")
        print("fix: install with `uv tool install 'scrapling[shell]'`")
        return False
    print("path: {0}".format(cli_path))
    exit_code, out, err = run_command(["scrapling", "--help"])
    combined = (out + "\n" + err).strip()
    if exit_code == 0:
        print("status: working")
        return True
    print("status: broken")
    lowered = combined.lower()
    # The wrapper installs without `click` when the [shell] extra is absent.
    missing_extras = (
        "install scrapling with any of the extras" in lowered
        or "no module named 'click'" in lowered
    )
    if missing_extras:
        print("cause: installed without CLI extras")
        print("fix: `uv tool uninstall scrapling` then `uv tool install 'scrapling[shell]'`")
    else:
        print("cause: unknown")
    if combined:
        print("details:")
        print(combined[:1200])
    return False
def diagnose_browsers() -> None:
    """Report whether Chromium and Chrome Headless Shell are in the cache.

    Browser-backed fetches need both; prints a `scrapling install` hint when
    either is missing.
    """
    print_section("Browser Runtime")
    chromium_dirs, shell_dirs = detect_browser_cache()
    print("chromium: {0}".format("present" if chromium_dirs else "missing"))
    for entry in chromium_dirs:
        print(" - {0}".format(entry))
    print("chrome-headless-shell: {0}".format("present" if shell_dirs else "missing"))
    for entry in shell_dirs:
        print(" - {0}".format(entry))
    if not (chromium_dirs and shell_dirs):
        print("hint: run `scrapling install` before browser-backed fetches")
def preview_file(path: Path, preview_lines: int) -> None:
    """Summarize a smoke-test output file: existence, size, short preview.

    Only markdown/text files are previewed line by line; other output (e.g.
    HTML) is reported by size alone.
    """
    print_section("Smoke Test Output")
    if not path.exists():
        print("status: missing output file")
        return
    byte_count = path.stat().st_size
    print("path: {0}".format(path))
    print("bytes: {0}".format(byte_count))
    if byte_count == 0:
        print("status: empty")
        return
    if path.suffix not in (".md", ".txt"):
        return
    print("preview:")
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        for line_number, line in enumerate(handle):
            if line_number >= preview_lines:
                break
            print(line.rstrip())
def run_smoke_test(args: argparse.Namespace) -> int:
    """Run one `scrapling extract` invocation against ``args.url``.

    Markdown output is used when a selector is given, HTML otherwise. The
    command, its streams, and a preview of the saved file are printed.
    Returns the scrapling exit code.
    """
    print_section("Smoke Test")
    suffix = ".md" if args.selector else ".html"
    output_path = Path(tempfile.gettempdir()) / ("scrapling-smoke" + suffix)
    if output_path.exists():
        # Start from a clean slate so the size check reflects this run only.
        output_path.unlink()
    subcommand = "fetch" if args.dynamic else "get"
    cmd = ["scrapling", "extract", subcommand, args.url, str(output_path)]
    if args.selector:
        cmd.extend(["-s", args.selector])
    if args.dynamic:
        cmd.extend(["--timeout", str(args.timeout)])
    elif args.no_verify:
        # --no-verify only applies to the static (non-browser) path.
        cmd.append("--no-verify")
    print("command: {0}".format(" ".join(cmd)))
    exit_code, out, err = run_command(cmd)
    for stream in (out, err):
        if stream.strip():
            print(stream.strip())
    preview_file(output_path, args.preview_lines)
    return exit_code
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the diagnostic tool."""
    parser = argparse.ArgumentParser(
        description="Diagnose Scrapling and run an optional smoke test.",
    )
    parser.add_argument("--url", help="Optional URL for a smoke test")
    parser.add_argument("--selector", help="Optional CSS selector for the smoke test")
    parser.add_argument(
        "--dynamic",
        action="store_true",
        help="Use `scrapling extract fetch` instead of `scrapling extract get`",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        help="Pass `--no-verify` to static smoke tests",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20000,  # milliseconds, matches the SKILL.md examples
        help="Timeout in milliseconds for dynamic smoke tests",
    )
    parser.add_argument(
        "--preview-lines",
        type=int,
        default=20,
        help="Number of preview lines for markdown/text output",
    )
    return parser
def main() -> int:
    """CLI entry point: diagnose the install, then optionally smoke test a URL.

    Returns 1 when the CLI is unusable, otherwise the smoke-test exit code
    (or 0 when no URL was given).
    """
    args = build_parser().parse_args()
    cli_ready = diagnose_cli()
    # Always report browser state, even when the CLI itself is broken.
    diagnose_browsers()
    if not cli_ready:
        return 1
    if args.url:
        return run_smoke_test(args)
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -15,6 +15,13 @@ import re
import sys import sys
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from typing import Optional, Tuple
SCRIPT_DIR = Path(__file__).resolve().parent
PACKAGE_ROOT = SCRIPT_DIR.parent
if str(PACKAGE_ROOT) not in sys.path:
sys.path.insert(0, str(PACKAGE_ROOT))
from scripts.quick_validate import validate_skill from scripts.quick_validate import validate_skill
from scripts.security_scan import calculate_skill_hash from scripts.security_scan import calculate_skill_hash
@@ -41,7 +48,7 @@ def should_exclude(rel_path: Path) -> bool:
return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS)
def validate_security_marker(skill_path: Path) -> tuple[bool, str]: def validate_security_marker(skill_path: Path) -> Tuple[bool, str]:
""" """
Validate security marker file exists and hash matches current content Validate security marker file exists and hash matches current content