Files
skill-seekers-reference/src/skill_seekers/cli/notion_scraper.py
yusyus 53b911b697 feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,201 lines
2026-03-15 15:30:15 +03:00

1024 lines
44 KiB
Python

#!/usr/bin/env python3
"""
Notion Workspace to Skill Converter
Converts Notion databases and pages into AI-ready skills. Two modes:
1. **API mode** — Uses the Notion API via ``notion-client`` to fetch databases,
pages, and blocks in real time. Requires an integration token.
2. **Export mode** — Parses a Notion Markdown/CSV export directory downloaded
from Settings > Export. No token required.
Usage:
skill-seekers notion --database-id ID --token $NOTION_TOKEN --name myskill
skill-seekers notion --page-id ID --token $NOTION_TOKEN --name myskill
skill-seekers notion --export-path ./notion-export/ --name myskill
skill-seekers notion --from-json output/myskill_notion_data.json --name myskill
"""
import argparse
import csv
import json
import logging
import os
import re
import sys
import time
from pathlib import Path
from typing import Any
# Optional dependency guard — notion-client is not a core dependency.
# Only API mode needs it; _check_notion_deps() reports a helpful error when it
# is missing, so the import failure itself is deliberately not fatal.
try:
    from notion_client import Client as NotionClient
    from notion_client import APIResponseError

    NOTION_AVAILABLE = True
except ImportError:
    # NotionClient / APIResponseError stay undefined in this case.
    NOTION_AVAILABLE = False

logger = logging.getLogger(__name__)

# Constants
DEFAULT_MAX_PAGES = 500  # default page cap for the converter and --max-pages
RATE_LIMIT_DELAY = 0.35  # seconds between API requests
MAX_BLOCK_DEPTH = 5  # recursion cap for nested blocks and for the page tree walk
def _check_notion_deps() -> None:
    """Verify that the optional ``notion-client`` package was importable.

    Raises:
        RuntimeError: If the module-level import guard recorded that the
            package is missing; the message contains install instructions.
    """
    if NOTION_AVAILABLE:
        return
    install_hint = (
        "notion-client is required for Notion API mode.\n"
        'Install with: pip install "skill-seekers[notion]"\n'
        "Or: pip install notion-client"
    )
    raise RuntimeError(install_hint)
def infer_description_from_notion(metadata: dict | None = None, name: str = "") -> str:
"""Infer a skill description from Notion workspace metadata."""
if metadata:
desc_text = metadata.get("description", "")
if desc_text and len(desc_text) > 20:
desc = desc_text.strip()[:150]
return f"Use when {desc.lower()}"
title_text = metadata.get("title", "")
if title_text and len(title_text) > 10:
return f"Use when working with {title_text.lower()}"
return (
f"Use when referencing {name} documentation"
if name
else "Use when referencing this Notion workspace"
)
class NotionToSkillConverter:
    """Convert Notion workspace content (database or page tree) to a skill.

    Pages are gathered either through the Notion API (``database_id`` or
    ``page_id``) or from a local export directory (``export_path``), saved as
    JSON, and then rendered into a skill directory.

    Args:
        config: Dict with keys name, database_id, page_id, export_path,
            token, description, max_pages. Only ``name`` is mandatory.
    """
def __init__(self, config: dict) -> None:
    """Store the configuration and derive output locations.

    Args:
        config: Converter settings; ``config["name"]`` must be present.
            A missing ``token`` falls back to the ``NOTION_TOKEN``
            environment variable.

    Raises:
        KeyError: If ``config`` has no ``name`` entry.
    """
    self.config = config
    skill_name: str = config["name"]
    fallback_description = f"Use when referencing {skill_name} documentation"

    # Identity and sources
    self.name: str = skill_name
    self.database_id: str | None = config.get("database_id")
    self.page_id: str | None = config.get("page_id")
    self.export_path: str | None = config.get("export_path")
    self.token: str | None = config.get("token") or os.getenv("NOTION_TOKEN")
    self.description: str = config.get("description") or fallback_description
    self.max_pages: int = config.get("max_pages", DEFAULT_MAX_PAGES)

    # Output locations (relative to the current working directory)
    self.skill_dir: str = f"output/{skill_name}"
    self.data_file: str = f"output/{skill_name}_notion_data.json"

    # Runtime state
    self._client: Any = None
    self.extracted_data: dict[str, Any] | None = None
    self._pages_fetched: int = 0
    self._blocks_fetched: int = 0
# -- Notion client ---------------------------------------------------
def _get_client(self) -> Any:
    """Return the Notion API client, building and caching it on first use.

    Raises:
        RuntimeError: If ``notion-client`` is not installed.
        ValueError: If no integration token is configured.
    """
    _check_notion_deps()
    if self._client is not None:
        return self._client
    if not self.token:
        raise ValueError("Notion integration token required. Set NOTION_TOKEN or --token.")
    self._client = NotionClient(auth=self.token)
    logger.info("Notion API client initialised")
    return self._client
# -- Public extraction -----------------------------------------------
def extract_notion(self) -> bool:
    """Extract pages (export mode wins over API mode) and save them as JSON.

    The result is written to ``self.data_file`` and kept in
    ``self.extracted_data``. When the config carries no description, one is
    inferred from the collected metadata.

    Returns:
        Always ``True``; failures surface as exceptions.

    Raises:
        ValueError: If no database id, page id, or export path is configured.
    """
    print(f"\n--- Extracting Notion content for: {self.name}")

    if self.export_path:
        source_mode = "export"
        pages = self._extract_from_export()
    elif self.database_id or self.page_id:
        source_mode = "api"
        pages = self._extract_via_api()
    else:
        raise ValueError("Must specify --database-id, --page-id, or --export-path.")

    metadata: dict[str, Any] = {
        "title": self.name,
        "source_mode": source_mode,
        "database_id": self.database_id,
        "page_id": self.page_id,
        "export_path": self.export_path,
    }
    if not self.config.get("description"):
        self.description = infer_description_from_notion(metadata, self.name)

    result_data: dict[str, Any] = {
        "metadata": metadata,
        "total_pages": len(pages),
        "pages_fetched": self._pages_fetched,
        "blocks_fetched": self._blocks_fetched,
        "pages": pages,
    }

    target_dir = os.path.dirname(self.data_file) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(self.data_file, "w", encoding="utf-8") as handle:
        # default=str keeps the dump from failing on non-JSON-native values.
        json.dump(result_data, handle, indent=2, ensure_ascii=False, default=str)

    self.extracted_data = result_data
    print(f" Saved extracted data to: {self.data_file}")
    print(f" Extracted {len(pages)} pages, {self._blocks_fetched} blocks")
    return True
# -- Load extracted data ---------------------------------------------
def load_extracted_data(self, json_path: str | None = None) -> bool:
"""Load previously extracted Notion data from JSON."""
path = json_path or self.data_file
print(f"\n Loading extracted data from: {path}")
if not os.path.exists(path):
raise FileNotFoundError(f"Data file not found: {path}")
with open(path, encoding="utf-8") as f:
self.extracted_data = json.load(f)
total = self.extracted_data.get("total_pages", len(self.extracted_data.get("pages", [])))
print(f" Loaded {total} pages")
return True
# -- Categorisation --------------------------------------------------
def categorize_content(self) -> dict[str, dict[str, Any]]:
    """Group the extracted pages into categories.

    The category key comes from ``_resolve_category_key``. If every page
    lands in ``"other"``, a single ``"content"`` category is returned instead.

    Returns:
        Mapping of category key to ``{"title": str, "pages": list}``.

    Raises:
        RuntimeError: If nothing has been extracted or loaded yet.
    """
    if not self.extracted_data:
        raise RuntimeError("No extracted data available.")
    print("\n Categorizing content...")

    pages = self.extracted_data.get("pages", [])
    buckets: dict[str, dict[str, Any]] = {}
    for page in pages:
        key = self._resolve_category_key(
            page.get("properties", {}), page.get("parent_path", "")
        )
        if key not in buckets:
            buckets[key] = {"title": key.replace("_", " ").title(), "pages": []}
        buckets[key]["pages"].append(page)

    # A lone "other" bucket carries no information; present it as "Content".
    if list(buckets) == ["other"]:
        buckets = {"content": {"title": "Content", "pages": pages}}

    print(f" Created {len(buckets)} categories")
    for bucket in buckets.values():
        print(f" - {bucket['title']}: {len(bucket['pages'])} pages")
    return buckets
def _resolve_category_key(self, properties: dict[str, Any], parent_path: str) -> str:
"""Determine category from properties (tags/category/type/status) or parent path."""
for name in ("category", "Category", "tags", "Tags", "type", "Type", "status", "Status"):
val = properties.get(name)
if val:
val = val[0] if isinstance(val, list) and val else val
if isinstance(val, str) and val.strip():
return self._sanitize_key(val)
if parent_path:
first = parent_path.strip("/").split("/")[0]
if first:
return self._sanitize_key(first)
return "other"
@staticmethod
def _sanitize_key(text: str) -> str:
"""Convert text to safe lowercase underscore key."""
safe = re.sub(r"[^\w\s-]", "", text.lower())
return re.sub(r"[-\s]+", "_", safe).strip("_") or "other"
# -- Skill building --------------------------------------------------
def build_skill(self) -> None:
    """Write the skill directory: reference files, index, and SKILL.md.

    Raises:
        RuntimeError: If nothing has been extracted or loaded yet.
    """
    if not self.extracted_data:
        raise RuntimeError("No extracted data available.")
    print(f"\n Building skill: {self.name}")

    for folder in ("references", "scripts", "assets"):
        os.makedirs(f"{self.skill_dir}/{folder}", exist_ok=True)

    categorized = self.categorize_content()

    print("\n Generating reference files...")
    section_total = len(categorized)
    section_no = 0
    for key, data in categorized.items():
        section_no += 1
        self._generate_reference_file(key, data, section_no, section_total)

    self._generate_index(categorized)
    self._generate_skill_md(categorized)

    print(f"\n Skill built successfully: {self.skill_dir}/")
    print(f"\n Next step: Package with: skill-seekers package {self.skill_dir}/")
def _generate_reference_file(
self, cat_key: str, cat_data: dict[str, Any], section_num: int, total_sections: int
) -> None:
"""Generate a reference markdown file for one category."""
pages = cat_data["pages"]
filename = f"{self.skill_dir}/references/{cat_key}.md"
with open(filename, "w", encoding="utf-8") as f:
f.write(f"# {cat_data['title']}\n\n")
for page in pages:
title = page.get("title", "Untitled")
f.write(f"---\n\n## {title}\n\n")
if page.get("url"):
f.write(f"*Source: [{page['url']}]({page['url']})*\n\n")
props = page.get("properties", {})
if props:
f.write("**Properties:**\n\n")
for pn, pv in props.items():
pv = ", ".join(str(v) for v in pv) if isinstance(pv, list) else pv
f.write(f"- **{pn}:** {pv}\n")
f.write("\n")
if page.get("content"):
f.write(f"{page['content']}\n\n")
for blk in page.get("code_blocks", []):
if blk.get("caption"):
f.write(f"*{blk['caption']}*\n\n")
f.write(f"```{blk.get('language', '')}\n{blk.get('code', '')}\n```\n\n")
print(f" Generated: {filename} ({len(pages)} pages)")
def _generate_index(self, categorized: dict[str, dict[str, Any]]) -> None:
"""Generate references/index.md."""
filename = f"{self.skill_dir}/references/index.md"
with open(filename, "w", encoding="utf-8") as f:
f.write(f"# {self.name.title()} Reference Index\n\n## Categories\n\n")
for cat_key, cat_data in categorized.items():
f.write(f"- [{cat_data['title']}]({cat_key}.md) ({len(cat_data['pages'])} pages)\n")
f.write("\n## Statistics\n\n")
f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
f.write(f"- Blocks fetched: {self.extracted_data.get('blocks_fetched', 0)}\n")
f.write(
f"- Source mode: {self.extracted_data.get('metadata', {}).get('source_mode', 'unknown')}\n"
)
print(f" Generated: {filename}")
def _generate_skill_md(self, categorized: dict[str, dict[str, Any]]) -> None:
"""Generate main SKILL.md with YAML frontmatter."""
filename = f"{self.skill_dir}/SKILL.md"
skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
desc = self.description[:1024]
meta = self.extracted_data.get("metadata", {})
with open(filename, "w", encoding="utf-8") as f:
f.write(f"---\nname: {skill_name}\ndescription: {desc}\n---\n\n")
f.write(f"# {self.name.title()} Documentation Skill\n\n{self.description}\n\n")
# Source info
f.write(
f"## Source Information\n\n**Source mode:** {meta.get('source_mode', 'unknown')}\n"
)
for key in ("database_id", "page_id", "export_path"):
if meta.get(key):
f.write(f"**{key.replace('_', ' ').title()}:** `{meta[key]}`\n")
f.write("\n## When to Use This Skill\n\nUse this skill when you need to:\n")
f.write(f"- Understand {self.name} concepts and processes\n")
f.write("- Look up structured database entries and their properties\n")
f.write("- Find code examples and implementation notes\n")
f.write("- Review documentation and knowledge base articles\n")
f.write("- Explore the workspace hierarchy and relationships\n\n")
# Content overview
f.write(
f"## Content Overview\n\n**Total Pages:** {self.extracted_data.get('total_pages', 0)}\n\n"
)
for cd in categorized.values():
f.write(f"- **{cd['title']}**: {len(cd['pages'])} pages\n")
f.write("\n")
# Key topics
topics = self._collect_key_topics()
if topics:
f.write("## Key Topics\n\n")
for t in topics[:20]:
f.write(f"- {t}\n")
f.write("\n")
# Code highlights
all_code = self._collect_code_blocks()
if all_code:
f.write("## Code Examples\n\n")
by_lang: dict[str, list[dict[str, str]]] = {}
for blk in all_code[:30]:
by_lang.setdefault(blk.get("language", "plain text"), []).append(blk)
for lang in sorted(by_lang):
exs = by_lang[lang]
f.write(f"### {lang.title()} ({len(exs)} examples)\n\n")
for blk in exs[:3]:
code = blk.get("code", "")[:500]
f.write(f"```{lang}\n{code}\n```\n\n")
# Property summary
psummary = self._collect_property_summary()
if psummary:
f.write("## Database Properties\n\n")
for pn, vals in psummary.items():
sample = ", ".join(sorted(vals)[:5])
f.write(f"- **{pn}** ({len(vals)} unique): {sample}\n")
f.write("\n")
# Navigation
f.write("## Navigation\n\n")
for ck, cd in categorized.items():
f.write(f"- `references/{ck}.md` - {cd['title']}\n")
f.write("\nSee `references/index.md` for complete reference structure.\n\n")
f.write("---\n\n**Generated by Skill Seeker** | Notion Scraper\n")
with open(filename, encoding="utf-8") as f:
line_count = len(f.read().split("\n"))
print(f" Generated: {filename} ({line_count} lines)")
# -- SKILL.md helpers ------------------------------------------------
def _collect_key_topics(self) -> list[str]:
"""Extract unique heading texts from all pages."""
topics, seen = [], set()
for page in self.extracted_data.get("pages", []):
for text in [page.get("title", "")] + [
h.get("text", "") for h in page.get("headings", [])
]:
text = text.strip()
if text and text.lower() not in seen and len(text) > 3:
seen.add(text.lower())
topics.append(text)
return topics
def _collect_code_blocks(self) -> list[dict[str, str]]:
"""Collect all code blocks from extracted pages."""
return [
blk for p in self.extracted_data.get("pages", []) for blk in p.get("code_blocks", [])
]
def _collect_property_summary(self) -> dict[str, set[str]]:
"""Collect unique property values across all pages."""
summary: dict[str, set[str]] = {}
for page in self.extracted_data.get("pages", []):
for pn, pv in page.get("properties", {}).items():
summary.setdefault(pn, set())
if isinstance(pv, list):
summary[pn].update(str(v) for v in pv)
elif pv is not None:
summary[pn].add(str(pv))
return {k: v for k, v in summary.items() if v}
# ====================================================================
# API MODE
# ====================================================================
def _extract_via_api(self) -> list[dict[str, Any]]:
"""Fetch pages from Notion via API (database query or page tree walk)."""
client = self._get_client()
if self.database_id:
print(f" Fetching database: {self.database_id}")
return self._extract_database_entries(client)
print(f" Fetching page tree: {self.page_id}")
return self._extract_page_tree(client, self.page_id, parent_path="")
def _extract_database_entries(self, client: Any) -> list[dict[str, Any]]:
    """Query a Notion database page by page and convert its entries.

    Stops when the database is exhausted, ``max_pages`` is reached, or an
    error other than a tolerated rate-limit response occurs. HTTP 429 is
    retried after a 10 s pause, but only a bounded number of times in a row,
    so a persistently throttled integration cannot loop forever.

    Args:
        client: Notion API client.

    Returns:
        Page dicts produced by ``_process_database_entry``.
    """
    pages: list[dict[str, Any]] = []
    has_more, cursor = True, None
    max_rate_limit_retries = 5
    rate_limit_hits = 0

    # Fetch DB metadata (best effort, used for logging only)
    try:
        db_meta = client.databases.retrieve(database_id=self.database_id)
        logger.info(
            "Database: %s",
            self._extract_rich_text(db_meta.get("title", [])) or self.database_id,
        )
    except Exception as e:
        logger.warning("Could not fetch database metadata: %s", e)

    # Paginate entries
    while has_more and self._pages_fetched < self.max_pages:
        try:
            params: dict[str, Any] = {"database_id": self.database_id}
            if cursor:
                params["start_cursor"] = cursor
            resp = client.databases.query(**params)
            rate_limit_hits = 0  # a successful call resets the retry budget
            has_more, cursor = resp.get("has_more", False), resp.get("next_cursor")
            for entry in resp.get("results", []):
                if self._pages_fetched >= self.max_pages:
                    break
                pd = self._process_database_entry(client, entry)
                if pd:
                    pages.append(pd)
                    self._pages_fetched += 1
            time.sleep(RATE_LIMIT_DELAY)
            logger.info(" Fetched %d entries...", self._pages_fetched)
        except APIResponseError as e:
            if e.status == 429 and rate_limit_hits < max_rate_limit_retries:
                rate_limit_hits += 1
                time.sleep(10)
                continue
            logger.error("Notion API error: %s", e)
            break
        except Exception as e:
            logger.error("Error querying database: %s", e)
            break
    return pages
def _process_database_entry(self, client: Any, entry: dict[str, Any]) -> dict[str, Any] | None:
"""Process one database entry into a page dict."""
try:
page_id, url = entry["id"], entry.get("url", "")
props = self._extract_properties(entry.get("properties", {}))
title = props.get("Name", "") or props.get("Title", "") or "Untitled"
if isinstance(title, list):
title = ", ".join(str(t) for t in title) or "Untitled"
content, headings, code_blocks = self._fetch_page_blocks(client, page_id)
return {
"id": page_id,
"title": title,
"url": url,
"properties": props,
"content": content,
"headings": headings,
"code_blocks": code_blocks,
"parent_path": "",
}
except Exception as e:
logger.warning("Failed to process entry %s: %s", entry.get("id", "?"), e)
return None
def _extract_properties(self, raw: dict[str, Any]) -> dict[str, Any]:
"""Flatten Notion's raw property format into simple {name: value} pairs."""
result: dict[str, Any] = {}
for name, data in raw.items():
try:
val = self._extract_property_value(data.get("type", ""), data)
if val is not None:
result[name] = val
except Exception as e:
logger.debug("Could not extract property '%s': %s", name, e)
return result
def _extract_property_value(self, ptype: str, data: dict[str, Any]) -> Any:
"""Extract a single property value by its Notion type."""
if ptype == "title":
return self._extract_rich_text(data.get("title", []))
if ptype == "rich_text":
return self._extract_rich_text(data.get("rich_text", []))
if ptype == "number":
return data.get("number")
if ptype == "select":
s = data.get("select")
return s.get("name", "") if s else None
if ptype == "multi_select":
return [o.get("name", "") for o in data.get("multi_select", [])]
if ptype == "date":
d = data.get("date")
return (
(f"{d['start']} - {d['end']}" if d and d.get("end") else d.get("start"))
if d
else None
)
if ptype == "checkbox":
return data.get("checkbox", False)
if ptype in ("url", "email", "phone_number", "created_time", "last_edited_time"):
return data.get(ptype)
if ptype == "status":
s = data.get("status")
return s.get("name", "") if s else None
if ptype == "relation":
rels = data.get("relation", [])
return [r.get("id", "") for r in rels] if rels else None
if ptype == "people":
return [p.get("name", "") for p in data.get("people", [])] or None
if ptype == "files":
return [fi.get("name", "") for fi in data.get("files", [])] or None
if ptype in ("formula", "rollup"):
inner = data.get(ptype, {})
return inner.get(inner.get("type", ""))
logger.debug("Unsupported property type: %s", ptype)
return None
# -- Page tree (recursive) -------------------------------------------
def _extract_page_tree(
    self, client: Any, page_id: str, parent_path: str, depth: int = 0
) -> list[dict[str, Any]]:
    """Extract a page and, recursively, its child pages.

    Rate-limit responses (HTTP 429) on the page lookup are retried after a
    10 s pause, a bounded number of times, instead of recursing without limit.

    Args:
        client: Notion API client.
        page_id: Page to extract.
        parent_path: Slash-joined titles of the ancestors ("" for the root).
        depth: Current depth; children are followed while it is below
            ``MAX_BLOCK_DEPTH``.

    Returns:
        Page dicts for this page and its descendants; empty if ``max_pages``
        was already reached or the page could not be retrieved.
    """
    if self._pages_fetched >= self.max_pages:
        return []
    pages: list[dict[str, Any]] = []
    max_rate_limit_retries = 5

    meta = None
    for attempt in range(max_rate_limit_retries + 1):
        try:
            meta = client.pages.retrieve(page_id=page_id)
            break
        except APIResponseError as e:
            if e.status == 429 and attempt < max_rate_limit_retries:
                time.sleep(10)
                continue
            logger.warning("API error on page %s: %s", page_id, e)
            return pages
        except Exception as e:
            logger.warning("Error extracting page %s: %s", page_id, e)
            return pages

    try:
        props = self._extract_properties(meta.get("properties", {}))
        title = (
            props.get("title", "")
            or props.get("Name", "")
            or props.get("Title", "")
            or "Untitled"
        )
        if isinstance(title, list):
            title = ", ".join(str(t) for t in title) or "Untitled"
        current_path = f"{parent_path}/{title}" if parent_path else title
        content, headings, code_blocks = self._fetch_page_blocks(client, page_id)
        self._pages_fetched += 1
        pages.append(
            {
                "id": page_id,
                "title": title,
                "url": meta.get("url", ""),
                "properties": props,
                "content": content,
                "headings": headings,
                "code_blocks": code_blocks,
                "parent_path": parent_path,
                "depth": depth,
            }
        )
        logger.info(" [%d] %s", self._pages_fetched, current_path)
        time.sleep(RATE_LIMIT_DELAY)
        if depth < MAX_BLOCK_DEPTH:
            # NOTE(review): _get_child_pages also returns child_database ids,
            # which are looked up here as pages — confirm that is intended.
            for child_id in self._get_child_pages(client, page_id):
                if self._pages_fetched >= self.max_pages:
                    break
                pages.extend(self._extract_page_tree(client, child_id, current_path, depth + 1))
    except APIResponseError as e:
        logger.warning("API error on page %s: %s", page_id, e)
    except Exception as e:
        logger.warning("Error extracting page %s: %s", page_id, e)
    return pages
def _get_child_pages(self, client: Any, page_id: str) -> list[str]:
    """List the ids of ``child_page`` / ``child_database`` blocks under a page.

    Follows pagination cursors and pauses between requests. Any error stops
    the listing early and returns the ids gathered so far.
    """
    wanted_types = ("child_page", "child_database")
    child_ids: list[str] = []
    next_cursor = None
    more_pages = True
    while more_pages:
        try:
            query: dict[str, Any] = {"block_id": page_id}
            if next_cursor:
                query["start_cursor"] = next_cursor
            response = client.blocks.children.list(**query)
            more_pages = response.get("has_more", False)
            next_cursor = response.get("next_cursor")
            for child in response.get("results", []):
                if child.get("type") in wanted_types:
                    child_ids.append(child["id"])
            time.sleep(RATE_LIMIT_DELAY)
        except Exception as exc:
            logger.debug("Error listing children of %s: %s", page_id, exc)
            break
    return child_ids
# -- Block parsing ---------------------------------------------------
def _fetch_page_blocks(
    self, client: Any, page_id: str, depth: int = 0
) -> tuple[str, list[dict[str, str]], list[dict[str, str]]]:
    """Fetch all child blocks of a page or block and convert them to markdown.

    HTTP 429 responses are retried after a 10 s pause, but only a bounded
    number of times in a row; any other error ends the fetch and returns what
    was collected so far.

    Args:
        client: Notion API client.
        page_id: Id of the page or parent block whose children are listed.
        depth: Nesting depth, passed on to ``_parse_notion_blocks``.

    Returns:
        ``(markdown, headings, code_blocks)`` where the markdown joins the
        non-empty block renderings with blank lines.
    """
    parts: list[str] = []
    headings: list[dict[str, str]] = []
    code_blocks: list[dict[str, str]] = []
    has_more, cursor = True, None
    max_rate_limit_retries = 5
    rate_limit_hits = 0
    while has_more:
        try:
            params: dict[str, Any] = {"block_id": page_id}
            if cursor:
                params["start_cursor"] = cursor
            resp = client.blocks.children.list(**params)
            rate_limit_hits = 0  # a successful call resets the retry budget
            has_more, cursor = resp.get("has_more", False), resp.get("next_cursor")
            for block in resp.get("results", []):
                self._blocks_fetched += 1
                md, bh, bc = self._parse_notion_blocks(client, block, depth)
                if md:
                    parts.append(md)
                headings.extend(bh)
                code_blocks.extend(bc)
            time.sleep(RATE_LIMIT_DELAY)
        except APIResponseError as e:
            if e.status == 429 and rate_limit_hits < max_rate_limit_retries:
                rate_limit_hits += 1
                time.sleep(10)
                continue
            logger.debug("API error fetching blocks for %s: %s", page_id, e)
            break
        except Exception as e:
            logger.debug("Error fetching blocks for %s: %s", page_id, e)
            break
    return "\n\n".join(p for p in parts if p.strip()), headings, code_blocks
def _parse_notion_blocks(
self, client: Any, block: dict[str, Any], depth: int = 0
) -> tuple[str, list[dict[str, str]], list[dict[str, str]]]:
"""Convert a Notion block to markdown, recursing into children."""
btype = block.get("type", "")
md, headings, code_blocks = self._handle_block_type(btype, block)
if block.get("has_children") and depth < MAX_BLOCK_DEPTH:
child_md, ch, cc = self._fetch_page_blocks(client, block["id"], depth + 1)
if child_md:
if btype in ("toggle", "callout"):
indented = "\n".join(f" {l}" for l in child_md.split("\n")) # noqa: E741
md = f"{md}\n{indented}" if md else indented
else:
md = f"{md}\n\n{child_md}" if md else child_md
headings.extend(ch)
code_blocks.extend(cc)
return md, headings, code_blocks
def _handle_block_type(
self, btype: str, block: dict[str, Any]
) -> tuple[str, list[dict[str, str]], list[dict[str, str]]]:
"""Handle a Notion block type: paragraph, heading, code, callout, toggle, table, etc."""
headings: list[dict[str, str]] = []
code_blocks: list[dict[str, str]] = []
data = block.get(btype, {})
md = ""
if btype == "paragraph":
md = self._extract_rich_text(data.get("rich_text", []))
elif btype in ("heading_1", "heading_2", "heading_3"):
level = int(btype[-1])
text = self._extract_rich_text(data.get("rich_text", []))
md = f"{'#' * level} {text}"
if text:
headings.append({"level": f"h{level}", "text": text})
elif btype == "code":
lang = data.get("language", "plain text") or "plain text"
code_text = self._extract_rich_text(data.get("rich_text", []))
caption = self._extract_rich_text(data.get("caption", []))
md = f"```{lang}\n{code_text}\n```"
if code_text.strip():
code_blocks.append({"language": lang, "code": code_text, "caption": caption})
elif btype == "callout":
icon = data.get("icon", {})
emoji = icon.get("emoji", "") if icon else ""
text = self._extract_rich_text(data.get("rich_text", []))
md = f"> {emoji} **Callout:** {text}" if emoji else f"> **Callout:** {text}"
elif btype == "toggle":
md = f"<details>\n<summary>{self._extract_rich_text(data.get('rich_text', []))}</summary>"
elif btype == "quote":
md = f"> {self._extract_rich_text(data.get('rich_text', []))}"
elif btype == "bulleted_list_item":
md = f"- {self._extract_rich_text(data.get('rich_text', []))}"
elif btype == "numbered_list_item":
md = f"1. {self._extract_rich_text(data.get('rich_text', []))}"
elif btype == "to_do":
text = self._extract_rich_text(data.get("rich_text", []))
md = f"- [{'x' if data.get('checked') else ' '}] {text}"
elif btype == "divider":
md = "---"
elif btype == "table":
md = self._handle_table_block(block)
elif btype == "image":
itype = data.get("type", "")
url = data.get(itype, {}).get("url", "") if itype in ("external", "file") else ""
cap = self._extract_rich_text(data.get("caption", []))
md = f"![{cap or 'Image'}]({url})" if url else ""
elif btype in ("bookmark", "embed", "link_preview"):
url = data.get("url", "")
cap = (
self._extract_rich_text(data.get("caption", [])) if btype != "link_preview" else ""
)
md = f"[{cap or url}]({url})" if url else ""
elif btype == "equation":
expr = data.get("expression", "")
md = f"$$\n{expr}\n$$" if expr else ""
elif btype in ("child_page", "child_database"):
md = f"**Sub-{btype.split('_')[1]}: {data.get('title', '')}**"
elif btype in ("pdf", "video", "audio", "file"):
ftype = data.get("type", "")
url = data.get(ftype, {}).get("url", "") if ftype in ("external", "file") else ""
md = f"[{btype.title()}]({url})" if url else ""
elif btype == "link_to_page":
lt = data.get("type", "")
md = f"*[Link to page: {data.get(lt, '')}]*" if data.get(lt) else ""
elif btype in (
"column_list",
"column",
"synced_block",
"template",
"table_of_contents",
"breadcrumb",
):
md = "*[Table of Contents]*" if btype == "table_of_contents" else ""
else:
logger.debug("Unhandled block type: %s", btype)
return md, headings, code_blocks
def _handle_table_block(self, block: dict[str, Any]) -> str:
"""Convert a Notion table block into a markdown table."""
tdata = block.get("table", {})
has_header = tdata.get("has_column_header", False)
rows = block.get("_table_rows", [])
if not rows:
return f"*[Table: {tdata.get('table_width', 0)} columns]*"
lines = []
for i, row in enumerate(rows):
cells = [self._extract_rich_text(c) for c in row.get("cells", [])]
lines.append("| " + " | ".join(cells) + " |")
if i == 0 and has_header:
lines.append("| " + " | ".join("---" for _ in cells) + " |")
return "\n".join(lines)
# -- Rich text -------------------------------------------------------
def _extract_rich_text(self, rich_text_list: list[dict[str, Any]]) -> str:
"""Extract text with annotations (bold, italic, code, links) from Notion rich text."""
if not rich_text_list:
return ""
parts = []
for obj in rich_text_list:
text = obj.get("plain_text", "")
if not text:
continue
ann = obj.get("annotations", {})
if ann.get("code"):
text = f"`{text}`"
if ann.get("bold"):
text = f"**{text}**"
if ann.get("italic"):
text = f"*{text}*"
if ann.get("strikethrough"):
text = f"~~{text}~~"
if ann.get("underline"):
text = f"<u>{text}</u>"
if obj.get("href"):
text = f"[{text}]({obj['href']})"
parts.append(text)
return "".join(parts)
# ====================================================================
# EXPORT MODE
# ====================================================================
def _extract_from_export(self) -> list[dict[str, Any]]:
"""Parse a Notion Markdown/CSV export directory."""
if not self.export_path:
raise ValueError("export_path is required for export mode.")
export_dir = Path(self.export_path)
if not export_dir.exists():
raise FileNotFoundError(f"Export directory not found: {self.export_path}")
if not export_dir.is_dir():
raise ValueError(f"Export path is not a directory: {self.export_path}")
print(f" Parsing Notion export: {self.export_path}")
pages: list[dict[str, Any]] = []
for root, _dirs, files in os.walk(export_dir):
rel = str(Path(root).relative_to(export_dir))
parent = "" if rel == "." else rel
for fn in sorted(files):
if self._pages_fetched >= self.max_pages:
break
fp = Path(root) / fn
if fp.suffix.lower() == ".md":
pd = self._parse_export_markdown(fp, parent)
if pd:
pages.append(pd)
self._pages_fetched += 1 # noqa: E702
elif fp.suffix.lower() == ".csv":
for pd in self._parse_export_csv(fp, parent):
if self._pages_fetched >= self.max_pages:
break
pages.append(pd)
self._pages_fetched += 1 # noqa: E702
if self._pages_fetched >= self.max_pages:
break
print(f" Parsed {len(pages)} files from export directory")
return pages
def _parse_export_markdown(self, filepath: Path, parent_path: str) -> dict[str, Any] | None:
"""Parse a single .md file from a Notion export."""
try:
content = filepath.read_text(encoding="utf-8", errors="ignore")
except Exception as e:
logger.warning("Could not read %s: %s", filepath, e)
return None # noqa: E702
if not content.strip():
return None
lines = content.split("\n")
title = self._clean_notion_export_title(filepath.stem)
for line in lines:
if line.startswith("# "):
title = line[2:].strip()
break # noqa: E702
headings = [
{"level": f"h{len(m.group(1))}", "text": m.group(2).strip()}
for line in lines
if (m := re.match(r"^(#{2,6})\s+(.+)$", line))
]
code_blocks = [
{"language": lang or "plain text", "code": code.strip(), "caption": ""}
for lang, code in re.findall(r"```(\w*)\n(.*?)```", content, re.DOTALL)
if code.strip()
]
self._blocks_fetched += len(lines) + len(code_blocks)
body = re.sub(r"```\w*\n.*?```", "", content, flags=re.DOTALL)
body = re.sub(r"^#\s+.+$", "", body, count=1, flags=re.MULTILINE).strip()
return {
"id": str(filepath),
"title": title,
"url": "",
"properties": {},
"content": body,
"headings": headings,
"code_blocks": code_blocks,
"parent_path": parent_path,
}
def _parse_export_csv(self, filepath: Path, parent_path: str) -> list[dict[str, Any]]:
"""Parse a CSV file from a Notion database export (one page per row)."""
pages: list[dict[str, Any]] = []
try:
with open(filepath, encoding="utf-8", errors="ignore", newline="") as f:
reader = csv.DictReader(f)
if not reader.fieldnames:
return pages
title_col = reader.fieldnames[0]
for i, row in enumerate(reader):
title = row.get(title_col, f"Row {i + 1}") or f"Row {i + 1}"
props = {k: v for k, v in row.items() if k and v}
body = "\n\n".join(
f"**{k}:** {v}"
for k, v in row.items()
if k and v and k != title_col and len(str(v)) > 10
)
pages.append(
{
"id": f"{filepath}:row:{i}",
"title": title,
"url": "",
"properties": props,
"content": body,
"headings": [],
"code_blocks": [],
"parent_path": parent_path,
}
)
self._blocks_fetched += 1
except Exception as e:
logger.warning("Could not parse CSV %s: %s", filepath, e)
return pages
@staticmethod
def _clean_notion_export_title(stem: str) -> str:
"""Strip trailing Notion hex IDs from export filenames."""
cleaned = re.sub(r"\s+[0-9a-f]{16,}$", "", stem)
return cleaned.strip() or stem
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point for the Notion scraper.

    Parses the arguments, then either prints a dry-run summary, rebuilds a
    skill from a saved JSON file (``--from-json``), or runs the full
    extract-and-build pipeline followed by the optional enhancement steps.

    Returns:
        ``0`` on success. Failures are reported on stderr and end the process
        through ``sys.exit(1)``.
    """
    from .arguments.common import add_all_standard_arguments

    parser = argparse.ArgumentParser(
        description="Convert Notion workspace content to AI-ready skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            " skill-seekers notion --database-id ID --token $NOTION_TOKEN --name myskill\n"
            " skill-seekers notion --page-id ID --token $NOTION_TOKEN --name myskill\n"
            " skill-seekers notion --export-path ./export/ --name myskill\n"
            " skill-seekers notion --from-json output/myskill_notion_data.json --name myskill"
        ),
    )
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Notion
    for action in parser._actions:
        if hasattr(action, "dest") and action.dest == "enhance_level":
            action.default = 0

    # Notion-specific arguments
    parser.add_argument(
        "--database-id", type=str, help="Notion database ID (API mode)", metavar="ID"
    )
    parser.add_argument(
        "--page-id", type=str, help="Notion page ID (API mode, recursive)", metavar="ID"
    )
    parser.add_argument(
        "--export-path", type=str, help="Notion export directory (export mode)", metavar="PATH"
    )
    parser.add_argument(
        "--token", type=str, help="Notion integration token (or NOTION_TOKEN env)", metavar="TOKEN"
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=DEFAULT_MAX_PAGES,
        help=f"Maximum pages to extract (default: {DEFAULT_MAX_PAGES})",
        metavar="N",
    )
    parser.add_argument(
        "--from-json", type=str, help="Build from previously extracted JSON", metavar="FILE"
    )
    args = parser.parse_args()

    # Logging: --quiet wins over --verbose
    level = (
        logging.WARNING
        if getattr(args, "quiet", False)
        else (logging.DEBUG if getattr(args, "verbose", False) else logging.INFO)
    )
    logging.basicConfig(level=level, format="%(message)s", force=True)

    # Dry run: report what would be done, touch nothing
    if getattr(args, "dry_run", False):
        source = (
            getattr(args, "database_id", None)
            or getattr(args, "page_id", None)
            or getattr(args, "export_path", None)
            or getattr(args, "from_json", None)
            or "(none)"
        )
        print(f"\n{'=' * 60}\nDRY RUN: Notion Extraction\n{'=' * 60}")
        print(
            f"Source: {source}\nName: {getattr(args, 'name', None) or '(auto)'}\nMax pages: {args.max_pages}"
        )
        return 0

    # Validate
    has_source = any(
        getattr(args, a, None) for a in ("database_id", "page_id", "export_path", "from_json")
    )
    if not has_source:
        parser.error("Must specify --database-id, --page-id, --export-path, or --from-json")
    if not getattr(args, "name", None):
        # A name can be derived from a file or directory, but not from an id.
        if getattr(args, "from_json", None):
            args.name = Path(args.from_json).stem.replace("_notion_data", "")
        elif getattr(args, "export_path", None):
            args.name = Path(args.export_path).stem
        else:
            parser.error("--name is required when using --database-id or --page-id")

    # --from-json: build only
    if getattr(args, "from_json", None):
        config = {
            "name": args.name,
            "description": getattr(args, "description", None),
            "max_pages": args.max_pages,
        }
        try:
            conv = NotionToSkillConverter(config)
            conv.load_extracted_data(args.from_json)
            conv.build_skill()
        except Exception as e:
            print(f"\n Error: {e}", file=sys.stderr)
            sys.exit(1)
        return 0

    # Full extract + build
    config: dict[str, Any] = {
        "name": args.name,
        "database_id": getattr(args, "database_id", None),
        "page_id": getattr(args, "page_id", None),
        "export_path": getattr(args, "export_path", None),
        "token": getattr(args, "token", None),
        "description": getattr(args, "description", None),
        "max_pages": args.max_pages,
    }
    try:
        conv = NotionToSkillConverter(config)
        if not conv.extract_notion():
            print("\n Notion extraction failed", file=sys.stderr)
            sys.exit(1)
        conv.build_skill()

        # Run enhancement workflows if specified
        try:
            from skill_seekers.cli.workflow_runner import run_workflows

            run_workflows(args)
        except (ImportError, AttributeError) as e:
            # Workflows are optional, but leave a trace instead of hiding the reason.
            logger.debug("Enhancement workflows skipped: %s", e)

        # Traditional AI enhancement
        if getattr(args, "enhance_level", 0) > 0:
            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
            skill_dir = conv.skill_dir
            if api_key:
                try:
                    from skill_seekers.cli.enhance_skill import enhance_skill_md

                    enhance_skill_md(skill_dir, api_key)
                except ImportError:
                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

                    LocalSkillEnhancer(Path(skill_dir)).run(headless=True)
            else:
                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

                LocalSkillEnhancer(Path(skill_dir)).run(headless=True)
    except RuntimeError as e:
        print(f"\n Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"\n Unexpected error: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)
    return 0


if __name__ == "__main__":
    sys.exit(main())