Files
Leo bf1473b1be feat(skills): add research-summarizer and docker-development agent skills
research-summarizer (product-team/):
- Structured research summarization for papers, articles, reports
- Slash commands: /research:summarize, /research:compare, /research:cite
- Python tools: extract_citations.py (5 citation formats), format_summary.py (6 templates)
- References: summary-templates.md, citation-formats.md

docker-development (engineering/):
- Dockerfile optimization, compose orchestration, container security
- Slash commands: /docker:optimize, /docker:compose, /docker:security
- Python tools: dockerfile_analyzer.py (15 rules), compose_validator.py (best practices)
- References: dockerfile-best-practices.md, compose-patterns.md

Both skills include .claude-plugin/plugin.json and follow POWERFUL tier conventions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-15 22:47:16 +01:00

310 lines
9.5 KiB
Python

#!/usr/bin/env python3
"""
research-summarizer: Citation Extractor
Extract and format citations from text documents. Detects DOIs, URLs,
author-year patterns, and numbered references. Outputs in APA, IEEE,
Chicago, Harvard, or MLA format.
Usage:
python scripts/extract_citations.py document.txt
python scripts/extract_citations.py document.txt --format ieee
python scripts/extract_citations.py document.txt --format apa --output json
python scripts/extract_citations.py --stdin < document.txt
"""
import argparse
import json
import re
import sys
from collections import OrderedDict
# --- Citation Detection Patterns ---
# Compiled regular expressions, keyed by the kind of citation they detect.
PATTERNS = {
    # A DOI written either as a resolver URL (doi.org, or the older
    # dx.doi.org host) or with a "doi:" prefix. Group 1 is the bare DOI.
    # The dx. host must be accepted here because extract_urls() skips every
    # URL containing "doi.org"; otherwise such links would be lost entirely.
    "doi": re.compile(
        r"(?:https?://(?:dx\.)?doi\.org/|doi:\s*)(10\.\d{4,}/[^\s,;}\]]+)",
        re.IGNORECASE,
    ),
    # Any http(s) URL; the match stops at whitespace or one of , ; } ] ) " ' >
    "url": re.compile(
        r"https?://[^\s,;}\])\"'>]+", re.IGNORECASE
    ),
    # Narrative citations such as "Smith (2023)", "Smith & Jones (2021)" or
    # "Patel et al. (2022)". Group 1 is the author text, group 2 the year.
    # The author must follow the start of the text, "(" or whitespace.
    "author_year": re.compile(
        r"(?:^|\(|\s)([A-Z][a-z]+(?:\s(?:&|and)\s[A-Z][a-z]+)?(?:\set\sal\.?)?)\s*\((\d{4})\)",
    ),
    # A reference-list line of the form "[12] text". Group 1 is the number,
    # group 2 the remainder of the line.
    "numbered_ref": re.compile(
        r"^\[(\d+)\]\s+(.+)$", re.MULTILINE
    ),
    # A line of the form "3. Text ... 2021 ..." that starts with a capital
    # letter after the number and contains a four-digit run followed by more
    # text. NOTE(review): no extractor visible in this file uses this entry.
    "footnote": re.compile(
        r"^\d+\.\s+([A-Z].+?(?:\d{4}).+)$", re.MULTILINE
    ),
}
def extract_dois(text):
    """Collect every DOI that ``PATTERNS["doi"]`` finds in ``text``.

    Returns a list of dicts, one per match, in order of appearance. Each dict
    has ``type`` ("doi"), ``doi`` (the identifier with trailing periods
    removed), ``raw`` (the stripped matched text) and ``url`` (the identifier
    appended to ``https://doi.org/``).
    """

    def _as_citation(hit):
        # A DOI at the end of a sentence picks up the full stop; drop it.
        identifier = hit.group(1).rstrip(".")
        return {
            "type": "doi",
            "doi": identifier,
            "raw": hit.group(0).strip(),
            "url": f"https://doi.org/{identifier}",
        }

    return [_as_citation(hit) for hit in PATTERNS["doi"].finditer(text)]
def extract_urls(text):
    """Collect plain URLs from ``text``, leaving DOI links to the DOI extractor.

    Trailing ``.``, ``,``, ``;`` and ``)`` characters are removed from each
    match, and any URL containing "doi.org" is skipped. Returns a list of
    dicts with ``type`` ("url"), ``url`` and ``raw`` (both the cleaned URL).
    """
    cleaned_links = (
        hit.group(0).rstrip(".,;)") for hit in PATTERNS["url"].finditer(text)
    )
    return [
        {"type": "url", "url": link, "raw": link}
        for link in cleaned_links
        if "doi.org" not in link
    ]
def extract_author_year(text):
    """Collect narrative author-year citations from ``text``.

    Uses ``PATTERNS["author_year"]``, which recognises forms such as
    "Smith (2023)", "Smith & Jones (2021)" and "Patel et al. (2022)".
    Returns a list of dicts with ``type`` ("author_year"), ``author``,
    ``year`` (kept as a string) and ``raw`` (rebuilt as "Author (Year)").
    """
    results = []
    for hit in PATTERNS["author_year"].finditer(text):
        name = hit.group(1).strip()
        published = hit.group(2)
        results.append(
            {
                "type": "author_year",
                "author": name,
                "year": published,
                "raw": f"{name} ({published})",
            }
        )
    return results
def extract_numbered_refs(text):
    """Collect reference-list lines of the form ``[N] text`` from ``text``.

    Returns a list of dicts with ``type`` ("numbered"), ``number`` (an int),
    ``content`` (the stripped text after the bracket) and ``raw`` (rebuilt as
    "[N] content").
    """

    def _as_entry(hit):
        label = hit.group(1)
        body = hit.group(2).strip()
        return {
            "type": "numbered",
            "number": int(label),
            "content": body,
            "raw": f"[{label}] {body}",
        }

    return [_as_entry(hit) for hit in PATTERNS["numbered_ref"].finditer(text)]
def deduplicate(citations):
    """Return ``citations`` with repeats removed, keeping first occurrences.

    Two citations count as the same when their identity key matches after
    lower-casing and stripping. The key is the first non-empty value among
    ``doi``, ``url`` and ``raw``. Citations with an empty key are dropped.
    """
    unique = []
    known_keys = set()
    for entry in citations:
        identity = entry.get("doi") or entry.get("url") or entry.get("raw", "")
        identity = identity.lower().strip()
        if not identity or identity in known_keys:
            continue
        known_keys.add(identity)
        unique.append(entry)
    return unique
def classify_source(citation):
    """Label a citation "primary", "secondary" or "tertiary" by keyword.

    The lower-cased ``content`` (or ``raw`` when ``content`` is absent) is
    searched for marker phrases. Secondary markers are checked before
    tertiary ones; anything without a marker is "primary".
    """
    haystack = citation.get("content", citation.get("raw", "")).lower()
    marker_table = (
        ("secondary", ("meta-analysis", "systematic review", "literature review", "survey of")),
        ("tertiary", ("textbook", "encyclopedia", "handbook", "dictionary")),
    )
    for label, markers in marker_table:
        for marker in markers:
            if marker in haystack:
                return label
    return "primary"
# --- Formatting ---
def format_apa(citation):
    """Render one citation dict as this tool's APA-style string.

    DOIs become a ``https://doi.org/`` link, URLs get a "Retrieved from"
    prefix, author-year pairs become "Author (Year).", numbered references
    return their content unchanged, and any other type falls back to ``raw``
    (empty string when missing).
    """
    kind = citation["type"]
    if kind == "doi":
        rendered = f"https://doi.org/{citation['doi']}"
    elif kind == "url":
        rendered = f"Retrieved from {citation['url']}"
    elif kind == "author_year":
        rendered = f"{citation['author']} ({citation['year']})."
    elif kind == "numbered":
        rendered = citation["content"]
    else:
        rendered = citation.get("raw", "")
    return rendered
def format_ieee(citation):
    """Render one citation dict as this tool's IEEE-style string.

    DOIs become "doi: <doi>", URLs become "[Online]. Available: <url>",
    author-year pairs become "Author, Year.", numbered references become
    "[N] content", and any other type falls back to ``raw`` (empty string
    when missing).
    """
    kind = citation["type"]
    if kind == "doi":
        identifier = citation["doi"]
        return f"doi: {identifier}"
    if kind == "url":
        link = citation["url"]
        return f"[Online]. Available: {link}"
    if kind == "author_year":
        who = citation["author"]
        when = citation["year"]
        return f"{who}, {when}."
    if kind == "numbered":
        ordinal = citation["number"]
        body = citation["content"]
        return f"[{ordinal}] {body}"
    return citation.get("raw", "")
def format_chicago(citation):
    """Render one citation dict as this tool's Chicago-style string.

    DOIs, URLs and author-year pairs are each rendered with a closing period
    ("https://doi.org/<doi>.", "<url>.", "Author. Year."). Numbered
    references return their content unchanged, and any other type falls back
    to ``raw`` (empty string when missing).
    """
    kind = citation["type"]
    if kind == "doi":
        stem = f"https://doi.org/{citation['doi']}"
    elif kind == "url":
        stem = f"{citation['url']}"
    elif kind == "author_year":
        stem = f"{citation['author']}. {citation['year']}"
    elif kind == "numbered":
        return citation["content"]
    else:
        return citation.get("raw", "")
    # The three branches above share the terminating period.
    return f"{stem}."
def format_harvard(citation):
    """Render one citation dict as this tool's Harvard-style string.

    DOIs become "doi:<doi>", URLs become "Available at: <url>", author-year
    pairs become "Author (Year)" with no trailing period, numbered references
    return their content unchanged, and any other type falls back to ``raw``
    (empty string when missing).
    """
    kind = citation["type"]
    output = None
    if kind == "doi":
        output = f"doi:{citation['doi']}"
    elif kind == "url":
        output = f"Available at: {citation['url']}"
    elif kind == "author_year":
        author = citation["author"]
        year = citation["year"]
        output = f"{author} ({year})"
    elif kind == "numbered":
        output = citation["content"]
    else:
        output = citation.get("raw", "")
    return output
def format_mla(citation):
    """Render one citation dict as this tool's MLA-style string.

    DOIs become "doi:<doi>.", URLs become "<url>.", author-year pairs become
    "Author. Year.", numbered references return their content unchanged, and
    any other type falls back to ``raw`` (empty string when missing).
    """
    kind = citation["type"]
    if kind == "numbered" or kind not in ("doi", "url", "author_year"):
        # Neither of these cases gets a trailing period added.
        if kind == "numbered":
            return citation["content"]
        return citation.get("raw", "")
    if kind == "doi":
        stem = f"doi:{citation['doi']}"
    elif kind == "url":
        stem = f"{citation['url']}"
    else:
        stem = f"{citation['author']}. {citation['year']}"
    return f"{stem}."
# Lookup from a citation style name to the function that renders it.
FORMATTERS = dict(
    apa=format_apa,
    ieee=format_ieee,
    chicago=format_chicago,
    harvard=format_harvard,
    mla=format_mla,
)
# --- Demo Data ---
# Sample document used by main() when neither a file nor --stdin is given.
# It contains author-year citations, numbered references, a URL and two DOIs.
DEMO_TEXT = """
Recent studies in product management have shown significant shifts in methodology.
According to Smith & Jones (2023), agile adoption has increased by 47% since 2020.
Patel et al. (2022) found that cross-functional teams deliver 2.3x faster.
Several frameworks have been proposed:
[1] Cagan, M. Inspired: How to Create Tech Products Customers Love. Wiley, 2018.
[2] Torres, T. Continuous Discovery Habits. Product Talk LLC, 2021.
[3] Gothelf, J. & Seiden, J. Lean UX. O'Reilly Media, 2021. doi: 10.1234/leanux.2021
For further reading, see https://www.svpg.com/articles/ and the meta-analysis
by Chen (2024) on product discovery effectiveness.
Related work: doi: 10.1145/3544548.3581388
"""
def run_extraction(text, fmt, output_mode):
    """Extract, deduplicate, classify and print the citations found in ``text``.

    Args:
        text: Document text to scan.
        fmt: Citation style name; names missing from ``FORMATTERS`` are
            rendered with ``format_apa``.
        output_mode: "json" prints a JSON document; any other value prints a
            text report grouped by source classification.

    Returns:
        The deduplicated citation dicts, each with a ``classification`` key
        added.
    """
    # Extractor order is significant: deduplicate() keeps the first citation
    # seen for each key.
    gathered = []
    for extractor in (
        extract_dois,
        extract_author_year,
        extract_numbered_refs,
        extract_urls,
    ):
        gathered.extend(extractor(text))

    citations = deduplicate(gathered)
    for entry in citations:
        entry["classification"] = classify_source(entry)

    render = FORMATTERS.get(fmt, format_apa)

    if output_mode == "json":
        payload = {
            "format": fmt,
            "total": len(citations),
            "citations": [
                {
                    "index": position,
                    "type": entry["type"],
                    "classification": entry["classification"],
                    "formatted": render(entry),
                    "raw": entry.get("raw", ""),
                }
                for position, entry in enumerate(citations, 1)
            ],
        }
        print(json.dumps(payload, indent=2))
        return citations

    print(f"Citations ({fmt.upper()}) — {len(citations)} found\n")
    sections = (
        ("Primary Sources", "primary"),
        ("Secondary Sources", "secondary"),
        ("Tertiary Sources", "tertiary"),
    )
    for heading, tier in sections:
        members = [entry for entry in citations if entry["classification"] == tier]
        if not members:
            continue
        print(f"### {heading}")
        # Numbering restarts at 1 inside every section.
        for position, entry in enumerate(members, 1):
            print(f" {position}. {render(entry)}")
        print()
    return citations
def main():
    """Command-line entry point: parse arguments, load text, run extraction.

    Input is taken from stdin when ``--stdin`` is given, otherwise from the
    positional file, otherwise from the built-in ``DEMO_TEXT``. If the file
    is missing, unreadable or not valid UTF-8, an error is written to stderr
    and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="research-summarizer: Extract and format citations from text"
    )
    parser.add_argument("file", nargs="?", help="Input text file (omit for demo)")
    parser.add_argument(
        "--format", "-f",
        choices=["apa", "ieee", "chicago", "harvard", "mla"],
        default="apa",
        help="Citation format (default: apa)",
    )
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output mode (default: text)",
    )
    parser.add_argument(
        "--stdin",
        action="store_true",
        help="Read from stdin instead of file",
    )
    args = parser.parse_args()

    if args.stdin:
        text = sys.stdin.read()
    elif args.file:
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                text = f.read()
        except FileNotFoundError:
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        except UnicodeDecodeError as e:
            # UnicodeDecodeError is a ValueError, not an OSError, so it needs
            # its own handler to avoid a raw traceback on non-UTF-8 input.
            print(f"Error: File is not valid UTF-8: {args.file} ({e})", file=sys.stderr)
            sys.exit(1)
        except OSError as e:
            print(f"Error reading file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        # The notice goes to stderr so that "--output json" still produces
        # parseable JSON on stdout when the demo text is used.
        print("No input file provided. Running demo...\n", file=sys.stderr)
        text = DEMO_TEXT

    run_extraction(text, args.format, args.output)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()