research-summarizer (product-team/): - Structured research summarization for papers, articles, reports - Slash commands: /research:summarize, /research:compare, /research:cite - Python tools: extract_citations.py (5 citation formats), format_summary.py (6 templates) - References: summary-templates.md, citation-formats.md docker-development (engineering/): - Dockerfile optimization, compose orchestration, container security - Slash commands: /docker:optimize, /docker:compose, /docker:security - Python tools: dockerfile_analyzer.py (15 rules), compose_validator.py (best practices) - References: dockerfile-best-practices.md, compose-patterns.md Both skills include .claude-plugin/plugin.json and follow POWERFUL tier conventions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
310 lines
9.5 KiB
Python
310 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
research-summarizer: Citation Extractor
|
|
|
|
Extract and format citations from text documents. Detects DOIs, URLs,
|
|
author-year patterns, and numbered references. Outputs in APA, IEEE,
|
|
Chicago, Harvard, or MLA format.
|
|
|
|
Usage:
|
|
python scripts/extract_citations.py document.txt
|
|
python scripts/extract_citations.py document.txt --format ieee
|
|
python scripts/extract_citations.py document.txt --format apa --output json
|
|
python scripts/extract_citations.py --stdin < document.txt
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from collections import OrderedDict
|
|
|
|
|
|
# --- Citation Detection Patterns ---
|
|
|
|
# Compiled regexes keyed by citation kind; the extract_* helpers look them up by name.
PATTERNS = {
    # DOI written as a resolver link or with a "doi:" prefix. Group 1 is the bare
    # DOI ("10.<4+ digits>/<suffix>"). The optional "dx." / "www." host prefixes are
    # accepted so those links are recognised as DOIs rather than being missed here.
    "doi": re.compile(
        r"(?:https?://(?:dx\.|www\.)?doi\.org/|doi:\s*)(10\.\d{4,}/[^\s,;}\]]+)",
        re.IGNORECASE,
    ),
    # Any http(s) URL, ending at whitespace or one of , ; } ] ) " ' >
    "url": re.compile(
        r"https?://[^\s,;}\])\"'>]+", re.IGNORECASE
    ),
    # Narrative citation "Author (YYYY)", optionally "A & B", "A and B" or "A et al.".
    # Group 1 is the author text, group 2 the four-digit year.
    "author_year": re.compile(
        r"(?:^|\(|\s)([A-Z][a-z]+(?:\s(?:&|and)\s[A-Z][a-z]+)?(?:\set\sal\.?)?)\s*\((\d{4})\)",
    ),
    # Reference-list line "[n] text". Group 1 is the number, group 2 the rest of the line.
    "numbered_ref": re.compile(
        r"^\[(\d+)\]\s+(.+)$", re.MULTILINE
    ),
    # Line of the form "n. Text ... YYYY ..." starting with a capital letter.
    # NOTE(review): no extractor visible in this file uses this pattern.
    "footnote": re.compile(
        r"^\d+\.\s+([A-Z].+?(?:\d{4}).+)$", re.MULTILINE
    ),
}
|
|
|
|
|
|
def extract_dois(text):
    """Extract DOI references from ``text``.

    Every match of ``PATTERNS["doi"]`` becomes one citation dict with keys
    ``type`` ("doi"), ``doi`` (group 1, cleaned), ``raw`` (the matched text,
    stripped of surrounding whitespace) and ``url`` (a doi.org link built
    from the cleaned DOI).

    Returns:
        list[dict]: citations in order of appearance; empty if none found.
    """
    citations = []
    for match in PATTERNS["doi"].finditer(text):
        doi = match.group(1)
        # Peel sentence punctuation off the end. A closing parenthesis is
        # removed only when it has no opening partner inside the DOI, so
        # "(doi: 10.1234/abc)." yields "10.1234/abc" while a DOI that
        # legitimately contains balanced parentheses is left intact.
        while doi.endswith(".") or (
            doi.endswith(")") and doi.count(")") > doi.count("(")
        ):
            doi = doi[:-1]
        citations.append({
            "type": "doi",
            "doi": doi,
            "raw": match.group(0).strip(),
            "url": f"https://doi.org/{doi}",
        })
    return citations
|
|
|
|
|
|
def extract_urls(text):
    """Extract URL references, leaving DOI resolver links to the DOI extractor.

    Each match of ``PATTERNS["url"]`` has trailing ``.,;)`` characters
    stripped and becomes a citation dict with keys ``type`` ("url"), ``url``
    and ``raw`` (both the cleaned URL).

    Returns:
        list[dict]: citations in order of appearance; empty if none found.
    """
    citations = []
    for match in PATTERNS["url"].finditer(text):
        url = match.group(0).rstrip(".,;)")
        # Skip only URLs that the DOI pattern recognises from their first
        # character. A plain substring test for "doi.org" is case-sensitive and
        # also discards links that merely mention doi.org without carrying a
        # DOI, so such links would be reported by neither extractor.
        if PATTERNS["doi"].match(url):
            continue
        citations.append({
            "type": "url",
            "url": url,
            "raw": url,
        })
    return citations
|
|
|
|
|
|
def extract_author_year(text):
    """Extract narrative author-year citations such as "Smith & Jones (2021)".

    Each match of ``PATTERNS["author_year"]`` supplies the author text
    (group 1, whitespace-stripped) and the year (group 2, kept as a string).
    ``raw`` is rebuilt as "<author> (<year>)" rather than copied from the
    input.

    Returns:
        list[dict]: dicts with keys ``type`` ("author_year"), ``author``,
        ``year`` and ``raw``, in order of appearance.
    """
    pairs = (
        (found.group(1).strip(), found.group(2))
        for found in PATTERNS["author_year"].finditer(text)
    )
    return [
        {
            "type": "author_year",
            "author": name,
            "year": when,
            "raw": f"{name} ({when})",
        }
        for name, when in pairs
    ]
|
|
|
|
|
|
def extract_numbered_refs(text):
    """Extract reference-list entries written as "[n] Author. Title...".

    Each match of ``PATTERNS["numbered_ref"]`` yields a dict with keys
    ``type`` ("numbered"), ``number`` (group 1 converted to int), ``content``
    (group 2, stripped) and ``raw`` ("[n] content").

    Returns:
        list[dict]: entries in order of appearance; empty if none found.
    """
    entries = []
    for found in PATTERNS["numbered_ref"].finditer(text):
        label = found.group(1)
        body = found.group(2).strip()
        entry = {
            "type": "numbered",
            "number": int(label),
            "content": body,
            "raw": f"[{label}] {body}",
        }
        entries.append(entry)
    return entries
|
|
|
|
|
|
def deduplicate(citations):
    """Drop repeated citations, keeping the first occurrence of each.

    The identity of a citation is its ``doi`` if present and non-empty,
    otherwise its ``url``, otherwise its ``raw`` text; the chosen value is
    lower-cased and whitespace-stripped before comparison. Citations whose
    identity ends up empty are discarded.

    Returns:
        list[dict]: the surviving citation dicts, in first-seen order.
    """
    unique = OrderedDict()
    for entry in citations:
        identity = entry.get("doi") or entry.get("url") or entry.get("raw", "")
        identity = identity.lower().strip()
        if not identity or identity in unique:
            continue
        unique[identity] = entry
    return list(unique.values())
|
|
|
|
|
|
def classify_source(citation):
    """Label a citation "primary", "secondary" or "tertiary" by keyword.

    The citation's ``content`` (or ``raw`` when there is no ``content``) is
    lower-cased and searched for fixed keywords. Secondary keywords are
    checked before tertiary ones; anything without a keyword is "primary".
    """
    haystack = citation.get("content", citation.get("raw", "")).lower()
    tiers = (
        ("secondary", ("meta-analysis", "systematic review", "literature review", "survey of")),
        ("tertiary", ("textbook", "encyclopedia", "handbook", "dictionary")),
    )
    for tier, keywords in tiers:
        for keyword in keywords:
            if keyword in haystack:
                return tier
    return "primary"
|
|
|
|
|
|
# --- Formatting ---
|
|
|
|
def format_apa(citation):
    """Render one citation dict as an APA 7 style string.

    DOIs become a doi.org link, URLs get a "Retrieved from" prefix,
    author-year pairs become "Author (Year).", numbered entries return their
    content unchanged, and any other type falls back to ``raw`` (or "").
    """
    kind = citation["type"]
    if kind == "doi":
        doi = citation["doi"]
        return f"https://doi.org/{doi}"
    elif kind == "url":
        link = citation["url"]
        return f"Retrieved from {link}"
    elif kind == "author_year":
        who, when = citation["author"], citation["year"]
        return f"{who} ({when})."
    elif kind == "numbered":
        return citation["content"]
    else:
        return citation.get("raw", "")
|
|
|
|
|
|
def format_ieee(citation):
    """Render one citation dict as an IEEE style string.

    DOIs become "doi: <doi>", URLs get an "[Online]. Available:" prefix,
    author-year pairs become "Author, Year.", numbered entries are rendered
    as "[n] content", and any other type falls back to ``raw`` (or "").
    """
    kind = citation["type"]
    if kind == "doi":
        doi = citation["doi"]
        return f"doi: {doi}"
    elif kind == "url":
        link = citation["url"]
        return f"[Online]. Available: {link}"
    elif kind == "author_year":
        who, when = citation["author"], citation["year"]
        return f"{who}, {when}."
    elif kind == "numbered":
        index, body = citation["number"], citation["content"]
        return f"[{index}] {body}"
    else:
        return citation.get("raw", "")
|
|
|
|
|
|
def format_chicago(citation):
    """Render one citation dict as a Chicago style string.

    DOIs become a doi.org link ending in a period, URLs are followed by a
    period, author-year pairs become "Author. Year.", numbered entries return
    their content unchanged, and any other type falls back to ``raw`` (or "").
    """
    kind = citation["type"]
    if kind == "doi":
        doi = citation["doi"]
        return f"https://doi.org/{doi}."
    elif kind == "url":
        link = citation["url"]
        return f"{link}."
    elif kind == "author_year":
        who, when = citation["author"], citation["year"]
        return f"{who}. {when}."
    elif kind == "numbered":
        return citation["content"]
    else:
        return citation.get("raw", "")
|
|
|
|
|
|
def format_harvard(citation):
    """Render one citation dict as a Harvard style string.

    DOIs become "doi:<doi>", URLs get an "Available at:" prefix, author-year
    pairs become "Author (Year)", numbered entries return their content
    unchanged, and any other type falls back to ``raw`` (or "").
    """
    kind = citation["type"]
    if kind == "doi":
        doi = citation["doi"]
        return f"doi:{doi}"
    elif kind == "url":
        link = citation["url"]
        return f"Available at: {link}"
    elif kind == "author_year":
        who, when = citation["author"], citation["year"]
        return f"{who} ({when})"
    elif kind == "numbered":
        return citation["content"]
    else:
        return citation.get("raw", "")
|
|
|
|
|
|
def format_mla(citation):
    """Render one citation dict as an MLA 9 style string.

    DOIs become "doi:<doi>.", URLs are followed by a period, author-year
    pairs become "Author. Year.", numbered entries return their content
    unchanged, and any other type falls back to ``raw`` (or "").
    """
    kind = citation["type"]
    if kind == "doi":
        doi = citation["doi"]
        return f"doi:{doi}."
    elif kind == "url":
        link = citation["url"]
        return f"{link}."
    elif kind == "author_year":
        who, when = citation["author"], citation["year"]
        return f"{who}. {when}."
    elif kind == "numbered":
        return citation["content"]
    else:
        return citation.get("raw", "")
|
|
|
|
|
|
# Lookup table from a citation-style name to the function that renders it.
FORMATTERS = dict(
    apa=format_apa,
    ieee=format_ieee,
    chicago=format_chicago,
    harvard=format_harvard,
    mla=format_mla,
)
|
|
|
|
|
|
# --- Demo Data ---
|
|
|
|
# Sample document used when the script is run without an input file. It mixes
# narrative author-year citations, a numbered reference list, a URL and DOIs.
DEMO_TEXT = """
Recent studies in product management have shown significant shifts in methodology.
According to Smith & Jones (2023), agile adoption has increased by 47% since 2020.
Patel et al. (2022) found that cross-functional teams deliver 2.3x faster.

Several frameworks have been proposed:
[1] Cagan, M. Inspired: How to Create Tech Products Customers Love. Wiley, 2018.
[2] Torres, T. Continuous Discovery Habits. Product Talk LLC, 2021.
[3] Gothelf, J. & Seiden, J. Lean UX. O'Reilly Media, 2021. doi: 10.1234/leanux.2021

For further reading, see https://www.svpg.com/articles/ and the meta-analysis
by Chen (2024) on product discovery effectiveness.

Related work: doi: 10.1145/3544548.3581388
"""
|
|
|
|
|
|
def run_extraction(text, fmt, output_mode):
    """Extract, de-duplicate, classify and print the citations in ``text``.

    Extraction runs in a fixed order (DOIs, author-year, numbered references,
    URLs); the combined list is de-duplicated and every surviving citation
    gets a ``classification`` key added in place.

    Args:
        text: document text to scan.
        fmt: citation style name; unknown names fall back to the APA formatter.
        output_mode: "json" prints a JSON report, anything else prints a
            text report grouped by classification.

    Returns:
        list[dict]: the de-duplicated, classified citation dicts.
    """
    gathered = []
    for extract in (extract_dois, extract_author_year, extract_numbered_refs, extract_urls):
        gathered += extract(text)

    citations = deduplicate(gathered)
    for entry in citations:
        entry["classification"] = classify_source(entry)

    render = FORMATTERS.get(fmt, format_apa)

    if output_mode == "json":
        report = {
            "format": fmt,
            "total": len(citations),
            "citations": [
                {
                    "index": position,
                    "type": entry["type"],
                    "classification": entry["classification"],
                    "formatted": render(entry),
                    "raw": entry.get("raw", ""),
                }
                for position, entry in enumerate(citations, 1)
            ],
        }
        print(json.dumps(report, indent=2))
        return citations

    print(f"Citations ({fmt.upper()}) — {len(citations)} found\n")
    sections = (
        ("Primary Sources", "primary"),
        ("Secondary Sources", "secondary"),
        ("Tertiary Sources", "tertiary"),
    )
    for heading, tier in sections:
        members = [entry for entry in citations if entry["classification"] == tier]
        if not members:
            continue
        print(f"### {heading}")
        # Numbering restarts at 1 inside every section.
        for position, entry in enumerate(members, 1):
            print(f" {position}. {render(entry)}")
        print()

    return citations
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Input is taken from stdin when ``--stdin`` is given, otherwise from the
    positional file, otherwise from the built-in demo text. The chosen text
    is handed to ``run_extraction`` together with ``--format`` and
    ``--output``.

    Exits with status 1 (message on stderr) when the input file is missing,
    cannot be read, or is not valid UTF-8.
    """
    parser = argparse.ArgumentParser(
        description="research-summarizer: Extract and format citations from text"
    )
    parser.add_argument("file", nargs="?", help="Input text file (omit for demo)")
    parser.add_argument(
        "--format", "-f",
        choices=["apa", "ieee", "chicago", "harvard", "mla"],
        default="apa",
        help="Citation format (default: apa)",
    )
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output mode (default: text)",
    )
    parser.add_argument(
        "--stdin",
        action="store_true",
        help="Read from stdin instead of file",
    )
    args = parser.parse_args()

    if args.stdin:
        text = sys.stdin.read()
    elif args.file:
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                text = f.read()
        except FileNotFoundError:
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError, so it must be
            # named explicitly or a non-UTF-8 file ends in a raw traceback.
            print(f"Error reading file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        # Status banner goes to stderr so that stdout carries only the report;
        # otherwise "--output json" without a file would emit invalid JSON.
        print("No input file provided. Running demo...\n", file=sys.stderr)
        text = DEMO_TEXT

    run_extraction(text, args.format, args.output)


if __name__ == "__main__":
    main()
|