browser-automation (564-line SKILL.md, 3 scripts, 3 references): - Web scraping, form filling, screenshot capture, data extraction - Anti-detection patterns, cookie/session management, dynamic content - scraping_toolkit.py, form_automation_builder.py, anti_detection_checker.py - NOT testing (that's playwright-pro) — this is automation & scraping spec-driven-workflow (586-line SKILL.md, 3 scripts, 3 references): - Spec-first development: write spec BEFORE code - Bounded autonomy rules, 6-phase workflow, self-review checklist - spec_generator.py, spec_validator.py, test_extractor.py - Pairs with tdd-guide for red-green-refactor after spec Updated engineering plugin.json (31 → 33 skills). Added both to mkdocs.yml nav and generated docs pages. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
581 lines · 19 KiB · Markdown
# Data Extraction Recipes
|
|
|
|
Practical patterns for extracting structured data from web pages using Playwright. Each recipe is a self-contained pattern you can adapt to your target site.
|
|
|
|
## CSS Selector Patterns for Common Structures
|
|
|
|
### E-Commerce Product Listings
|
|
|
|
```python
|
|
# Fallback CSS selectors for e-commerce product cards. Each value lists several
# comma-separated alternatives so one map covers differing storefront markups.
# The `::attr(name)` suffix is a convention of these recipes (not native CSS):
# it tells the extraction helper to read an attribute instead of element text.
PRODUCT_SELECTORS = {
    # One match per product card; every field below is resolved relative to it.
    "container": "div.product-card, article.product, li.product-item",
    "fields": {
        "title": "h2.product-title, h3.product-name, [data-testid='product-title']",
        "price": "span.price, .product-price, [data-testid='price']",
        # <del> often wraps the struck-through pre-sale price.
        "original_price": "span.original-price, .was-price, del",
        "rating": "span.rating, .star-rating, [data-rating]",
        "review_count": "span.review-count, .num-reviews",
        # data-src covers lazy-loaded images whose src is a placeholder.
        "image_url": "img.product-image::attr(src), img::attr(data-src)",
        "product_url": "a.product-link::attr(href), h2 a::attr(href)",
        "availability": "span.stock-status, .availability",
    }
}
|
|
```
|
|
|
|
### News/Blog Article Listings
|
|
|
|
```python
|
|
# Fallback CSS selectors for news/blog listing pages. Same conventions as the
# other selector maps: comma-separated alternatives tried against each card,
# and a recipe-specific `::attr(name)` suffix to extract an attribute value.
ARTICLE_SELECTORS = {
    # One match per article teaser/card.
    "container": "article, div.post, div.article-card",
    "fields": {
        "headline": "h2 a, h3 a, .article-title",
        "summary": "p.excerpt, .article-summary, .post-excerpt",
        # [rel='author'] is the semantic-HTML author link.
        "author": "span.author, .byline, [rel='author']",
        # <time> preferred: its datetime attribute is machine-readable.
        "date": "time, span.date, .published-date",
        "category": "span.category, a.tag, .article-category",
        "url": "h2 a::attr(href), .article-title a::attr(href)",
        "image_url": "img.thumbnail::attr(src), .article-image img::attr(src)",
    }
}
|
|
```
|
|
|
|
### Job Listings
|
|
|
|
```python
|
|
# Fallback CSS selectors for job-board listing pages; conventions as above
# (comma-separated alternatives, recipe-specific `::attr(name)` suffix).
JOB_SELECTORS = {
    # One match per job posting card.
    "container": "div.job-card, li.job-listing, article.job",
    "fields": {
        "title": "h2.job-title, a.job-link, [data-testid='job-title']",
        "company": "span.company-name, .employer, [data-testid='company']",
        "location": "span.location, .job-location, [data-testid='location']",
        "salary": "span.salary, .compensation, [data-testid='salary']",
        "job_type": "span.job-type, .employment-type",
        "posted_date": "time, span.posted, .date-posted",
        "url": "a.job-link::attr(href), h2 a::attr(href)",
    }
}
|
|
```
|
|
|
|
### Search Engine Results
|
|
|
|
```python
|
|
# Fallback CSS selectors for search-engine result pages.
# NOTE(review): class names like `div.g` and `div.VwiC3b` look like one
# engine's generated class names — such names churn frequently, so expect to
# re-verify them against the live page before relying on this map.
SERP_SELECTORS = {
    # One match per organic search result.
    "container": "div.g, .search-result, li.result",
    "fields": {
        "title": "h3, .result-title",
        "url": "a::attr(href), cite",
        "snippet": "div.VwiC3b, .result-snippet, .search-description",
        # The human-readable URL shown under the title (often <cite>).
        "displayed_url": "cite, .result-url",
    }
}
|
|
```
|
|
|
|
## Table Extraction Recipes
|
|
|
|
### Simple HTML Table to JSON
|
|
|
|
The most common extraction pattern. Works for any standard `<table>` with `<thead>` and `<tbody>`.
|
|
|
|
```python
|
|
async def extract_table(page, table_selector="table"):
    """Extract an HTML table into a list of dictionaries.

    Column names come from ``<thead>`` cells; if there is no ``<thead>``, the
    first body row is consumed as the header row and excluded from the data
    (the previous version re-emitted that row as a bogus record). Returns []
    when the selector matches nothing.

    Args:
        page: Playwright page containing the table.
        table_selector: CSS selector for the table element.

    Returns:
        list[dict]: one dict per data row, keyed by header text.
    """
    # Plain string (not an f-string): nothing is interpolated, and the
    # selector is passed safely as an evaluate() argument.
    data = await page.evaluate(
        """
        (selector) => {
            const table = document.querySelector(selector);
            if (!table) return null;

            // Prefer explicit <thead> cells as column names.
            const headers = Array.from(table.querySelectorAll('thead th, thead td'))
                .map(th => th.textContent.trim());

            let rows = Array.from(table.querySelectorAll('tbody tr'));

            // No <thead>: consume the first body row as headers and drop it
            // from the data rows.
            if (headers.length === 0 && rows.length > 0) {
                headers.push(...Array.from(rows[0].querySelectorAll('th, td'))
                    .map(cell => cell.textContent.trim()));
                rows = rows.slice(1);
            }

            return rows.map(row => {
                // Include <th> cells so row-header tables keep their first column.
                const cells = Array.from(row.querySelectorAll('th, td'));
                const obj = {};
                cells.forEach((cell, i) => {
                    if (i < headers.length) {
                        obj[headers[i]] = cell.textContent.trim();
                    }
                });
                return obj;
            });
        }
        """,
        table_selector,
    )
    return data or []
|
|
```
|
|
|
|
### Table with Links and Attributes
|
|
|
|
When table cells contain links or data attributes, not just text:
|
|
|
|
```python
|
|
async def extract_rich_table(page, table_selector="table"):
    """Extract a table into dicts, also capturing cell links and data-* attributes.

    For a cell under header ``key``: the text lands in ``obj[key]``, a
    contained link's absolute href in ``obj[key + '_url']``, and each
    ``data-*`` attribute on the cell in ``obj[key + '_' + name]``.
    """
    # Plain string — nothing is interpolated; the selector travels as an
    # evaluate() argument, so no brace-escaping is needed.
    script = """
        (selector) => {
            const table = document.querySelector(selector);
            if (!table) return [];

            const headers = Array.from(table.querySelectorAll('thead th'))
                .map(th => th.textContent.trim());

            return Array.from(table.querySelectorAll('tbody tr')).map(row => {
                const obj = {};
                Array.from(row.querySelectorAll('td')).forEach((cell, i) => {
                    const key = headers[i] || `col_${i}`;
                    obj[key] = cell.textContent.trim();

                    // Extract link if present
                    const link = cell.querySelector('a');
                    if (link) {
                        obj[key + '_url'] = link.href;
                    }

                    // Extract data attributes
                    for (const attr of cell.attributes) {
                        if (attr.name.startsWith('data-')) {
                            obj[key + '_' + attr.name] = attr.value;
                        }
                    }
                });
                return obj;
            });
        }
    """
    return await page.evaluate(script, table_selector)
|
|
```
|
|
|
|
### Multi-Page Table (Paginated)
|
|
|
|
```python
|
|
async def extract_paginated_table(page, table_selector, next_selector, max_pages=50):
    """Extract data from a table that spans multiple pages.

    Clicks ``next_selector`` until it is missing or disabled, or ``max_pages``
    pages have been read. Requires ``import random`` for the politeness jitter.

    Args:
        page: Playwright page already showing the first table page.
        table_selector: CSS selector for the table.
        next_selector: CSS selector for the "next page" control.
        max_pages: hard cap on pages visited.

    Returns:
        list[dict]: all rows across pages, keyed by header text (or by
        positional ``col_N`` keys when the table has no ``<thead>``).
    """
    all_rows = []
    headers = None

    for _page_num in range(max_pages):
        # Extract the current page (plain string — selector passed as arg).
        page_data = await page.evaluate(
            """
            (selector) => {
                const table = document.querySelector(selector);
                if (!table) return { headers: [], rows: [] };

                const hs = Array.from(table.querySelectorAll('thead th'))
                    .map(th => th.textContent.trim());

                const rs = Array.from(table.querySelectorAll('tbody tr')).map(row =>
                    Array.from(row.querySelectorAll('td')).map(td => td.textContent.trim())
                );

                return { headers: hs, rows: rs };
            }
            """,
            table_selector,
        )

        if headers is None and page_data["headers"]:
            headers = page_data["headers"]

        for row in page_data["rows"]:
            # Fall back to positional column names when no <thead> exists;
            # the previous version zipped against [] and silently dropped
            # every cell in that case.
            keys = headers if headers else [f"col_{i}" for i in range(len(row))]
            all_rows.append(dict(zip(keys, row)))

        # Check for next page
        next_btn = page.locator(next_selector)
        if await next_btn.count() == 0 or await next_btn.is_disabled():
            break

        await next_btn.click()
        await page.wait_for_load_state("networkidle")
        await page.wait_for_timeout(random.randint(800, 2000))  # polite jitter

    return all_rows
|
|
```
|
|
|
|
## Product Listing Extraction
|
|
|
|
### Generic Listing Extractor
|
|
|
|
Works for any repeating card/list pattern:
|
|
|
|
```python
|
|
async def _query_field(card, selector):
    """Resolve one field selector against a card element.

    Alternatives are comma-separated and tried in order; each alternative may
    carry a ``::attr(name)`` suffix (return an attribute value) or ``::html``
    (return innerHTML). Returns the first match found, else None.
    """
    for alternative in selector.split(","):
        alternative = alternative.strip()
        if "::attr(" in alternative:
            css, _, attr = alternative.partition("::attr(")
            el = await card.query_selector(css.strip())
            if el:
                value = await el.get_attribute(attr.rstrip(")"))
                if value is not None:
                    return value
        elif alternative.endswith("::html"):
            el = await card.query_selector(alternative[: -len("::html")].strip())
            if el:
                return await el.inner_html()
        else:
            el = await card.query_selector(alternative)
            if el:
                text = await el.text_content()
                return text.strip() if text else None
    return None


async def extract_listings(page, container_sel, field_map):
    """
    Extract data from repeating elements.

    field_map: dict mapping field names to CSS selectors.
    Special suffixes:
        ::attr(name) — extract attribute instead of text
        ::html — extract innerHTML

    Selectors may list comma-separated fallbacks, each with its own suffix.
    (The previous version split the whole selector on "::attr(", which broke
    — yielding None — whenever a fallback list contained more than one
    ::attr suffix, as the selector maps above routinely do.)
    """
    items = []
    for card in await page.query_selector_all(container_sel):
        item = {}
        for field_name, selector in field_map.items():
            try:
                item[field_name] = await _query_field(card, selector)
            except Exception:
                item[field_name] = None  # tolerate per-field failures
        items.append(item)
    return items
|
|
```
|
|
|
|
### With Price Parsing
|
|
|
|
```python
|
|
import re
|
|
|
|
def parse_price(text):
    """Extract a numeric price from text like '$1,234.56' or '1.234,56 EUR'.

    Handles US (1,234.56) and European (1.234,56) decimal styles, including
    thousands-only strings such as '1,234,567' and '1.234.567' (the previous
    version fed multi-dot strings straight to float() and returned None).

    Args:
        text: raw price text, possibly with currency symbols; may be None.

    Returns:
        float price, or None when no number can be recovered.
    """
    if not text:
        return None
    # Keep only digits and the two candidate separators.
    cleaned = re.sub(r'[^\d.,]', '', text.strip())
    if not cleaned:
        return None
    if ',' in cleaned and '.' in cleaned:
        # Both present: whichever separator appears last is the decimal point.
        if cleaned.rindex(',') > cleaned.rindex('.'):
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif ',' in cleaned:
        parts = cleaned.split(',')
        # A single comma with <=2 trailing digits is a decimal comma (1,23);
        # anything else treats commas as thousands separators (1,234,567).
        if len(parts) == 2 and len(parts[-1]) <= 2:
            cleaned = cleaned.replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif cleaned.count('.') > 1:
        # Multiple dots, no comma: European thousands grouping (1.234.567).
        cleaned = cleaned.replace('.', '')
    try:
        return float(cleaned)
    except ValueError:
        return None
|
|
|
|
async def extract_products_with_prices(page, container_sel, field_map, price_field="price"):
    """Extract listings, keeping the raw price text and parsing a numeric value.

    The original text is preserved under ``{price_field}_raw``; the
    ``price_field`` entry itself is replaced by ``parse_price``'s float (or
    None when unparseable).
    """
    records = await extract_listings(page, container_sel, field_map)
    for record in records:
        raw = record.get(price_field)
        if raw:
            record[f"{price_field}_raw"] = raw
            record[price_field] = parse_price(raw)
    return records
|
|
```
|
|
|
|
## Pagination Handling
|
|
|
|
### Next-Button Pagination
|
|
|
|
The most common pattern. Click "Next" until the button disappears or is disabled.
|
|
|
|
```python
|
|
async def paginate_via_next_button(page, next_selector, content_selector, max_pages=100):
    """
    Async generator: yield the page once per results page while clicking "Next".

    next_selector: CSS selector for the "Next" button/link
    content_selector: CSS selector waited on after each click (confirms the
        new page rendered)

    Stops when the button is missing, disabled, or unreadable, or after
    max_pages yields. Requires `import random` for the politeness jitter.
    """
    for _ in range(max_pages):
        yield page  # caller scrapes the current page before we advance

        next_btn = page.locator(next_selector)
        if await next_btn.count() == 0:
            return

        try:
            if await next_btn.is_disabled():
                return
        except Exception:
            # Can't determine state — treat as the end of pagination.
            return

        await next_btn.click()
        await page.wait_for_selector(content_selector, state="attached")
        await page.wait_for_timeout(random.randint(500, 1500))
|
|
```
|
|
|
|
### URL-Based Pagination
|
|
|
|
When pages follow a predictable URL pattern:
|
|
|
|
```python
|
|
async def paginate_via_url(page, url_template, start=1, max_pages=100):
    """
    Async generator that walks a numeric URL pattern, yielding (page, page_num).

    url_template: URL with {page} placeholder, e.g., "https://example.com/search?page={page}"

    Stops early when a page responds with 404. Requires `import random` for
    the politeness jitter.
    """
    last = start + max_pages
    for page_num in range(start, last):
        target = url_template.format(page=page_num)
        response = await page.goto(target, wait_until="networkidle")

        if response is not None and response.status == 404:
            break

        yield page, page_num
        await page.wait_for_timeout(random.randint(800, 2500))
|
|
```
|
|
|
|
### Infinite Scroll
|
|
|
|
For sites that load content as you scroll:
|
|
|
|
```python
|
|
async def paginate_via_scroll(page, item_selector, max_scrolls=100, no_change_limit=3):
    """
    Scroll to load more content until no new items appear.

    item_selector: CSS selector for individual items (used to count progress)
    no_change_limit: stop after N consecutive scrolls with no new items

    Requires `import random` for the politeness jitter.

    Returns:
        int: the final item count, re-measured after the last scroll (the
        previous version returned the count taken *before* the final scroll,
        and raised NameError when max_scrolls was 0).
    """
    items = page.locator(item_selector)
    previous_count = 0
    no_change_streak = 0

    for _scroll_num in range(max_scrolls):
        current_count = await items.count()

        if current_count == previous_count:
            no_change_streak += 1
            if no_change_streak >= no_change_limit:
                break
        else:
            no_change_streak = 0

        previous_count = current_count

        # Scroll to bottom to trigger lazy loading.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(random.randint(1000, 2500))

        # Some sites switch to a "Load More" button mid-scroll — click it.
        load_more = page.locator("button:has-text('Load More'), button:has-text('Show More')")
        if await load_more.count() > 0 and await load_more.is_visible():
            await load_more.click()
            await page.wait_for_timeout(random.randint(1000, 2000))

    # Re-count so items loaded by the final iteration are included.
    return await items.count()
|
|
```
|
|
|
|
### Load-More Button
|
|
|
|
Simpler variant of infinite scroll where content loads via a button:
|
|
|
|
```python
|
|
async def paginate_via_load_more(page, button_selector, item_selector, max_clicks=50):
    """Click a 'Load More' button repeatedly until it disappears or stops working.

    Requires `import random` for the politeness jitter.

    Returns:
        int: the final item count.
    """
    items = page.locator(item_selector)

    for _click_num in range(max_clicks):
        btn = page.locator(button_selector)
        if await btn.count() == 0 or not await btn.is_visible():
            break

        count_before = await items.count()
        await btn.click()

        # Wait for new items to appear. The selector and count travel as an
        # argument instead of being interpolated into the JS source — a
        # selector containing quotes would otherwise break the expression.
        try:
            await page.wait_for_function(
                "([sel, prev]) => document.querySelectorAll(sel).length > prev",
                arg=[item_selector, count_before],
                timeout=10000,
            )
        except Exception:
            break  # No new items loaded

        await page.wait_for_timeout(random.randint(500, 1500))

    return await items.count()
|
|
```
|
|
|
|
## Nested Data Extraction
|
|
|
|
### Comments with Replies (Threaded)
|
|
|
|
```python
|
|
async def extract_threaded_comments(page, parent_selector=".comments"):
    """Recursively extract threaded comments (author, text, date, nested replies)."""
    # Plain string — the parent selector is passed as an evaluate() argument,
    # so no f-string (and no brace doubling) is needed.
    script = """
        (parentSelector) => {
            function extractThread(container) {
                const comments = [];
                const directChildren = container.querySelectorAll(':scope > .comment');

                for (const comment of directChildren) {
                    const authorEl = comment.querySelector('.author, .username');
                    const textEl = comment.querySelector('.comment-text, .comment-body');
                    const dateEl = comment.querySelector('time, .date');
                    const repliesContainer = comment.querySelector('.replies, .children');

                    comments.push({
                        author: authorEl ? authorEl.textContent.trim() : null,
                        text: textEl ? textEl.textContent.trim() : null,
                        date: dateEl ? (dateEl.getAttribute('datetime') || dateEl.textContent.trim()) : null,
                        replies: repliesContainer ? extractThread(repliesContainer) : [],
                    });
                }

                return comments;
            }

            const root = document.querySelector(parentSelector);
            return root ? extractThread(root) : [];
        }
    """
    return await page.evaluate(script, parent_selector)
|
|
```
|
|
|
|
### Nested Categories (Sidebar/Menu)
|
|
|
|
```python
|
|
async def extract_category_tree(page, root_selector="nav.categories"):
    """Extract a nested category structure (name, url, children) from a menu."""
    # Plain string — the root selector is passed as an evaluate() argument.
    script = """
        (rootSelector) => {
            function extractLevel(container) {
                const items = [];
                const directItems = container.querySelectorAll(':scope > li, :scope > div.category');

                for (const item of directItems) {
                    const link = item.querySelector(':scope > a');
                    const subMenu = item.querySelector(':scope > ul, :scope > div.sub-categories');

                    items.push({
                        name: link ? link.textContent.trim() : item.textContent.trim().split('\\n')[0],
                        url: link ? link.href : null,
                        children: subMenu ? extractLevel(subMenu) : [],
                    });
                }

                return items;
            }

            const root = document.querySelector(rootSelector);
            return root ? extractLevel(root.querySelector('ul') || root) : [];
        }
    """
    return await page.evaluate(script, root_selector)
|
|
```
|
|
|
|
### Accordion/Expandable Content
|
|
|
|
Some content is hidden behind accordion/expand toggles. Click to reveal, then extract.
|
|
|
|
```python
|
|
async def extract_accordion(page, toggle_selector, content_selector):
    """Expand all accordion items and extract {"title", "content"} for each.

    Args:
        page: Playwright page containing the accordion.
        toggle_selector: CSS selector matching each expand/collapse toggle.
        content_selector: CSS selector for the content panel, resolved inside
            the toggle's enclosing '.accordion-item' / '.faq-item'.
    """
    items = []
    toggles = await page.query_selector_all(toggle_selector)

    for toggle in toggles:
        title = (await toggle.text_content()).strip()

        # Click to expand
        await toggle.click()
        await page.wait_for_timeout(300)

        # Find the associated content panel. The selector is passed as an
        # argument (not interpolated into the JS source), so quotes in
        # content_selector can't break the expression.
        handle = await toggle.evaluate_handle(
            "(el, sel) => el.closest('.accordion-item, .faq-item')?.querySelector(sel)",
            content_selector,
        )
        # evaluate_handle always returns a JSHandle, which is truthy even when
        # the JS result was null — the old `if content:` check never guarded.
        # as_element() is None unless the handle wraps a real DOM node.
        panel = handle.as_element()

        body = None
        if panel:
            body = await panel.text_content()
            if body:
                body = body.strip()

        items.append({"title": title, "content": body})

    return items
|
|
```
|
|
|
|
## Data Cleaning Utilities
|
|
|
|
### Post-Extraction Cleaning
|
|
|
|
```python
|
|
import re
|
|
|
|
def clean_text(text):
    """Collapse whitespace and strip zero-width characters; None for empty input."""
    if not text:
        return None
    # Strip zero-width space/joiner/non-joiner and BOM characters.
    visible = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    # Collapse any run of whitespace to a single space and trim the ends.
    collapsed = re.sub(r'\s+', ' ', visible).strip()
    return collapsed or None
|
|
|
|
def clean_url(url, base_url=None):
    """Convert relative URLs to absolute.

    Protocol-relative URLs ("//cdn...") get an https: scheme. Any other
    relative URL is resolved against base_url via urljoin — including
    path-relative ones like "p/q", which the previous version returned
    unchanged despite the docstring's promise.

    Args:
        url: raw URL text; may be None/empty.
        base_url: page URL to resolve relative references against.

    Returns:
        Absolute URL string when resolvable, the input otherwise, or None.
    """
    from urllib.parse import urljoin

    if not url:
        return None
    url = url.strip()
    if url.startswith("//"):
        return "https:" + url
    if base_url:
        # urljoin leaves already-absolute URLs untouched.
        return urljoin(base_url, url)
    return url
|
|
|
|
def deduplicate(items, key_field):
    """Keep the first item for each distinct key_field value.

    Items whose key is missing or falsy are dropped entirely, matching the
    original behavior.
    """
    seen = set()
    unique = []
    for entry in items:
        if (key := entry.get(key_field)) and key not in seen:
            seen.add(key)
            unique.append(entry)
    return unique
|
|
```
|
|
|
|
### Output Formats
|
|
|
|
```python
|
|
import json
|
|
import csv
|
|
import io
|
|
|
|
def to_jsonl(items, file_path):
    """Write items as JSON Lines (one JSON object per line), UTF-8 encoded.

    encoding="utf-8" is explicit because ensure_ascii=False emits raw
    non-ASCII characters, which would crash under a cp1252-style platform
    default encoding.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        for item in items:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
|
def to_csv(items, file_path):
    """Write items as CSV; the header is the union of keys across all items.

    The previous version took fieldnames from the first item only, so any
    later row with an extra key made DictWriter raise ValueError. Missing
    keys are written as empty cells (DictWriter's restval default).
    """
    if not items:
        return  # nothing to write, not even a header
    # Union of keys in first-seen order (dict preserves insertion order).
    fieldnames = {}
    for item in items:
        for key in item:
            fieldnames.setdefault(key, None)
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(fieldnames))
        writer.writeheader()
        writer.writerows(items)
|
|
|
|
def to_json(items, file_path, indent=2):
    """Write items as a pretty-printed JSON array, UTF-8 encoded.

    encoding="utf-8" is explicit because ensure_ascii=False emits raw
    non-ASCII characters, which would crash under a non-UTF-8 platform
    default encoding.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=indent, ensure_ascii=False)
|
|
```
|