basil/packages/api/scripts/scrape_recipe.py
Paul R Kartchner e20be988ce
fix: recipe import from unsupported websites and external URL deletion
- Enable wild mode in recipe scraper (supported_only=False) to work with any
  website that uses schema.org structured data, not just officially supported sites
- Fix storage service to skip deletion of external URLs (imported recipe images)
  instead of treating them as local file paths (see the sketch below)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-16 23:49:38 -07:00
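
The storage-service half of this fix lives in the Node.js API, not in this file. As a rough illustration only, a minimal Python sketch of the check the commit message describes might look like the following; delete_image and UPLOAD_DIR are hypothetical names for this example, not actual Basil code:

# Illustrative sketch only; the real fix is in the Node.js storage service.
# delete_image and UPLOAD_DIR are assumed names, not Basil code.
import os
from urllib.parse import urlparse

UPLOAD_DIR = "/data/uploads"  # assumed local storage root

def delete_image(path_or_url: str) -> None:
    """Delete a locally stored image, skipping external URLs."""
    if urlparse(path_or_url).scheme in ("http", "https"):
        # Imported recipe images point at a remote host we don't own,
        # so there is no local file to delete.
        return
    os.remove(os.path.join(UPLOAD_DIR, path_or_url))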

111 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""
Recipe scraper script using the recipe-scrapers library.
This script is called by the Node.js API to scrape recipes from URLs.
Uses wild mode (supported_only=False) to work with any website that uses schema.org structured data.
"""
import sys
import json
import re
import urllib.request

from recipe_scrapers import scrape_html


def safe_extract(scraper, method_name, default=None):
    """Safely extract data from scraper, returning default if method fails."""
    try:
        if hasattr(scraper, method_name):
            result = getattr(scraper, method_name)()
            return result if result else default
        return default
    except Exception:
        return default


def parse_servings(servings_str):
    """Parse a servings string into an integer. Returns None if it can't be parsed."""
    if not servings_str:
        return None
    try:
        # Extract the first number from a string like "8 servings" or "Serves 8"
        match = re.search(r'\d+', str(servings_str))
        if match:
            return int(match.group())
        return None
    except Exception:
        return None


def fetch_html(url):
    """Fetch HTML content from URL with proper headers."""
    req = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    )
    with urllib.request.urlopen(req, timeout=30) as response:
        return response.read().decode('utf-8')


def scrape_recipe(url):
    """Scrape a recipe from the given URL and return JSON data."""
    try:
        # Fetch HTML content
        html = fetch_html(url)
        # Use scrape_html to scrape the recipe
        # supported_only=False enables wild mode for any website with schema.org data
        scraper = scrape_html(html, org_url=url, supported_only=False)
        # Extract recipe data with safe extraction; look up the category once
        # so it isn't extracted twice below
        category = safe_extract(scraper, 'category')
        recipe_data = {
            "success": True,
            "recipe": {
                "title": scraper.title(),
                "description": safe_extract(scraper, 'description'),
                "totalTime": safe_extract(scraper, 'total_time'),
                "prepTime": None,  # recipe-scrapers doesn't separate prep time
                "cookTime": None,  # recipe-scrapers doesn't separate cook time
                "servings": parse_servings(safe_extract(scraper, 'yields')),
                "imageUrl": safe_extract(scraper, 'image'),
                "author": safe_extract(scraper, 'author'),
                "cuisine": safe_extract(scraper, 'cuisine'),
                "categories": [category] if category else [],
                "rating": None,  # Not commonly available
                "ingredients": [
                    {
                        "name": ingredient,
                        "order": i
                    }
                    for i, ingredient in enumerate(scraper.ingredients())
                ],
                "instructions": [
                    {
                        "step": i + 1,
                        "text": instruction
                    }
                    for i, instruction in enumerate(scraper.instructions_list())
                ]
            }
        }
        return recipe_data
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "recipe": {}
        }


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(json.dumps({
            "success": False,
            "error": "No URL provided",
            "recipe": {}
        }))
        sys.exit(1)
    url = sys.argv[1]
    result = scrape_recipe(url)
    print(json.dumps(result))
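
For reference, a caller can exercise this script the same way the Node.js API does: spawn it with a URL argument and parse the single JSON object it prints to stdout. A minimal Python sketch of such a caller (the recipe URL is illustrative, and the actual API presumably does the equivalent via child_process):

import json
import subprocess

# Run the scraper as a subprocess and parse its JSON output.
proc = subprocess.run(
    ["python3", "scrape_recipe.py", "https://example.com/chocolate-cake"],
    capture_output=True, text=True, timeout=60,
)
result = json.loads(proc.stdout)
if result["success"]:
    recipe = result["recipe"]
    print(recipe["title"], "-", len(recipe["ingredients"]), "ingredients")
else:
    print("scrape failed:", result["error"])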