Files
basil/packages/api/scripts/scrape_recipe.py
Paul R Kartchner b4be894470 feat: improve recipe import UX and add comprehensive test coverage
## Changes

### Recipe Import Improvements
- Move tag input to top of import preview for better UX
- Allow users to add tags immediately after importing, before viewing full details
- Keep focus in tag input field after pressing Enter for rapid tag addition

### Recipe Scraper Enhancements
- Remove deprecated supported_only parameter from Python scraper
- Update Dockerfile to explicitly install latest recipe-scrapers package
- Ensure compatibility with latest recipe-scrapers library (14.55.0+)

### Testing Infrastructure
- Add comprehensive tests for recipe tagging features (87% coverage)
- Add real integration tests for auth routes (37% coverage on auth.routes.ts)
- Add real integration tests for backup routes (74% coverage on backup.routes.ts)
- Add real integration tests for scraper service (67% coverage)
- Overall project coverage improved from 72.7% to 77.6%

### Test Coverage Details
- 377 tests passing (up from 341)
- 7 new tests for quick tagging feature
- 17 new tests for authentication flows
- 16 new tests for backup functionality
- 6 new tests for recipe scraper integration

All tests verify:
- Tag CRUD operations work correctly
- Tags properly connected using connectOrCreate pattern
- Recipe import with live URL scraping
- Security (path traversal prevention, rate limiting)
- Error handling and validation

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-16 22:00:56 -07:00

111 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""
Recipe scraper script using the recipe-scrapers library.
This script is called by the Node.js API to scrape recipes from URLs.
Works with any site recipe-scrapers can parse; the deprecated supported_only parameter is no longer passed (removed in recipe-scrapers 14.55+).
"""
import json
import re
import sys
import urllib.request

from recipe_scrapers import scrape_html
def safe_extract(scraper, method_name, default=None):
    """Call the named extractor on *scraper*, falling back to *default*.

    Returns *default* when the scraper lacks the method, the call raises,
    or the call yields a falsy value (empty string, None, ...).
    """
    extractor = getattr(scraper, method_name, None)
    if extractor is None:
        return default
    try:
        value = extractor()
    except Exception:
        # recipe-scrapers raises assorted exceptions for missing fields;
        # treat any failure as "field not available".
        return default
    return value if value else default
def parse_servings(servings_str):
    """Parse a servings value into an integer.

    Accepts strings like "8 servings" or "Serves 8" (the first run of
    digits wins). Returns None for empty/falsy input or when no digits
    are found.
    """
    if not servings_str:
        return None
    try:
        # str() handles non-string yields (recipe-scrapers may return ints).
        match = re.search(r'\d+', str(servings_str))
    except Exception:
        return None
    # int() on a \d+ match cannot fail, so it stays outside the try.
    return int(match.group()) if match else None
def fetch_html(url):
    """Fetch HTML content from *url* with a browser-like User-Agent.

    Decodes the body using the charset declared in the response's
    Content-Type header when present, falling back to UTF-8. Undecodable
    bytes are replaced rather than raising, so a single bad byte cannot
    abort the whole scrape.

    Raises urllib.error.URLError (and subclasses) on network failures.
    """
    req = urllib.request.Request(
        url,
        headers={
            # Some recipe sites block requests without a browser UA.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    )
    with urllib.request.urlopen(req, timeout=30) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset, errors='replace')
def scrape_recipe(url):
    """Scrape a recipe from *url* and return a JSON-serializable dict.

    On success returns {"success": True, "recipe": {...}} with the fields
    the Node.js API expects. On ANY failure (network error, unsupported
    page, parse error) returns {"success": False, "error": str, "recipe": {}}
    so the caller always gets a well-formed payload.
    """
    try:
        # Fetch HTML content ourselves so we control headers and timeout.
        html = fetch_html(url)
        # org_url lets recipe-scrapers pick a site-specific parser when
        # one exists; otherwise it falls back to generic schema.org parsing.
        scraper = scrape_html(html, org_url=url)
        # Extract once and reuse — avoids running the extractor twice.
        category = safe_extract(scraper, 'category')
        return {
            "success": True,
            "recipe": {
                # title() is required; if it raises, the except below
                # converts it into a failure payload.
                "title": scraper.title(),
                "description": safe_extract(scraper, 'description'),
                "totalTime": safe_extract(scraper, 'total_time'),
                "prepTime": None,  # recipe-scrapers doesn't separate prep time
                "cookTime": None,  # recipe-scrapers doesn't separate cook time
                "servings": parse_servings(safe_extract(scraper, 'yields')),
                "imageUrl": safe_extract(scraper, 'image'),
                "author": safe_extract(scraper, 'author'),
                "cuisine": safe_extract(scraper, 'cuisine'),
                "categories": [category] if category else [],
                "rating": None,  # Not commonly available
                "ingredients": [
                    {
                        "name": ingredient,
                        "order": i
                    }
                    for i, ingredient in enumerate(scraper.ingredients())
                ],
                "instructions": [
                    {
                        "step": i + 1,
                        "text": instruction
                    }
                    for i, instruction in enumerate(scraper.instructions_list())
                ]
            }
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "recipe": {}
        }
def _main():
    """CLI entry point: scrape the URL given in argv[1], print JSON to stdout."""
    if len(sys.argv) < 2:
        # Emit a well-formed failure payload so the Node.js caller can
        # always json.parse() our stdout, then signal failure via exit code.
        print(json.dumps({
            "success": False,
            "error": "No URL provided",
            "recipe": {}
        }))
        sys.exit(1)
    print(json.dumps(scrape_recipe(sys.argv[1])))


if __name__ == "__main__":
    _main()