04-scrape_ingredients.py
import json
import re
from pathlib import Path
from statistics import mean

import polars as pl
import requests
from bs4 import BeautifulSoup
from mixed_fractions import Mixed

CLEANED_RECIPES_CSV = Path("data/recipes-1.csv")
INGREDIENTS_CSV = Path("data/ingredients-1.csv")


def recipe_link(name, r_id):
    """Build the food.com URL for a recipe from its name and numeric id."""
    name = name.lower().replace(" ", "-")
    return f"https://www.food.com/recipe/{name}-{r_id}?scale=1&units=us"
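

# A hypothetical example: recipe_link("Best Brownies", 10549) would return
# "https://www.food.com/recipe/best-brownies-10549?scale=1&units=us".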


def clean_text(text):
    """Collapse runs of whitespace and strip any leftover <a> tags."""
    text = re.sub(r"\s+", " ", text.strip())
    text = re.sub(r"<a[^>]*>|</a>", "", text)
    return text
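

# For example, clean_text('fresh  <a href="#">basil</a> leaves') returns
# "fresh basil leaves": the whitespace is collapsed and the anchor tags dropped.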


def clean_quantity(text):
    """Parse a quantity string into a float; ranges collapse to their mean."""
    if text == "":
        return ""

    def fraction_to_float(text):
        # food.com uses the Unicode fraction slash, which Mixed cannot parse.
        text = text.replace("⁄", "/")
        return float(Mixed(text))

    text = re.sub(r"\s+", " ", text.strip())
    if "-" in text:
        # A range such as "1-2" is reduced to the mean of its endpoints.
        number = mean(fraction_to_float(t) for t in text.split("-"))
    else:
        number = fraction_to_float(text)
    return number
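

# Two illustrative calls, assuming mixed_fractions parses these forms:
#   clean_quantity("1 1⁄2")  -> 1.5   (mixed number with a Unicode fraction slash)
#   clean_quantity("1-2")    -> 1.5   (range, averaged to its midpoint)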


def scrape_ingredients(url):
    """Fetch a recipe page and return its ingredient list as a JSON string."""
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    # Take the last matching list in case the page renders more than one.
    ingredients = soup.find_all("ul", class_="ingredient-list").pop()
    output = {}
    for ingredient in ingredients.find_all("li"):
        try:
            quantity = ingredient.find("span", class_="ingredient-quantity").text
            name = ingredient.find("span", class_="ingredient-text").text
            output[clean_text(name)] = clean_quantity(quantity)
        except AttributeError:
            # Skip list items without both spans (e.g. section headings).
            pass
    return json.dumps(output)
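

# The JSON maps cleaned ingredient text to parsed quantities, for example
# '{"all-purpose flour": 2.0, "salt": 0.5}' (illustrative values only).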


if __name__ == "__main__":
    recipes = pl.scan_csv(CLEANED_RECIPES_CSV).select(["name", "id"]).collect()

    print("Generating links...")
    recipes = recipes.with_columns(
        pl.struct(["name", "id"])
        .map_elements(lambda x: recipe_link(x["name"], x["id"]), strategy="threading")
        .alias("link")
    )

    print("Scraping ingredients...")
    recipes = recipes.with_columns(
        pl.col("link")
        .map_elements(scrape_ingredients, strategy="threading")
        .alias("ingredients")
    )

    print("Saving...")
    recipes.sort("id").write_csv(INGREDIENTS_CSV)
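
# Expected to run from the repository root, since both CSV paths above are
# relative, e.g.:  python 04-scrape_ingredients.py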