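"""Scrapes recipes from Yummly's search API and writes them to output.txt as JSON.

Pages through search results 36 recipes at a time, deduplicates them by recipe
id, and sleeps between requests to mimic user behavior. Run with "python3 main.py".
"""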
import requests
import simplejson as json
from time import sleep
from random import random
def write_file(data):
    # Writes recipes to the output file "output.txt" in JSON format
    path = "output.txt"
    with open(path, "w") as f:
        json.dump(data, f, indent=2)

def get_ingredients(ingredient_list):
    # Creates a list of ingredients by extracting the 'wholeLine' key from each ingredient object
    ingredients = []
    for ingredient in ingredient_list:
        ingredients.append(ingredient['wholeLine'])
    return ingredients

def get_tag_details(tag_list, tag_key):
    # Helper for get_tags that extracts the different tags selected by tag_key
    tag_details = []
    if tag_list.get(tag_key):
        for tag in tag_list[tag_key]:
            tag_details.append(tag['display-name'])
    return tag_details

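# Inferred shape of tag_list, based on the lookups above (an assumption, not
# documented Yummly API behavior): a dict mapping each tag key to a list of
# tag objects that carry a 'display-name' entry, e.g.
#   {'difficulty': [{'display-name': 'Easy'}], 'nutrition': [...], ...}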
def get_tags(tag_list):
    # Creates a list of tags from the recipe by calling get_tag_details and passing in the tag object key
    difficulty = get_tag_details(tag_list, 'difficulty')
    nutrition = get_tag_details(tag_list, 'nutrition')
    technique = get_tag_details(tag_list, 'technique')
    recipe_tags = difficulty + nutrition + technique
    return recipe_tags

def get_recipe_type(recipe_type_list):
    # Creates a list of recipe type categories by extracting the course information from the recipe object
    recipe_types = []
    if recipe_type_list.get('course'):
        for recipe_type in recipe_type_list['course']:
            recipe_types.append(recipe_type['display-name'])
    return recipe_types

def create_recipe_object(recipe_object):
    # Helper function that builds a recipe dictionary with the correct fields extracted from the recipe JSON object
    recipe_content = recipe_object['content']
    recipe_display = recipe_object['display']
    recipe = {}
    recipe['recipeUrl'] = recipe_content['details']['attribution']['url']
    recipe['recipeName'] = recipe_display['displayName']
    recipe['recipePhoto'] = recipe_display['images'][0]
    recipe['ingredients'] = get_ingredients(recipe_content['ingredientLines'])
    recipe['ratings'] = recipe_content['reviews']['averageRating']
    recipe['cookTime'] = recipe_content['details']['totalTime']
    recipe['serve'] = recipe_content['details']['numberOfServings']
    # Bonus fields
    recipe['tags'] = get_tags(recipe_content['tags'])
    recipe['recipeType'] = get_recipe_type(recipe_content['tags'])
    return recipe

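# Shape of the returned recipe dict (keys mirror the assignments above; the
# value types depend on the Yummly response and are not asserted here):
#   recipeUrl, recipeName, recipePhoto, ingredients, ratings, cookTime,
#   serve, tags, recipeType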
def yummly_crawler(recipe_data, recipes_store, searched_recipes):
    # Creates and appends recipe objects to be written to the output.txt file
    # Sends each recipe JSON object to the create_recipe_object function
    # Error checking to make sure that correct data was received (the request
    # helper returns None when all retries fail)
    if recipe_data is None:
        return
    recipe_list = recipe_data.get('feed')
    if recipe_list is None:
        return
    for recipe_object in recipe_list:
        # Skips recipes that were already visited by tracking recipe ids in searched_recipes
        recipe_id = recipe_object['content']['details']['id']
        if recipe_id not in searched_recipes:
            new_recipe = create_recipe_object(recipe_object)
            recipes_store.append(new_recipe)
            searched_recipes.add(recipe_id)

def make_yummly_request(recipe_start):
    # Makes an AJAX-style request to Yummly with the start parameter updated on
    # each iteration and returns the JSON response
    url = 'https://mapi.yummly.com/mapi/v16/content/search'
    params = {
        'solr.seo_bost': 'new',
        'start': f'{recipe_start}',
        'maxResult': '36',
        'fetchUserCollections': 'false',
        'allowedContent': 'related_search',
        'guided-search': 'true',
        'solr.view_type': 'search_internal',
    }
    # Handle network errors by retrying up to 5 times
    for attempt in range(5):
        try:
            response = requests.get(
                url=url,
                params=params,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(e)
            continue
    # All retries failed; callers treat None as "no data"
    return None

def start(min_num_recipes=100):
    # Main function that gets called and verifies that at least the minimum number of recipes are extracted
    recipe_start_idx = 0
    # Store holds all the recipes for output to the text file
    recipes_store = []
    # Searched recipes ensures that duplicate recipes are not included in the final output
    searched_recipes = set()
    while min_num_recipes > len(recipes_store):
        # data holds the JSON response from the AJAX request; it is passed to the Yummly crawler to extract recipe details
        data = make_yummly_request(recipe_start_idx)
        if data is None:
            # The request failed on every retry; stop rather than loop forever
            break
        yummly_crawler(data, recipes_store, searched_recipes)
        recipe_start_idx += 36
        # Sleep mimics user behavior to avoid being blocked for spam
        sleep(random() * 3)
    print(f'Success! {len(recipes_store)} recipes have been scraped')
    write_file(recipes_store)

# Update MIN_NUM_RECIPES to get at least that many recipes
# To run the web scraper, execute "python3 main.py" in a terminal; results will be written to the output.txt file
MIN_NUM_RECIPES = 1000

if __name__ == '__main__':
    start(MIN_NUM_RECIPES)
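# Illustrative sketch of loading the scraped results back in (not part of the
# scraper itself):
#   with open("output.txt") as f:
#       recipes = json.load(f)
#   print(recipes[0]['recipeName'])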