Skip to content

Commit

Permalink
improved recipe parser, added tests, cleaned up resources
Browse files Browse the repository at this point in the history
  • Loading branch information
vabene1111 committed Jun 24, 2020
1 parent b6d9839 commit 2c5e44d
Show file tree
Hide file tree
Showing 15 changed files with 794 additions and 23,609 deletions.
46 changes: 42 additions & 4 deletions cookbook/helper/recipe_url_import.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,50 @@
import json
import random
import re
from random import random
from json import JSONDecodeError

import microdata
from bs4 import BeautifulSoup
from django.http import JsonResponse
from django.utils.dateparse import parse_duration
from django.utils.translation import gettext as _

from cookbook.models import Keyword


def find_recipe_json(ld_json, url):
ld_json['org'] = str(ld_json)
def get_from_html(html_text, url):
    """Find recipe data embedded in a page and hand it to find_recipe_json.

    Two embedded formats are tried in order: ``application/ld+json`` script
    blocks first, then HTML microdata.

    :param html_text: raw HTML of the page (str or bytes)
    :param url: address the HTML was loaded from, passed through unchanged
        to find_recipe_json
    :return: the response produced by find_recipe_json for the first recipe
        found, or a JsonResponse with ``error`` set and status 400 when no
        recipe could be located in either format
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # first try finding ld+json as its most common
    for ld in soup.find_all('script', type='application/ld+json'):
        # .string is None when the tag is empty or has several children;
        # passing that to json.loads would raise a TypeError
        if not ld.string:
            continue

        try:
            ld_json = json.loads(ld.string)
        except JSONDecodeError:
            # a single malformed block must not abort the import, other
            # script blocks or the microdata fallback may still hold a recipe
            continue

        if not isinstance(ld_json, list):
            ld_json = [ld_json]

        for ld_json_item in ld_json:
            # valid JSON may still contain bare strings or numbers here
            if not isinstance(ld_json_item, dict):
                continue

            # recipes type might be wrapped in @graph type
            graph = ld_json_item.get('@graph')
            if isinstance(graph, list):
                for x in graph:
                    if isinstance(x, dict) and x.get('@type') == 'Recipe':
                        ld_json_item = x

            if ld_json_item.get('@type') == 'Recipe':
                return find_recipe_json(ld_json_item, url)

    # now try to find microdata
    items = microdata.get_items(html_text)
    for i in items:
        md_json = json.loads(i.json())
        if 'schema.org/Recipe' in str(md_json['type']):
            return find_recipe_json(md_json['properties'], url)

    return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400)


def find_recipe_json(ld_json, url):
if type(ld_json['name']) == list:
try:
ld_json['name'] = ld_json['name'][0]
Expand Down Expand Up @@ -59,7 +93,7 @@ def find_recipe_json(ld_json, url):
ingredient = " ".join(ingredient_split)

if ingredient:
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': round(random() * 1000)}, 'ingredient': {'text': ingredient, 'id': round(random() * 1000)}, 'original': x})
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, 'original': x})

ld_json['recipeIngredient'] = ingredients
else:
Expand Down Expand Up @@ -143,4 +177,8 @@ def find_recipe_json(ld_json, url):
else:
ld_json['prepTime'] = 0

for key in list(ld_json):
if key not in ['prepTime', 'cookTime', 'image', 'recipeInstructions', 'keywords', 'name', 'recipeIngredient']:
ld_json.pop(key, None)

return JsonResponse(ld_json)
Empty file.
28 changes: 28 additions & 0 deletions cookbook/tests/other/test_edits_recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import json

from cookbook.helper.recipe_url_import import get_from_html
from cookbook.tests.test_setup import TestBase


class TestEditsRecipe(TestBase):
    """Runs get_from_html against stored website snapshots."""

    def test_ld_json(self):
        """Compare the size of each parsed snapshot with a recorded value.

        Every entry names an HTML file and the expected length of the
        stringified, JSON-decoded response body for that file.
        """
        test_list = [
            {'file': 'cookbook/tests/resources/websites/ld_json_1.html', 'result_length': 3128},
            {'file': 'cookbook/tests/resources/websites/ld_json_2.html', 'result_length': 1450},
            {'file': 'cookbook/tests/resources/websites/ld_json_3.html', 'result_length': 1545},
            {'file': 'cookbook/tests/resources/websites/ld_json_4.html', 'result_length': 1657},
            {'file': 'cookbook/tests/resources/websites/ld_json_invalid.html', 'result_length': 115},
            {'file': 'cookbook/tests/resources/websites/ld_json_itemList.html', 'result_length': 3131},
            {'file': 'cookbook/tests/resources/websites/ld_json_multiple.html', 'result_length': 1546},
            {'file': 'cookbook/tests/resources/websites/micro_data_1.html', 'result_length': 1022},
            {'file': 'cookbook/tests/resources/websites/micro_data_2.html', 'result_length': 1384},
            {'file': 'cookbook/tests/resources/websites/micro_data_3.html', 'result_length': 1100},
            {'file': 'cookbook/tests/resources/websites/micro_data_4.html', 'result_length': 4231},
        ]

        for test in test_list:
            # subTest names the failing snapshot and lets the remaining
            # ones still run instead of stopping at the first mismatch
            with self.subTest(file=test['file']):
                # the with block closes the file, no explicit close() needed
                with open(test['file'], 'rb') as file:
                    parsed_content = json.loads(get_from_html(file.read(), 'test_url').content)
                self.assertEqual(len(str(parsed_content)), test['result_length'])
1,129 changes: 1 addition & 1,128 deletions cookbook/tests/resources/websites/ld_json_1.html

Large diffs are not rendered by default.

3,319 changes: 1 addition & 3,318 deletions cookbook/tests/resources/websites/ld_json_2.html

Large diffs are not rendered by default.

Loading

0 comments on commit 2c5e44d

Please sign in to comment.