Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve text to ingredient parsing #277

Merged
merged 3 commits into from
Dec 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions cookbook/helper/ingredient_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import unicodedata
import string

def parse_fraction(x):
if len(x) == 1 and 'fraction' in unicodedata.decomposition(x):
frac_split = unicodedata.decomposition(x[-1:]).split()
return float((frac_split[1]).replace('003', '')) / float((frac_split[3]).replace('003', ''))
else:
frac_split = x.split('/')
if not len(frac_split) == 2:
raise ValueError
try:
return int(frac_split[0]) / int(frac_split[1])
except ZeroDivisionError:
raise ValueError

def parse_amount(x):
amount = 0
unit = ''

did_check_frac = False
end = 0
while end < len(x) and (x[end] in string.digits or ((x[end] == '.' or x[end] == ',') and end + 1 < len(x) and x[end+1] in string.digits)):
end += 1
if end > 0:
amount = float(x[:end].replace(',', '.'))
else:
amount = parse_fraction(x[0])
end += 1
did_check_frac = True
if end < len(x):
if did_check_frac:
unit = x[end:]
else:
try:
amount += parse_fraction(x[end])
unit = x[end+1:]
except ValueError:
unit = x[end:]
return amount, unit

def parse_ingredient_with_comma(tokens):
ingredient = ''
note = ''
start = 0
# search for first occurence of an argument ending in a comma
while start < len(tokens) and not tokens[start].endswith(','):
start += 1
if start == len(tokens):
# no token ending in a comma found -> use everything as ingredient
ingredient = ' '.join(tokens)
else:
ingredient = ' '.join(tokens[:start+1])[:-1]
note = ' '.join(tokens[start+1:])
return ingredient, note

def parse_ingredient(tokens):
ingredient = ''
note = ''
if tokens[-1].endswith(')'):
# last argument ends with closing bracket -> look for opening bracket
start = len(tokens) - 1
while not tokens[start].startswith('(') and not start == 0:
start -= 1
if start == 0:
# the whole list is wrapped in brackets -> assume it is an error (e.g. assumed first argument was the unit)
raise ValueError
elif start < 0:
# no opening bracket anywhere -> just ignore the last bracket
ingredient, note = parse_ingredient_with_comma(tokens)
else:
# opening bracket found -> split in ingredient and note, remove brackets from note
note = ' '.join(tokens[start:])[1:-1]
ingredient = ' '.join(tokens[:start])
else:
ingredient, note = parse_ingredient_with_comma(tokens)
return ingredient, note

def parse(x):
# initialize default values
amount = 0
unit = ''
ingredient = ''
note = ''

tokens = x.split()
if len(tokens) == 1:
# there only is one argument, that must be the ingredient
ingredient = tokens[0]
else:
try:
# try to parse first argument as amount
amount, unit = parse_amount(tokens[0])
# only try to parse second argument as amount if there are at least three arguments
# if it already has a unit there can't be a fraction for the amount
if len(tokens) > 2:
try:
if not unit == '':
# a unit is already found, no need to try the second argument for a fraction
# probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except
raise ValueError
# try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
amount += parse_fraction(tokens[1])
# assume that units can't end with a comma
if len(tokens) > 3 and not tokens[2].endswith(','):
# try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails
try:
ingredient, note = parse_ingredient(tokens[3:])
unit = tokens[2]
except ValueError:
ingredient, note = parse_ingredient(tokens[2:])
else:
ingredient, note = parse_ingredient(tokens[2:])
except ValueError:
# assume that units can't end with a comma
if not tokens[1].endswith(','):
# try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails
try:
ingredient, note = parse_ingredient(tokens[2:])
unit = tokens[1]
except ValueError:
ingredient, note = parse_ingredient(tokens[1:])
else:
ingredient, note = parse_ingredient(tokens[1:])
else:
# only two arguments, first one is the amount which means this is the ingredient
ingredient = tokens[1]
except ValueError:
# can't parse first argument as amount -> no unit -> parse everything as ingredient
ingredient, note = parse_ingredient(tokens)
return amount, unit.strip(), ingredient.strip(), note.strip()
40 changes: 7 additions & 33 deletions cookbook/helper/recipe_url_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from django.utils.translation import gettext as _

from cookbook.models import Keyword
from cookbook.helper.ingredient_parser import parse as parse_ingredient


def get_from_html(html_text, url):
Expand Down Expand Up @@ -70,39 +71,12 @@ def find_recipe_json(ld_json, url):
ingredients = []

for x in ld_json['recipeIngredient']:
ingredient_split = x.split()
ingredient = None
amount = 0
unit = ''
if len(ingredient_split) > 2:
ingredient = " ".join(ingredient_split[2:])
unit = ingredient_split[1]

try:
if 'fraction' in unicodedata.decomposition(ingredient_split[0]):
frac_split = unicodedata.decomposition(ingredient_split[0]).split()
amount = round(float((frac_split[1]).replace('003', '')) / float((frac_split[3]).replace('003', '')), 3)
else:
raise TypeError
except TypeError: # raised by unicodedata.decomposition if there was no unicode character in parsed data
try:
amount = float(ingredient_split[0].replace(',', '.'))
except ValueError:
amount = 0
ingredient = " ".join(ingredient_split)
if len(ingredient_split) == 2:
ingredient = " ".join(ingredient_split[1:])
unit = ''
try:
amount = float(ingredient_split[0].replace(',', '.'))
except ValueError:
amount = 0
ingredient = " ".join(ingredient_split)
if len(ingredient_split) == 1:
ingredient = " ".join(ingredient_split)

if ingredient:
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, 'original': x})
try:
amount, unit, ingredient, note = parse_ingredient(x)
if ingredient:
ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, "note": note, 'original': x})
except:
pass

ld_json['recipeIngredient'] = ingredients
else:
Expand Down