"""Parse free-text recipe ingredient lines.

The public entry point is :func:`parse`, which splits a line such as
``'2 1/2 cups flour, sifted'`` into ``(amount, unit, ingredient, note)``.

In the change this module was introduced with, ``find_recipe_json`` in
``cookbook/helper/recipe_url_import.py`` imports :func:`parse` as
``parse_ingredient``, calls it on every entry of
``ld_json['recipeIngredient']`` and skips entries whose ingredient is empty.
"""
import string
import unicodedata

# U+2044 FRACTION SLASH separates numerator and denominator in the
# compatibility decomposition of vulgar-fraction characters such as '½'.
_FRACTION_SLASH = '\u2044'


def parse_fraction(x):
    """Convert a fraction given as text into a float.

    Accepts either a single Unicode vulgar-fraction character (``'½'``,
    ``'⅒'``) or an ASCII fraction of the form ``'<int>/<int>'``.

    Raises:
        ValueError: if ``x`` is not a fraction, has a non-integer part, has
            no denominator, or has a zero denominator.
    """
    # unicodedata.decomposition() only accepts a single character.
    decomposition = unicodedata.decomposition(x) if len(x) == 1 else ''
    if 'fraction' in decomposition:
        # '<fraction> 0031 2044 0031 0030' -> '1⁄10'. Decoding every code
        # point keeps multi-digit denominators intact.
        text = ''.join(chr(int(cp, 16)) for cp in decomposition.split()[1:])
        numerator, _, denominator = text.partition(_FRACTION_SLASH)
    else:
        parts = x.split('/')
        if len(parts) != 2:
            raise ValueError('not a fraction: %r' % (x,))
        numerator, denominator = parts
    try:
        # int('') raises ValueError, which covers a missing denominator.
        return int(numerator) / int(denominator)
    except ZeroDivisionError:
        raise ValueError('fraction with zero denominator: %r' % (x,)) from None


def parse_amount(x):
    """Split a token into a leading numeric amount and a trailing unit.

    Handles decimals with ``.`` or ``,`` (``'1,5kg'``), ASCII fractions
    (``'1/2cup'``), a lone vulgar fraction (``'½'``) and a number directly
    followed by a vulgar fraction (``'1½cup'``).

    Returns:
        ``(amount, unit)`` where ``unit`` is whatever follows the amount in
        the same token (possibly the empty string).

    Raises:
        ValueError: if the token does not start with a parsable amount.
    """
    if not x:
        raise ValueError('empty amount token')

    # Consume digits; a separator ('.', ',' or '/') only counts as part of
    # the number when a digit follows it.
    end = 0
    while end < len(x) and (
        x[end] in string.digits
        or (x[end] in '.,/' and end + 1 < len(x) and x[end + 1] in string.digits)
    ):
        end += 1

    if end == 0:
        # No leading number: the token must start with a vulgar fraction.
        return parse_fraction(x[0]), x[1:]

    number = x[:end]
    if '/' in number:
        # ASCII fraction such as '1/2'; the remainder of the token is the unit.
        return parse_fraction(number), x[end:]

    amount = float(number.replace(',', '.'))
    if end < len(x):
        try:
            # A number may be directly followed by a vulgar fraction ('1½').
            amount += parse_fraction(x[end])
            return amount, x[end + 1:]
        except ValueError:
            pass
    return amount, x[end:]


def parse_ingredient_with_comma(tokens):
    """Split tokens into ingredient and note at the first trailing comma.

    Everything up to and including the first token that ends in a comma is
    the ingredient (with that comma removed); the remainder is the note.
    Without such a token, all tokens form the ingredient.

    Returns:
        ``(ingredient, note)``
    """
    # search for the first occurrence of a token ending in a comma
    for index, token in enumerate(tokens):
        if token.endswith(','):
            ingredient = ' '.join(tokens[:index + 1])[:-1]
            note = ' '.join(tokens[index + 1:])
            return ingredient, note
    # no token ending in a comma found -> use everything as ingredient
    return ' '.join(tokens), ''


def parse_ingredient(tokens):
    """Split tokens into ingredient and note.

    A trailing bracketed group (``'salt (optional)'``) becomes the note with
    the brackets removed; otherwise the split is made at the first comma.

    Returns:
        ``(ingredient, note)``

    Raises:
        ValueError: if the whole token list is wrapped in brackets, which
            callers treat as a sign that too many leading tokens were
            consumed (for example as the unit).
    """
    if not tokens:
        return '', ''
    if not tokens[-1].endswith(')'):
        return parse_ingredient_with_comma(tokens)

    # last token ends with a closing bracket -> look backwards for the
    # token that opens it
    start = len(tokens) - 1
    while start >= 0 and not tokens[start].startswith('('):
        start -= 1

    if start == 0:
        # the whole list is wrapped in brackets -> assume it is an error
        # (e.g. the first argument was wrongly assumed to be the unit)
        raise ValueError('ingredient consists only of a bracketed note')
    if start < 0:
        # no opening bracket anywhere -> just ignore the last bracket
        return parse_ingredient_with_comma(tokens)

    # opening bracket found -> split, removing the brackets from the note
    note = ' '.join(tokens[start:])[1:-1]
    ingredient = ' '.join(tokens[:start])
    return ingredient, note


def _split_unit(tokens):
    """Interpret the tokens after the amount as ``[unit] ingredient [note]``.

    The first token is taken as the unit when it does not end with a comma
    and the remaining tokens form a valid ingredient; otherwise all tokens
    are the ingredient and the unit is empty.
    """
    # assume that units can't end with a comma
    if len(tokens) > 1 and not tokens[0].endswith(','):
        try:
            ingredient, note = parse_ingredient(tokens[1:])
            return tokens[0], ingredient, note
        except ValueError:
            pass
    ingredient, note = parse_ingredient(tokens)
    return '', ingredient, note


def _parse_with_amount(tokens):
    """Parse two or more tokens, assuming the first one carries the amount.

    Raises:
        ValueError: if the first token is not an amount or the remaining
            tokens cannot be split into a valid ingredient.
    """
    amount, unit = parse_amount(tokens[0])
    if len(tokens) == 2:
        # only two arguments: the second one is the ingredient as written
        return amount, unit, tokens[1], ''
    if unit:
        # the unit was attached to the amount ('500g'), so the second
        # token can be neither a fraction nor a unit
        ingredient, note = parse_ingredient(tokens[1:])
        return amount, unit, ingredient, note

    rest = tokens[1:]
    try:
        # handle mixed numbers such as '2 1/2' or '2 ½'
        amount += parse_fraction(tokens[1])
        rest = tokens[2:]
    except ValueError:
        pass
    unit, ingredient, note = _split_unit(rest)
    return amount, unit, ingredient, note


def parse(x):
    """Parse one ingredient line.

    Args:
        x: free text such as ``'2 1/2 cups flour, sifted'``.

    Returns:
        ``(amount, unit, ingredient, note)``. ``amount`` is ``0`` and
        ``unit`` is ``''`` when no leading amount could be recognised; an
        empty or whitespace-only line yields ``(0, '', '', '')``.

    Raises:
        ValueError: if the line cannot be interpreted at all, e.g. when it
            consists solely of a bracketed note.
    """
    tokens = x.split()
    if not tokens:
        return 0, '', '', ''
    if len(tokens) == 1:
        # there is only one argument, that must be the ingredient
        return 0, '', tokens[0], ''

    try:
        amount, unit, ingredient, note = _parse_with_amount(tokens)
    except ValueError:
        # can't parse with a leading amount -> no amount, no unit ->
        # parse everything as ingredient
        amount, unit = 0, ''
        ingredient, note = parse_ingredient(tokens)
    return amount, unit.strip(), ingredient.strip(), note.strip()