From 94e1fdfbff91fa39dba2d8a2108d12528d2c0163 Mon Sep 17 00:00:00 2001 From: Aaron <42084688+l0c4lh057@users.noreply.github.com> Date: Mon, 21 Dec 2020 20:00:46 +0100 Subject: [PATCH 1/3] Improve text to ingredient parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous implementation of parsing ingredients was very simple. I have now written a parser that I would consider good. It takes care of several edge cases and notations. - Supports fraction unicode (½, ¼, ⅜, ...) - Supports notations like `1½` and `1 1/2` - Supports a unit directly after the amount without space in between (`2g`, `2½g`) - Supports notes (`5g onion (cubed)` -> amount: 5, unit: g, ingredient: onion, note: cubed) - Supports notes (`5g onion, cubed` -> amount: 5, unit: g, ingredient: onion, note: cubed) - Does not break when both commas and brackets exist --- cookbook/helper/ingredient_parser.py | 131 +++++++++++++++++++++++++++ cookbook/helper/recipe_url_import.py | 40 ++------ 2 files changed, 138 insertions(+), 33 deletions(-) create mode 100644 cookbook/helper/ingredient_parser.py diff --git a/cookbook/helper/ingredient_parser.py b/cookbook/helper/ingredient_parser.py new file mode 100644 index 0000000000..2e642c39e9 --- /dev/null +++ b/cookbook/helper/ingredient_parser.py @@ -0,0 +1,131 @@ +import unicodedata +import string + +def parse_fraction(x): + if len(x) == 1 and "fraction" in unicodedata.decomposition(x): + frac_split = unicodedata.decomposition(x[-1:]).split() + return float((frac_split[1]).replace("003", "")) / float((frac_split[3]).replace("003", "")) + else: + frac_split = x.split("/") + if not len(frac_split) == 2: + raise ValueError + try: + return int(frac_split[0]) / int(frac_split[1]) + except ZeroDivisionError: + raise ValueError + +def parse_amount(x): + amount = 0 + unit = "" + + did_check_frac = False + end = 0 + while end < len(x) and (x[end] in string.digits or ((x[end] == "."
or x[end] == ",") and end + 1 < len(x) and x[end+1] in string.digits)): + end += 1 + if end > 0: + amount = float(x[:end].replace(",", ".")) + else: + amount = parse_fraction(x[0]) + end += 1 + did_check_frac = True + if end < len(x): + if did_check_frac: + unit = x[end:] + else: + try: + amount += parse_fraction(x[end]) + unit = x[end+1:] + except ValueError: + unit = x[end:] + return amount, unit + +def parse_ingredient_with_comma(tokens): + ingredient = "" + note = "" + start = 0 + # search for first occurence of an argument ending in a comma + while start < len(tokens) and not tokens[start].endswith(","): + start += 1 + if start == len(tokens): + # no token ending in a comma found -> use everything as ingredient + ingredient = " ".join(tokens) + else: + ingredient = " ".join(tokens[:start+1])[:-1] + note = " ".join(tokens[start+1:]) + return ingredient, note + +def parse_ingredient(tokens): + ingredient = "" + note = "" + if tokens[-1].endswith(")"): + # last argument ends with closing bracket -> look for opening bracket + start = len(tokens) - 1 + while not tokens[start].startswith("(") and not start == 0: + start -= 1 + if start == 0: + # the whole list is wrapped in brackets -> assume it is an error (e.g. 
assumed first argument was the unit) + raise ValueError + elif start < 0: + # no opening bracket anywhere -> just ignore the last bracket + ingredient, note = parse_ingredient_with_comma(tokens) + else: + # opening bracket found -> split in ingredient and note, remove brackets from note + note = " ".join(tokens[start:])[1:-1] + ingredient = " ".join(tokens[:start]) + else: + ingredient, note = parse_ingredient_with_comma(tokens) + return ingredient, note + +def parse(x): + # initialize default values + amount = 0 + unit = "" + ingredient = "" + note = "" + + tokens = x.split() + if len(tokens) == 1: + # there only is one argument, that must be the ingredient + ingredient = tokens[0] + else: + try: + # try to parse first argument as amount + amount, unit = parse_amount(tokens[0]) + # only try to parse second argument as amount if there are at least three arguments + # if it already has a unit there can't be a fraction for the amount + if len(tokens) > 2: + try: + if not unit == "": + # a unit is already found, no need to try the second argument for a fraction + # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except + raise ValueError + # try to parse second argument as amount and add that, in case of "2 1/2" or "2 ½" + amount += parse_fraction(tokens[1]) + # assume that units can't end with a comma + if len(tokens) > 3 and not tokens[2].endswith(","): + # try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails + try: + ingredient, note = parse_ingredient(tokens[3:]) + unit = tokens[2] + except ValueError: + ingredient, note = parse_ingredient(tokens[2:]) + else: + ingredient, note = parse_ingredient(tokens[2:]) + except ValueError: + # assume that units can't end with a comma + if not tokens[1].endswith(","): + # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it 
fails + try: + ingredient, note = parse_ingredient(tokens[2:]) + unit = tokens[1] + except ValueError: + ingredient, note = parse_ingredient(tokens[1:]) + else: + ingredient, note = parse_ingredient(tokens[1:]) + else: + # only two arguments, first one is the amount which means this is the ingredient + ingredient = tokens[1] + except ValueError: + # can't parse first argument as amount -> no unit -> parse everything as ingredient + ingredient, note = parse_ingredient(tokens) + return amount, unit.strip(), ingredient.strip(), note.strip() diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index fe2d845b3b..bdb11b583a 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -11,6 +11,7 @@ from django.utils.translation import gettext as _ from cookbook.models import Keyword +from cookbook.helper.ingredient_parser import parse as parse_ingredient def get_from_html(html_text, url): @@ -70,39 +71,12 @@ def find_recipe_json(ld_json, url): ingredients = [] for x in ld_json['recipeIngredient']: - ingredient_split = x.split() - ingredient = None - amount = 0 - unit = '' - if len(ingredient_split) > 2: - ingredient = " ".join(ingredient_split[2:]) - unit = ingredient_split[1] - - try: - if 'fraction' in unicodedata.decomposition(ingredient_split[0]): - frac_split = unicodedata.decomposition(ingredient_split[0]).split() - amount = round(float((frac_split[1]).replace('003', '')) / float((frac_split[3]).replace('003', '')), 3) - else: - raise TypeError - except TypeError: # raised by unicodedata.decomposition if there was no unicode character in parsed data - try: - amount = float(ingredient_split[0].replace(',', '.')) - except ValueError: - amount = 0 - ingredient = " ".join(ingredient_split) - if len(ingredient_split) == 2: - ingredient = " ".join(ingredient_split[1:]) - unit = '' - try: - amount = float(ingredient_split[0].replace(',', '.')) - except ValueError: - amount = 0 - ingredient = " 
".join(ingredient_split) - if len(ingredient_split) == 1: - ingredient = " ".join(ingredient_split) - - if ingredient: - ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, 'original': x}) + try: + amount, unit, ingredient, note = parse_ingredient(x) + if ingredient: + ingredients.append({'amount': amount, 'unit': {'text': unit, 'id': random.randrange(10000, 99999)}, 'ingredient': {'text': ingredient, 'id': random.randrange(10000, 99999)}, "note": note, 'original': x}) + except: + pass ld_json['recipeIngredient'] = ingredients else: From 5e07c6130fcf4ece38de4f487d979ab938caa858 Mon Sep 17 00:00:00 2001 From: Aaron <42084688+l0c4lh057@users.noreply.github.com> Date: Mon, 21 Dec 2020 20:14:32 +0100 Subject: [PATCH 2/3] Switch to 4-space indentation --- cookbook/helper/ingredient_parser.py | 238 +++++++++++++-------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/cookbook/helper/ingredient_parser.py b/cookbook/helper/ingredient_parser.py index 2e642c39e9..bb075c3c03 100644 --- a/cookbook/helper/ingredient_parser.py +++ b/cookbook/helper/ingredient_parser.py @@ -2,130 +2,130 @@ import string def parse_fraction(x): - if len(x) == 1 and "fraction" in unicodedata.decomposition(x): - frac_split = unicodedata.decomposition(x[-1:]).split() - return float((frac_split[1]).replace("003", "")) / float((frac_split[3]).replace("003", "")) - else: - frac_split = x.split("/") - if not len(frac_split) == 2: - raise ValueError - try: - return int(frac_split[0]) / int(frac_split[1]) - except ZeroDivisionError: - raise ValueError + if len(x) == 1 and "fraction" in unicodedata.decomposition(x): + frac_split = unicodedata.decomposition(x[-1:]).split() + return float((frac_split[1]).replace("003", "")) / float((frac_split[3]).replace("003", "")) + else: + frac_split = x.split("/") + if not len(frac_split) == 2: + raise ValueError + try: + return 
int(frac_split[0]) / int(frac_split[1]) + except ZeroDivisionError: + raise ValueError def parse_amount(x): - amount = 0 - unit = "" - - did_check_frac = False - end = 0 - while end < len(x) and (x[end] in string.digits or ((x[end] == "." or x[end] == ",") and end + 1 < len(x) and x[end+1] in string.digits)): - end += 1 - if end > 0: - amount = float(x[:end].replace(",", ".")) - else: - amount = parse_fraction(x[0]) - end += 1 - did_check_frac = True - if end < len(x): - if did_check_frac: - unit = x[end:] - else: - try: - amount += parse_fraction(x[end]) - unit = x[end+1:] - except ValueError: - unit = x[end:] - return amount, unit + amount = 0 + unit = "" + + did_check_frac = False + end = 0 + while end < len(x) and (x[end] in string.digits or ((x[end] == "." or x[end] == ",") and end + 1 < len(x) and x[end+1] in string.digits)): + end += 1 + if end > 0: + amount = float(x[:end].replace(",", ".")) + else: + amount = parse_fraction(x[0]) + end += 1 + did_check_frac = True + if end < len(x): + if did_check_frac: + unit = x[end:] + else: + try: + amount += parse_fraction(x[end]) + unit = x[end+1:] + except ValueError: + unit = x[end:] + return amount, unit def parse_ingredient_with_comma(tokens): - ingredient = "" - note = "" - start = 0 - # search for first occurence of an argument ending in a comma - while start < len(tokens) and not tokens[start].endswith(","): - start += 1 - if start == len(tokens): - # no token ending in a comma found -> use everything as ingredient - ingredient = " ".join(tokens) - else: - ingredient = " ".join(tokens[:start+1])[:-1] - note = " ".join(tokens[start+1:]) - return ingredient, note + ingredient = "" + note = "" + start = 0 + # search for first occurence of an argument ending in a comma + while start < len(tokens) and not tokens[start].endswith(","): + start += 1 + if start == len(tokens): + # no token ending in a comma found -> use everything as ingredient + ingredient = " ".join(tokens) + else: + ingredient = " 
".join(tokens[:start+1])[:-1] + note = " ".join(tokens[start+1:]) + return ingredient, note def parse_ingredient(tokens): - ingredient = "" - note = "" - if tokens[-1].endswith(")"): - # last argument ends with closing bracket -> look for opening bracket - start = len(tokens) - 1 - while not tokens[start].startswith("(") and not start == 0: - start -= 1 - if start == 0: - # the whole list is wrapped in brackets -> assume it is an error (e.g. assumed first argument was the unit) - raise ValueError - elif start < 0: - # no opening bracket anywhere -> just ignore the last bracket - ingredient, note = parse_ingredient_with_comma(tokens) - else: - # opening bracket found -> split in ingredient and note, remove brackets from note - note = " ".join(tokens[start:])[1:-1] - ingredient = " ".join(tokens[:start]) - else: - ingredient, note = parse_ingredient_with_comma(tokens) - return ingredient, note + ingredient = "" + note = "" + if tokens[-1].endswith(")"): + # last argument ends with closing bracket -> look for opening bracket + start = len(tokens) - 1 + while not tokens[start].startswith("(") and not start == 0: + start -= 1 + if start == 0: + # the whole list is wrapped in brackets -> assume it is an error (e.g. 
assumed first argument was the unit) + raise ValueError + elif start < 0: + # no opening bracket anywhere -> just ignore the last bracket + ingredient, note = parse_ingredient_with_comma(tokens) + else: + # opening bracket found -> split in ingredient and note, remove brackets from note + note = " ".join(tokens[start:])[1:-1] + ingredient = " ".join(tokens[:start]) + else: + ingredient, note = parse_ingredient_with_comma(tokens) + return ingredient, note def parse(x): - # initialize default values - amount = 0 - unit = "" - ingredient = "" - note = "" - - tokens = x.split() - if len(tokens) == 1: - # there only is one argument, that must be the ingredient - ingredient = tokens[0] - else: - try: - # try to parse first argument as amount - amount, unit = parse_amount(tokens[0]) - # only try to parse second argument as amount if there are at least three arguments - # if it already has a unit there can't be a fraction for the amount - if len(tokens) > 2: - try: - if not unit == "": - # a unit is already found, no need to try the second argument for a fraction - # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except - raise ValueError - # try to parse second argument as amount and add that, in case of "2 1/2" or "2 ½" - amount += parse_fraction(tokens[1]) - # assume that units can't end with a comma - if len(tokens) > 3 and not tokens[2].endswith(","): - # try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails - try: - ingredient, note = parse_ingredient(tokens[3:]) - unit = tokens[2] - except ValueError: - ingredient, note = parse_ingredient(tokens[2:]) - else: - ingredient, note = parse_ingredient(tokens[2:]) - except ValueError: - # assume that units can't end with a comma - if not tokens[1].endswith(","): - # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails 
- try: - ingredient, note = parse_ingredient(tokens[2:]) - unit = tokens[1] - except ValueError: - ingredient, note = parse_ingredient(tokens[1:]) - else: - ingredient, note = parse_ingredient(tokens[1:]) - else: - # only two arguments, first one is the amount which means this is the ingredient - ingredient = tokens[1] - except ValueError: - # can't parse first argument as amount -> no unit -> parse everything as ingredient - ingredient, note = parse_ingredient(tokens) - return amount, unit.strip(), ingredient.strip(), note.strip() + # initialize default values + amount = 0 + unit = "" + ingredient = "" + note = "" + + tokens = x.split() + if len(tokens) == 1: + # there only is one argument, that must be the ingredient + ingredient = tokens[0] + else: + try: + # try to parse first argument as amount + amount, unit = parse_amount(tokens[0]) + # only try to parse second argument as amount if there are at least three arguments + # if it already has a unit there can't be a fraction for the amount + if len(tokens) > 2: + try: + if not unit == "": + # a unit is already found, no need to try the second argument for a fraction + # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except + raise ValueError + # try to parse second argument as amount and add that, in case of "2 1/2" or "2 ½" + amount += parse_fraction(tokens[1]) + # assume that units can't end with a comma + if len(tokens) > 3 and not tokens[2].endswith(","): + # try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails + try: + ingredient, note = parse_ingredient(tokens[3:]) + unit = tokens[2] + except ValueError: + ingredient, note = parse_ingredient(tokens[2:]) + else: + ingredient, note = parse_ingredient(tokens[2:]) + except ValueError: + # assume that units can't end with a comma + if not tokens[1].endswith(","): + # try to use second argument as unit and 
everything else as ingredient, use everything as ingredient if it fails + try: + ingredient, note = parse_ingredient(tokens[2:]) + unit = tokens[1] + except ValueError: + ingredient, note = parse_ingredient(tokens[1:]) + else: + ingredient, note = parse_ingredient(tokens[1:]) + else: + # only two arguments, first one is the amount which means this is the ingredient + ingredient = tokens[1] + except ValueError: + # can't parse first argument as amount -> no unit -> parse everything as ingredient + ingredient, note = parse_ingredient(tokens) + return amount, unit.strip(), ingredient.strip(), note.strip() From 79396cec9e3047427891346859712c4158208342 Mon Sep 17 00:00:00 2001 From: Aaron <42084688+l0c4lh057@users.noreply.github.com> Date: Mon, 21 Dec 2020 22:42:27 +0100 Subject: [PATCH 3/3] switch from double to single quotes --- cookbook/helper/ingredient_parser.py | 50 ++++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/cookbook/helper/ingredient_parser.py b/cookbook/helper/ingredient_parser.py index bb075c3c03..1f1b510a9d 100644 --- a/cookbook/helper/ingredient_parser.py +++ b/cookbook/helper/ingredient_parser.py @@ -2,11 +2,11 @@ import string def parse_fraction(x): - if len(x) == 1 and "fraction" in unicodedata.decomposition(x): + if len(x) == 1 and 'fraction' in unicodedata.decomposition(x): frac_split = unicodedata.decomposition(x[-1:]).split() - return float((frac_split[1]).replace("003", "")) / float((frac_split[3]).replace("003", "")) + return float((frac_split[1]).replace('003', '')) / float((frac_split[3]).replace('003', '')) else: - frac_split = x.split("/") + frac_split = x.split('/') if not len(frac_split) == 2: raise ValueError try: @@ -16,14 +16,14 @@ def parse_fraction(x): def parse_amount(x): amount = 0 - unit = "" + unit = '' did_check_frac = False end = 0 - while end < len(x) and (x[end] in string.digits or ((x[end] == "." 
or x[end] == ",") and end + 1 < len(x) and x[end+1] in string.digits)): + while end < len(x) and (x[end] in string.digits or ((x[end] == '.' or x[end] == ',') and end + 1 < len(x) and x[end+1] in string.digits)): end += 1 if end > 0: - amount = float(x[:end].replace(",", ".")) + amount = float(x[:end].replace(',', '.')) else: amount = parse_fraction(x[0]) end += 1 @@ -40,27 +40,27 @@ def parse_amount(x): return amount, unit def parse_ingredient_with_comma(tokens): - ingredient = "" - note = "" + ingredient = '' + note = '' start = 0 # search for first occurence of an argument ending in a comma - while start < len(tokens) and not tokens[start].endswith(","): + while start < len(tokens) and not tokens[start].endswith(','): start += 1 if start == len(tokens): # no token ending in a comma found -> use everything as ingredient - ingredient = " ".join(tokens) + ingredient = ' '.join(tokens) else: - ingredient = " ".join(tokens[:start+1])[:-1] - note = " ".join(tokens[start+1:]) + ingredient = ' '.join(tokens[:start+1])[:-1] + note = ' '.join(tokens[start+1:]) return ingredient, note def parse_ingredient(tokens): - ingredient = "" - note = "" - if tokens[-1].endswith(")"): + ingredient = '' + note = '' + if tokens[-1].endswith(')'): # last argument ends with closing bracket -> look for opening bracket start = len(tokens) - 1 - while not tokens[start].startswith("(") and not start == 0: + while not tokens[start].startswith('(') and not start == 0: start -= 1 if start == 0: # the whole list is wrapped in brackets -> assume it is an error (e.g. 
assumed first argument was the unit) @@ -70,8 +70,8 @@ def parse_ingredient(tokens): ingredient, note = parse_ingredient_with_comma(tokens) else: # opening bracket found -> split in ingredient and note, remove brackets from note - note = " ".join(tokens[start:])[1:-1] - ingredient = " ".join(tokens[:start]) + note = ' '.join(tokens[start:])[1:-1] + ingredient = ' '.join(tokens[:start]) else: ingredient, note = parse_ingredient_with_comma(tokens) return ingredient, note @@ -79,9 +79,9 @@ def parse_ingredient(tokens): def parse(x): # initialize default values amount = 0 - unit = "" - ingredient = "" - note = "" + unit = '' + ingredient = '' + note = '' tokens = x.split() if len(tokens) == 1: @@ -95,14 +95,14 @@ def parse(x): # if it already has a unit there can't be a fraction for the amount if len(tokens) > 2: try: - if not unit == "": + if not unit == '': # a unit is already found, no need to try the second argument for a fraction # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except raise ValueError - # try to parse second argument as amount and add that, in case of "2 1/2" or "2 ½" + # try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½' amount += parse_fraction(tokens[1]) # assume that units can't end with a comma - if len(tokens) > 3 and not tokens[2].endswith(","): + if len(tokens) > 3 and not tokens[2].endswith(','): # try to use third argument as unit and everything else as ingredient, use everything as ingredient if it fails try: ingredient, note = parse_ingredient(tokens[3:]) @@ -113,7 +113,7 @@ def parse(x): ingredient, note = parse_ingredient(tokens[2:]) except ValueError: # assume that units can't end with a comma - if not tokens[1].endswith(","): + if not tokens[1].endswith(','): # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails try: ingredient, note = 
parse_ingredient(tokens[2:])