Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support augmentation #13

Merged
merged 1 commit into from
Sep 6, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -5,14 +5,14 @@

setuptools.setup(
name='urbamt',
version='0.0.1-b2',
version='0.0.1-b3',
author="Patrick Phat Nguyen",
author_email="[email protected]",
description="Universal Rule-based Machine Translation Toolkit (URBaMT)",
description="URBaMT: Universal Rule-based Machine Translation Toolkit",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/urbamt/urbamt",
packages=setuptools.find_packages(exclude=['docs', 'tests']),
packages=setuptools.find_packages(exclude=['docs', 'tests', 'experiments']),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
27 changes: 6 additions & 21 deletions urbamt/translator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, List
from .utils.tree_manipulation import translate_tree_grammar
from .utils.tree_manipulation import translate_trees_grammar
from .utils.misc import remove_trailing_space
import nltk
from nltk.parse.chart import BottomUpLeftCornerChartParser as Parser
@@ -65,27 +65,12 @@ def translate(self, sentences: List[str] or str, allow_multiple_translation = Fa
for sentence in sentences:
sentence = self.__process_text_input(sentence)
trees = self.parser.parse(sentence.split())
list_trees = [tree for tree in trees]

trans_sentence = translate_trees_grammar(list_trees, self.src_to_tgt_grammar, self.src_to_tgt_dictionary)

# Flag to check if there are trees in generator (grammar matched)
translated = False

for t in trees:
translated = True

# Translate grammar
trans_gram_sentence = translate_tree_grammar(t,self.src_to_tgt_grammar)

# Translate words
trans_lang_sentence = ' '.join([self.src_to_tgt_dictionary.get(word,word) for word in trans_gram_sentence.split()])

translated_sentences.append(trans_lang_sentence)

# Get 1 sentence only, will support multi sentence
break

if translated == False:
failed_sentences.append(sentence)

translated_sentences.append(trans_sentence)

# String to display failed sentence
failed_sentences = '\n'.join(failed_sentences)

89 changes: 80 additions & 9 deletions urbamt/utils/tree_manipulation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import nltk
from nltk import ParentedTree as PTree
from typing import List
import random

def tree_to_ptree(tree: nltk.Tree):
tree_str = tree.__str__()
@@ -13,26 +15,37 @@ def get_grammar(tree: nltk.Tree):
grammar += f" {sub.label}"


def swap_tree_given_left(left_tree: nltk.Tree, displacement: List[int], new_words: List[str] = None):
    """
    Reorder the siblings of `left_tree` inside their shared parent node.

    Fix: the signature previously read `new_words= List[str]`, which made the
    typing class `List[str]` the *default value* rather than an annotation;
    calling without `new_words` would then crash on `.pop(0)`. The parameter
    is now properly annotated with a safe `None` default.

    :param left_tree: left-most child whose parent's children get reordered
    :param displacement: target order as indices into the original children;
        -1 marks a brand-new word to insert at that position
    :param new_words: words consumed in order, one per -1 in `displacement`
    :return: the parent tree after reordering
    """
    if new_words is None:
        new_words = []

    nodes = [left_tree]
    right_tree = left_tree.right_sibling()
    parent_tree = left_tree.parent()

    # Collect pointers to every sibling that takes part in the reordering
    for disp in displacement:
        # disp = -1 indicates that is a new word, skip
        if disp == -1:
            continue
        nodes.append(right_tree)

        right_tree = right_tree.right_sibling()
        if right_tree is None:
            # NOTE(review): if siblings run out before displacement is
            # exhausted, a later nodes[disp] below may raise IndexError —
            # assumes well-formed displacement, TODO confirm with callers
            break

    # Remove all siblings and left-most self
    for node in nodes:
        parent_tree.remove(node)

    # Append children back in the displaced order
    for disp in displacement:
        # disp = -1 indicates that is a new word
        if disp == -1:
            new_word = PTree('NEW', [new_words.pop(0)])
            parent_tree.append(new_word)
        else:
            parent_tree.append(nodes[disp])

    return parent_tree

@@ -60,24 +73,82 @@ def build_grammar_str_from_left_most(tree: nltk.Tree):


def translate_tree_grammar(tree: nltk.Tree, grammar_substitutions: dict):
    """
    Apply grammar-level substitutions to a parse tree and linearize it.

    :param tree: source-language parse tree
    :param grammar_substitutions: mapping of source grammar-rule strings
        (e.g. "NP -> JJ NP") to target grammar-rule strings
    :return: tuple of (space-joined leaves after reordering, number of
        substitutions performed)
    """
    substitution_count = 0
    # Convert to a ParentedTree so children can be detached and re-attached
    ptree = tree_to_ptree(tree)

    # Walk every subtree; its rule string is built from the left-most child
    for subtree in ptree.subtrees():
        rule = build_grammar_str_from_left_most(subtree)
        if rule not in grammar_substitutions:
            continue
        substitution_count += 1

        # Work out where each target symbol comes from in the source rule
        shifts, added_words = calculate_displacement(rule, grammar_substitutions[rule])

        # Physically reorder the subtree's children (inserting new words)
        swap_tree_given_left(subtree, shifts, added_words)

    return " ".join(ptree.leaves()), substitution_count
def translate_sentence_words(sentence, src_to_tgt_dictionary):
    """
    Translate a sentence word-by-word using a source-to-target dictionary.

    Words missing from the dictionary pass through unchanged. When a
    dictionary entry is a list of candidate translations, one candidate is
    picked at random.

    :param sentence: whitespace-separated source sentence
    :param src_to_tgt_dictionary: word -> translation (str or list of str)
    :return: space-joined translated sentence
    """
    def pick(word):
        candidate = src_to_tgt_dictionary.get(word, word)
        return random.choice(candidate) if isinstance(candidate, list) else candidate

    return ' '.join(pick(word) for word in sentence.split())

def translate_trees_grammar(list_trees: List[nltk.Tree], src_to_tgt_grammar, src_to_tgt_dictionary):
    """
    Translate every candidate parse tree and return the best translation.

    "Best" is the candidate whose grammar translation performed the most
    rule substitutions, i.e. matched the grammar mapping most often.

    Fix: removed the dead local flag `translated`, which was assigned but
    never read (the failed-sentence bookkeeping lives in the caller now).

    :param list_trees: candidate parse trees for one sentence
    :param src_to_tgt_grammar: source -> target grammar-rule string mapping
    :param src_to_tgt_dictionary: word-level source -> target dictionary
    :return: the translated sentence with the highest substitution count
    :raises ValueError: if `list_trees` is empty (max() over an empty map)
    """
    # translated sentence mapped to its number of grammar substitutions
    trans_map = {}

    for tree in list_trees:
        # Translate grammar (reorder tree nodes per the rule mapping)
        trans_gram_sentence, num_subs = translate_tree_grammar(tree, src_to_tgt_grammar)

        # Translate words
        trans_lang_sentence = translate_sentence_words(trans_gram_sentence, src_to_tgt_dictionary)

        # Later duplicates overwrite earlier ones, keeping the last count
        trans_map[trans_lang_sentence] = num_subs

    # Return the translation that used the most substitutions
    return max(trans_map, key=trans_map.get)

def calculate_displacement(src_grammar, tgt_grammar):
    """
    Map each right-hand-side symbol of the target rule to its position in
    the source rule's right-hand side.

    Fix: the previous `src_grammar_lst.index(word)` always returned the
    *first* occurrence, so a repeated symbol (e.g. "S -> A A") mapped every
    copy to index 0, duplicating one child and dropping the rest during the
    swap. Matching is now occurrence-aware: each source position is used at
    most once.

    :param src_grammar: rule string such as "S -> A B C"
    :param tgt_grammar: rule string such as "S -> C A B X"
    :return: tuple (displacement, new_words) where displacement holds the
        source index of each target symbol (-1 for a symbol absent from the
        source rule) and new_words lists those absent symbols in order
    """
    src_symbols = src_grammar.split()
    tgt_symbols = tgt_grammar.split()

    # Keep only the right-hand side of each rule
    src_symbols = src_symbols[src_symbols.index("->") + 1:]
    tgt_symbols = tgt_symbols[tgt_symbols.index("->") + 1:]

    displacement = []
    new_words = []
    used = set()  # source positions already claimed by a target symbol

    for symbol in tgt_symbols:
        # First unclaimed source position holding this symbol, else -1
        idx = next((i for i, s in enumerate(src_symbols)
                    if s == symbol and i not in used), -1)
        if idx == -1:
            # Symbol absent from the source rule: it is a new word
            displacement.append(-1)
            new_words.append(symbol)
        else:
            used.add(idx)
            displacement.append(idx)

    return displacement, new_words