WiP trainer implementation #30

Merged 82 commits on Dec 12, 2022

Commits
6d358da
WiP trainer implementation
XapaJIaMnu Sep 8, 2022
b05912b
WiP trainer implementation
XapaJIaMnu Sep 8, 2022
b39051c
More progress
XapaJIaMnu Sep 9, 2022
efed530
merge with master
XapaJIaMnu Sep 9, 2022
b3a634d
wip commit
XapaJIaMnu Sep 9, 2022
d8b2795
Merge branch 'main' into trainer
XapaJIaMnu Sep 15, 2022
a5dafb3
first trainer working implementation, but slow
XapaJIaMnu Sep 15, 2022
e73102d
Fixes, seems to work now
XapaJIaMnu Sep 15, 2022
7f4fe3c
Enumerate input lines
XapaJIaMnu Oct 10, 2022
e599f55
Seeded random and lowercase tests
XapaJIaMnu Oct 10, 2022
53eec7d
Uppercase ratio and shuffling, deterministically
XapaJIaMnu Oct 10, 2022
70e60e8
Clarification for inf
XapaJIaMnu Oct 10, 2022
9efc129
Recommended best practises
XapaJIaMnu Oct 13, 2022
48f357e
Add a state tracker and clean up the random seed
XapaJIaMnu Oct 13, 2022
264d2ef
State tracker, not restoring state yet
XapaJIaMnu Oct 13, 2022
583f718
some cleanup
XapaJIaMnu Oct 14, 2022
f0eee6a
More towards stage restoration
XapaJIaMnu Oct 14, 2022
2987799
Restore_state should be working now
XapaJIaMnu Nov 7, 2022
ea239c3
Let's get this reviewed
XapaJIaMnu Nov 8, 2022
c5d3fe4
Proper do not resume flag
XapaJIaMnu Nov 9, 2022
38d5e3f
Use subprocess and not pexpect
XapaJIaMnu Nov 9, 2022
418258e
Merge branch 'main' into trainer
XapaJIaMnu Nov 9, 2022
1468c27
Prepare placeholders for use in trainer
XapaJIaMnu Nov 9, 2022
2e9dfac
Add finally for closing the trainer
XapaJIaMnu Nov 10, 2022
8281718
Apply tittlecasing
XapaJIaMnu Nov 10, 2022
7398c43
Allow dumping placeholders without explicit vocabulary setting
XapaJIaMnu Nov 11, 2022
101f8f0
Add a readme
XapaJIaMnu Nov 11, 2022
d28220b
Strict mode for placeholders
XapaJIaMnu Nov 11, 2022
b04743c
fix a typo
XapaJIaMnu Nov 11, 2022
702a8d2
Try to send sigterm
XapaJIaMnu Nov 11, 2022
f6abe3e
typos
XapaJIaMnu Nov 11, 2022
fb3aa02
Add an extra example
XapaJIaMnu Nov 11, 2022
0b65a03
Merge branch 'trainer' of github.com:jelmervdl/empty-train into trainer
XapaJIaMnu Nov 11, 2022
a7b6748
document placeholder augmentation
XapaJIaMnu Nov 11, 2022
8728fd9
Cleanly stop marian when keyboard interruption
ZJaume Nov 11, 2022
90baed8
Simpler solution
ZJaume Nov 11, 2022
bc0b107
Merge pull request #49 from jelmervdl/marian_signal_handling
XapaJIaMnu Nov 11, 2022
cdda47b
More details
XapaJIaMnu Nov 11, 2022
cb81e26
Reorganise trainer to be more explicit about state
jelmervdl Nov 21, 2022
4829efd
Fix bugs
jelmervdl Nov 28, 2022
e2f26de
V1 and V2 curriculum yaml structures
jelmervdl Nov 28, 2022
8b39c41
Share readers across all stages
jelmervdl Nov 28, 2022
e2983b7
Don't dump state after every batch
jelmervdl Nov 28, 2022
5d21c12
Add documentation & safety checks
jelmervdl Nov 28, 2022
60c92be
Add async shuffling
jelmervdl Nov 28, 2022
a010e36
Discard pre-shuffled data when restoring
jelmervdl Nov 28, 2022
0cb7478
Properly clean up resources if close() is used
jelmervdl Nov 28, 2022
79b1769
Start adding unit tests
jelmervdl Nov 28, 2022
1dd42b0
Move trainer Popen call out of trainer, add unit test for state resum…
jelmervdl Nov 28, 2022
ab288e0
Naming things
jelmervdl Nov 28, 2022
983c22e
Replace default shuffler with one that does file-based shuffling
jelmervdl Nov 29, 2022
45dcdb9
Remove random.sh! Why? Click here to find out!
jelmervdl Dec 2, 2022
7af737c
You will not believe what happened to shuffle.py!
jelmervdl Dec 2, 2022
420d437
Add deduplication functionality to shuffle.py
jelmervdl Dec 4, 2022
2beac2f
Fix modifiers messing up newlines
jelmervdl Dec 4, 2022
33f9257
Deduplicate trainer input, and add flag to flip src/trg sentence side
jelmervdl Dec 4, 2022
8e6d561
Make file path relative to configuration file
jelmervdl Dec 4, 2022
cd4cbbc
Allow actual training program to be specified on the cli.
jelmervdl Dec 5, 2022
863ca73
`split` -> `split_into_tempfiles`
jelmervdl Dec 5, 2022
5bb3fb7
Add explicit option for setting the temporary directory
jelmervdl Dec 5, 2022
701f7fd
Add comment as to why `_read_gzip` is implemented that way
jelmervdl Dec 5, 2022
f048fd4
Remove oops
jelmervdl Dec 5, 2022
e820a7d
Document DatasetReader some more
jelmervdl Dec 5, 2022
d8c1cc7
Rename `dataset_name` to `until_dataset_name` for that last dataset
jelmervdl Dec 5, 2022
d1a03df
Formalise & document disabling state loading
jelmervdl Dec 5, 2022
1b09891
Rework threading in shuffler
jelmervdl Dec 6, 2022
120ce04
Fix bugs
jelmervdl Dec 6, 2022
ca3a8e1
Add quick sanity check as to whether all dataset files exist
jelmervdl Dec 6, 2022
22f3534
`exist` does not exist but `exists` does
jelmervdl Dec 6, 2022
812723a
Fix missing import
jelmervdl Dec 6, 2022
fddb7fb
Closer control the marian subprocess
jelmervdl Dec 7, 2022
5dd932c
Make the trainer a bit more verbose, telling us what it is doing
jelmervdl Dec 7, 2022
c5e1726
We're sending 15, not 9
jelmervdl Dec 7, 2022
bc35c90
Add `f` prefixes I missed
jelmervdl Dec 7, 2022
4a82c82
Remove flip option
jelmervdl Dec 8, 2022
590f649
Remove deduplication from shuffle.py
jelmervdl Dec 8, 2022
73ccd0b
Remove `bufsize=0` as it is not clear whether this is an improvement
jelmervdl Dec 8, 2022
46db743
Add third ctrl-c for force-quit trainer
jelmervdl Dec 8, 2022
51f7212
Change configuration format
jelmervdl Dec 8, 2022
bf4df4c
Update & document tests
jelmervdl Dec 8, 2022
e0c2326
Update README.md
jelmervdl Dec 12, 2022
9533d52
Merge pull request #52 from jelmervdl/trainer-clean-up
XapaJIaMnu Dec 12, 2022
1 change: 1 addition & 0 deletions .gitignore
@@ -28,6 +28,7 @@ coverage
*.sw?

/data
/trainer/TMP
*__pycache__*
.env
placeholders/mappings.yml
49 changes: 33 additions & 16 deletions placeholders/placeholders.py
@@ -15,6 +15,8 @@
parser.add_argument('-c', '--config', type=str, help='Path to yaml configuration file, required for encoding')
parser.add_argument('-m', '--mappings_file', type=str, default="mappings.yml", help='Path to the mappings, one yaml entry per line.')
parser.add_argument('-s', '--seed', type=int, default=None, help='Seed for random number generator.')
parser.add_argument('-n', '--no-mapping', action="store_true", help='Do not dump a mapping file for decoding. Useful for training')
parser.add_argument('-t', '--strict', action="store_true", help="Only generate a placeholder if there's equal number on the source and target side of each (assumes TSV input).")
mutex_group_1 = parser.add_mutually_exclusive_group(required=True)
mutex_group_1.add_argument('--decode', action='store_true')
mutex_group_1.add_argument('--encode', action='store_true')
@@ -28,15 +30,15 @@ class Rule:
@dataclass
class Configuration:
"""Object holding the yaml config"""
def __init__(self, config_file):
def __init__(self, config_file, dump_placeholders: bool):
with open(config_file, 'r') as config_handle:
my_config = yaml.safe_load(config_handle)

# Parse
self.rules = [Rule(regex) for regex in my_config['regexes']]
self.placeholder_symbol = my_config.get('placeholder-symbol', '@')
self.num_placeholders = my_config.get('num-placeholders', 20)
self.placeholders = [self.placeholder_symbol + str(i) for i in range(self.num_placeholders)]
self.placeholders = [self.placeholder_symbol[:-1] + str(i) + self.placeholder_symbol[-1] for i in range(self.num_placeholders)]

# Add a rule that escapes patterns that look like a placeholder already
# TODO: this will match placeholders we can't reach because `num_placeholders` might be smaller
@@ -47,7 +49,7 @@ def __init__(self, config_file):
self.rules.append(Rule(pattern=re.escape(self.placeholder_symbol) + r'\d+'))

# During encoding assert that we have vocab
if 'vocab' in my_config:
if not dump_placeholders and 'vocab' in my_config:
vocab = my_config['vocab']
self.sp = SentencePieceProcessor(vocab)

@@ -64,12 +66,13 @@

class Encoder:
'''Encodes spm strings'''
def __init__(self, placeholders: List[str], spm_vocab: SentencePieceProcessor, rules: List[Rule], *, random: Random = Random()):
def __init__(self, placeholders: List[str], spm_vocab: SentencePieceProcessor, rules: List[Rule], strict: bool, *, random: Random = Random()):
self.placeholders = placeholders
self.sp = spm_vocab
self.rules = rules
self.unk_id = self.sp.unk_id()
self.random = random
self.strict = strict # Use strict mode, only making replacements when the same amount of tokens are on the source and the target side

# Compile rules into one mega-pattern
self.rule_pattern = re.compile('|'.join('(?:{})'.format(rule.pattern) for rule in self.rules))
@@ -112,23 +115,37 @@ def replace_one(token) -> str:
inputline += token
inputline += '\n'

# Check if strict rules apply
if self.strict:
src, trg = inputline.split('\t')
for mytoken, myreplacement in replacements.items():
if src.count(myreplacement) != trg.count(myreplacement):
# We have a mismatch placeholder on source and target
inputline = inputline.replace(myreplacement, mytoken)

return (inputline, dict((v, k) for k, v in replacements.items()))


def encode(my_placeholders: List[str], my_sp: SentencePieceProcessor, my_rules: List[Rule], *, random: Random) -> None:
def encode(my_placeholders: List[str], my_sp: SentencePieceProcessor, my_rules: List[Rule], strict: bool, *, random: Random, no_mapping: bool) -> None:
'''Encodes everything form stdin, dumping it to stdout and dumping a file with
all replacements'''
encoder = Encoder(my_placeholders, my_sp, my_rules, random=random)
with open(args.mappings_file, 'w') as yamlout:
for counter, line in enumerate(sys.stdin):

encoded_line, mappings = encoder.make_placeholders(line)
encoder = Encoder(my_placeholders, my_sp, my_rules, strict, random=random)
if no_mapping: # Do not produce any mappings as we are going to just use it during training
for line in sys.stdin:
encoded_line, _ = encoder.make_placeholders(line)
sys.stdout.write(encoded_line) # Write the encoded line to stdout
else:
with open(args.mappings_file, 'w') as yamlout:
for counter, line in enumerate(sys.stdin):

encoded_line, mappings = encoder.make_placeholders(line)
sys.stdout.write(encoded_line) # Write the encoded line to stdout

# Keep track of which sentence has what replacement mappings via a yaml config
sent_mapping = {counter: mappings}
yaml.dump(sent_mapping, yamlout, allow_unicode=True)
yamlout.flush()

# Keep track of which sentence has what replacement mappings via a yaml config
sent_mapping = {counter: mappings}
yaml.dump(sent_mapping, yamlout, allow_unicode=True)
yamlout.flush()


def decode() -> None:
@@ -154,13 +171,13 @@ def decode() -> None:
random = Random(args.seed)

if args.encode or args.dump_placeholders:
config = Configuration(args.config)
config = Configuration(args.config, args.dump_placeholders)

if args.dump_placeholders:
print(" ".join(config.placeholders))
sys.exit(0)
elif args.encode:
encode(config.placeholders, config.sp, config.rules, random=random)
encode(config.placeholders, config.sp, config.rules, args.strict, random=random, no_mapping=args.no_mapping)
else:
decode()

2 changes: 2 additions & 0 deletions placeholders/static/test_encode_input_strict
@@ -0,0 +1,2 @@
This should be a placeholder 😀. It will appear in the diff. This should be a placeholder 😀. It will appear in the diff.
This should get ignored 😀 Since no emoji on the target side
11 changes: 11 additions & 0 deletions placeholders/test.sh
@@ -3,3 +3,14 @@ set -euo pipefail
./placeholders.py --seed 1 -c static/config.yml -m mappings.yml --encode < static/test_encode_input > /tmp/test
./placeholders.py -m mappings.yml < /tmp/test --decode > /tmp/decoded
diff static/test_encode_input /tmp/decoded

# Test strict mode
./placeholders.py -c static/config.yml --encode --no-mapping --strict < static/test_encode_input_strict > /tmp/strict_diff
# Split lines and test First line should be the same
head -n1 static/test_encode_input_strict > /tmp/original_1
head -n1 /tmp/strict_diff > /tmp/strict_diff_1
diff /tmp/original_1 /tmp/strict_diff_1
# Second line should be different
tail -n1 static/test_encode_input_strict > /tmp/original_2
tail -n1 /tmp/strict_diff > /tmp/strict_diff_2
diff /tmp/original_2 /tmp/strict_diff_2
106 changes: 106 additions & 0 deletions trainer/README.md
@@ -0,0 +1,106 @@
# Trainer
The purpose of the trainer is to provide the user with a flexible way of scheduling various sources of input data, as well as to augment the training data with title casing, all caps, etc. This is particularly useful when you have multiple data sources and you want to pretrain the model on backtranslated data first, gradually add other sources of data, and finally fine-tune, all in one go.

This tool is also well suited to training multilingual models, as it provides an easy way to define the desired mixture of datasets from different language sources.

## Configuration file
Define your training process via a configuration file. You list the datasets at the top, then the stages, and for each stage a mixing ratio and a termination criterion. An example configuration file is provided below. The `trainer` path points to any neural network trainer that accepts training data on stdin.
```yml
# Datasets are already TSV files
datasets:
clean: test/data/clean
medium: test/data/medium
dirty: test/data/dirty

stages:
- start
- mid
- end

start:
- clean 0.8
- medium 0.2
- dirty 0
- until clean 2 # Until two epochs of clean

mid:
- clean 0.6
- medium 0.3
- dirty 0.1
- until medium 1

end:
- clean 0.4
- medium 0.3
- dirty 0.3
- until dirty 5 # use `inf` to mean until forever

modifiers:
- uppercase 0.05 # Uppercase 5% of sentences at random (probability 0.05). Use 0 to disable
- titlecase 0.05 # Titlecase 5% of sentences at random (probability 0.05). Use 0 to disable

seed: 1111
trainer: /path/to/trainer/run.py
```

## Usage
```bash
% ./trainer.py --help
usage: trainer.py [-h] --config CONFIG [--temporary-directory TEMPORARY_DIR] [--state STATE_FILE] [--do-not-resume] [--sync] [trainer-command [arguments]]

Feeds marian tsv data for training.

options:
-h, --help show this help message and exit
--config CONFIG, -c CONFIG
YML configuration input.
--temporary-directory TEMPORARY_DIR, -t TEMPORARY_DIR
Temporary dir, used for shuffling and tracking state
--state STATE_FILE Path to trainer state file which stores how much of
each dataset has been read. Defaults to ${CONFIG}.state
--sync Do not shuffle in the background
--do-not-resume, -d Do not resume from the previous training state
```
Once you fix the paths in the configuration file `train_config.yml`, you can run a test case:
```bash
./trainer.py -c train_config.yml
```
You can check the resulting mixed file in `/tmp/test`. If your neural network trainer doesn't support training from `stdin`, you can use this tool to generate a training dataset up front and then disable data reordering or shuffling in your trainer implementation, as the training input will already be mixed and shuffled.
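One possible sketch of that workflow (assuming the downstream command simply inherits stdout, so a plain `cat` can stand in for a real trainer) is:
```bash
# Sketch, not the canonical workflow: use `cat` as the "trainer" command so the
# mixed and augmented stream ends up in a file instead of a neural network trainer.
./trainer.py -c train_config.yml cat > mixed_training_data.tsv
```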

At the start of training, all datasets are shuffled. Each time a dataset's end is reached, it is re-shuffled. Shuffling happens [in the system temp directory](https://docs.python.org/3.11/library/tempfile.html#tempfile.gettempdir) but can be redirected using `--temporary-directory` or the `TMPDIR` environment variable. By default, the training state is kept next to the configuration file. If training is interrupted, re-running the trainer should resume from where it left off (any lines your neural network trainer had buffered but not yet processed will be skipped).
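For example, using the flags documented above (paths here are illustrative):
```bash
# Keep shuffle buffers and temporary files on a large scratch disk
./trainer.py -c train_config.yml --temporary-directory /scratch/shuffle

# Ignore any saved state and start again from the beginning of every dataset
./trainer.py -c train_config.yml --do-not-resume
```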

## Generating vocabulary and placeholders before training
To augment your training data with placeholders before training, look at this example script:
```bash
#!/usr/bin/env bash
# Get the placeholders
../placeholders/placeholders.py -c train_config_bgen.yml --dump_placeholders > my_placeholders
# train vocabulary
spm_train --bos_id=-1 --eos_id=0 --unk_id=1 --user_defined_symbols_file my_placeholders \
--model_prefix="test/vocab.bgen" --vocab_size=12000 \
--input="/home/dheart/uni_stuff/postdoc/empty-train/trainer/test/data/clean.bgen" \
--shuffle_input_sentence=true --character_coverage 1

# Move vocabulary to the new location
mv test/vocab.bgen.model test/vocab.bgen.spm

# Encode placeholders into all datasets
for myfile in test/data/*.bgen; do
../placeholders/placeholders.py -n --strict --encode -c train_config_bgen.yml < ${myfile} > ${myfile}.pls
done
```
You need to augment the training configuration with additional placeholder configuration settings:
```yml
vocab: /home/dheart/uni_stuff/postdoc/empty-train/trainer/test/vocab.bgen.spm
placeholder-symbol: "<PLACEHOLDER>"
num-placeholders: 4
regexes:
- (https?:\/\/www\.\w{1,63}\.\w{1,63}(?:\/\w{0,63}){0,})
- (www\.\w{1,63}\.\w{1,63}(?:\/\w{0,63}){0,})
- ([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)
```
After the vocabulary is trained and the data is preprocessed, proceed with a normal training run.
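A hedged sketch of what that run might look like (the marian binary path and its configuration file are hypothetical placeholders, and the exact way arguments after the command are forwarded may differ):
```bash
# Sketch only: /path/to/marian and marian_training_config.yml are placeholders.
./trainer.py -c train_config_bgen.yml /path/to/marian --config marian_training_config.yml
```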
## Future work

- Terminology support (using a dictionary). We should augment the training data with terminology (possibly stemmed on the source side) so that we can use it in real-world models.
- A one-click training run