-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizing_bpe_apply.sh
36 lines (24 loc) · 1.49 KB
/
tokenizing_bpe_apply.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env bash
#
# Apply an already-trained preprocessing pipeline to one split of a
# de/en parallel corpus:
#   0. punctuation normalization (Moses normalize-punctuation.perl)
#   1. tokenization              (Moses tokenizer.perl)
#   2. truecasing                (Moses truecase.perl, existing model)
#   3. BPE segmentation          (subword-nmt apply_bpe.py, existing codes)
#
# Usage:
#   bash tokenizing_bpe_apply.sh <domain> <split>
# Example:
#   bash tokenizing_bpe_apply.sh acquis valid
#
# Reads:
#   original_dataset/<domain>-<split>.{de,en}
#   preprocess_dataset/<domain>-truecase-model.{de,en}
#   preprocess_dataset/<domain>-deen.bpe
# Writes (all under preprocess_dataset/):
#   <domain>-<split>.{de,en}        normalized text
#   <domain>-<split>.tok.{de,en}    tokenized text
#   <domain>-<split>.tc.{de,en}     truecased text
#   <domain>-<split>.bpe.{de,en}    BPE-segmented text
#
# Paths are relative, so run this from the directory that contains
# mosesdecoder/, subword-nmt/, original_dataset/ and preprocess_dataset/.

# Abort on the first failing stage so later stages never run on missing or
# stale intermediate files.
set -euo pipefail

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
  printf 'Usage: %s <domain> <split>\n' "${0##*/}" >&2
  exit 2
fi

moses_scripts=./mosesdecoder/scripts # file path
bpe_scripts=./subword-nmt            # file path
domain=$1
# Not referenced in this script; kept from the original (alternative: 32000).
# shellcheck disable=SC2034
bpe_operations=20000
S=de
T=en
data_dir=original_dataset
dest_dir=preprocess_dataset
split=$2 # valid, test
tokenizer_threads=50

# Shared path pieces for this domain/split.
out_prefix="${dest_dir}/${domain}-${split}"
bpe_codes="${dest_dir}/${domain}-${S}${T}.bpe"

# Pre-flight: fail with a clear message before any output file is created.
required_files=(
  "${moses_scripts}/tokenizer/normalize-punctuation.perl"
  "${moses_scripts}/tokenizer/tokenizer.perl"
  "${moses_scripts}/recaser/truecase.perl"
  "${bpe_scripts}/apply_bpe.py"
  "${bpe_codes}"
)
for lang in "$S" "$T"; do
  required_files+=(
    "${data_dir}/${domain}-${split}.${lang}"
    "${dest_dir}/${domain}-truecase-model.${lang}"
  )
done
for f in "${required_files[@]}"; do
  [[ -r "$f" ]] || die "required file not found or not readable: $f"
done

# The two languages are processed independently; each stage consumes the
# previous stage's output for the same language.
for lang in "$S" "$T"; do
  ## 0. Remove unnecessary punctuation
  # NOTE(review): no -l flag is passed here (same as the original), so the
  # tool's default language is used for both sides — confirm this matches how
  # the training split was normalized before changing it.
  perl "${moses_scripts}/tokenizer/normalize-punctuation.perl" \
    < "${data_dir}/${domain}-${split}.${lang}" \
    > "${out_prefix}.${lang}"

  ### 1. Tokenizing ###
  perl "${moses_scripts}/tokenizer/tokenizer.perl" \
    -threads "${tokenizer_threads}" -l "${lang}" \
    < "${out_prefix}.${lang}" \
    > "${out_prefix}.tok.${lang}"

  ### 2. Truecaser (apply existing model) ###
  perl "${moses_scripts}/recaser/truecase.perl" \
    -model "${dest_dir}/${domain}-truecase-model.${lang}" \
    < "${out_prefix}.tok.${lang}" \
    > "${out_prefix}.tc.${lang}"

  ### 3. Apply BPE (existing joint codes file) ###
  python3 "${bpe_scripts}/apply_bpe.py" -c "${bpe_codes}" \
    < "${out_prefix}.tc.${lang}" \
    > "${out_prefix}.bpe.${lang}"
done