# wiki_103.yaml
seed: 42
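# source and target share one vocabulary: for a language model both sides are the
# same text stream, so a single 60k-entry vocabulary is built and reused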
share_vocab: true
# save_data: data/wikitext-103-raw-v1/run/example
src_vocab_size: 60000
tgt_vocab_size: 60000
# Prevent/allow overwriting existing files in the folder
overwrite: True
# Vocabulary file(s) created/used
src_vocab: data/wikitext-103-raw-v1/run/example.vocab.src
# tensorboard/logging stuff
tensorboard: true
tensorboard_log_dir: data/wikitext-103-raw-v1/run/tensorboard
report_every: 100
# transforms related stuff
transforms: [onmt_tokenize, filtertoolong]
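# onmt_tokenize applies the BPE subword model configured below (via pyonmttok);
# filtertoolong drops examples longer than the configured sequence lengths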
transforms_configs:
  onmt_tokenize:
    src_subword_type: bpe
    src_subword_model: data/wikitext-103-raw-v1/subwords.bpe
    src_onmttok_kwargs: {"mode": "aggressive", "joiner_annotate": True, "preserve_placeholders": True, "case_markup": True, "soft_case_regions": True, "preserve_segmented_tokens": True}
  filtertoolong:
    src_seq_length: 512
    tgt_seq_length: 512
# Corpus opts:
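# a decoder-only LM only needs the source side of each corpus;
# the target stream is taken from the same text during training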
data:
  corpus_1:
    path_src: data/wikitext-103-raw-v1/train.txt
  valid:
    path_src: data/wikitext-103-raw-v1/validation.txt
training:
  # Train on a single GPU
  world_size: 1
  gpu_ranks: [0]
  # Train on 2 GPUs
  # world_size: 2
  # gpu_ranks: [0, 1]
  # parallel_mode: tensor_parallel
  dropout_steps: [0]
  dropout: [0.1]
  attention_dropout: [0.1]
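  # with batch_type: tokens, batch_size counts tokens per batch rather than sentences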
  batch_size: 2048
  batch_type: tokens
  compute_dtype: "fp32"
  optim: "adam"
  # compute_dtype: "fp16"
  # optim: "fusedadam"
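  # noam decay scales the LR by hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5),
  # so learning_rate: 2 acts as a multiplier on that schedule, not a raw learning rate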
  learning_rate: 2
  warmup_steps: 8000
  decay_method: "noam"
  adam_beta2: 0.998
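  # 0 disables gradient-norm clipping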
  max_grad_norm: 0
  # label_smoothing: 0.1  # disabled: it distorts the reported loss/ppl values
  param_init_method: xavier_uniform
  normalization: "tokens"
  # Where to save the checkpoints
  model_path: data/wikitext-103-raw-v1/run/model-lm
  save_checkpoint_steps: 5000
  train_steps: 100000
  valid_steps: 5000
model:
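  # transformer_lm is a decoder-only Transformer trained with next-token prediction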
  # Model
  architecture: "transformer_lm"
  layers: 6
  heads: 8
  hidden_size: 512
  transformer_ff: 2048
  embeddings:
    word_vec_size: 512
    position_encoding: true
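
# Typical usage sketch (an assumption: it presumes the eole CLI and that the BPE model
# and data files referenced above already exist; adjust command names to your install):
#   eole build_vocab -config wiki_103.yaml -n_sample -1
#   eole train -config wiki_103.yaml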