From 2b7923e09fdb2d32aba9a2fdd8814f2dcd80578e Mon Sep 17 00:00:00 2001 From: Achyudh Ram <7617287+achyudh@users.noreply.github.com> Date: Sat, 13 Apr 2019 23:25:30 -0400 Subject: [PATCH 01/22] Integrate BERT into Hedwig (#29) * Fix package imports * Update README.md * Fix bug due to TAR/AR attribute check * Add BERT models * Add BERT tokenizer * Return logits from the model.py * Remove unused classes in models/bert * Return logits from the model.py (#12) * Remove unused classes in models/bert (#13) * Add initial main file * Add args for BERT * Add partial support for BERT * Initialize training and optimization * Draft the structure of Trainers for BERT * Remove duplicate tokenizer * Add utils * Move optimization to utils * Add more structure for trainer * Refactor the trainer (#15) * Refactor the trainer * Add more edits * Add support for our datasets * Add evaluator * Split data4bert module into multiple processors * Refactor BERT tokenizer * Integrate BERT into Castor framework (#17) * Remove unused classes in models/bert * Split data4bert module into multiple processors * Refactor BERT tokenizer * Add multilabel support in BertTrainer * Add multilabel support in BertEvaluator * Add get_test_samples method in dataset processors * Fix args.py for BERT * Add support for Reuters, IMDB datasets for BERT * Revert "Integrate BERT into Castor framework (#17)" This reverts commit e4244ec73950d1efb15f706de6a4c77988c821ba. * Fix paths to datasets in dataset classes and args * Add SST dataset * Add hedwig-data instructions to README.md * Fix KimCNN README * Fix RegLSTM README * Fix typos in README * Remove trec_eval from README * Add tensorboardX to requirements.txt * Rename processors module to bert_processors * Add method to print metrics after training * Add model check-pointing and early stopping for BERT * Add logos * Update README.md * Fix code comments in classification trainer * Add support for AAPD, Sogou, AGNews and Yelp2014 * Fix bug that deleted saved models * Update README for HAN * Update README for XML-CNN * Remove redundant TODOs from the READMEs * Fix logo in README.md * Update README for Char-CNN * Fix all the READMEs * Resolve conflict * Fix Typos * Re-Add SST2 Processor * Add support for evaluating trained model * Update args.py * Resolve issues due to DataParallel wrapper on saved model * Remove redundant Yelp processor * Fix bug for safely creating the saving directory * Change checkpoint paths to timestamps * Remove unwanted string.strip() from tokenizer * Create save path if it doesn't exist * Decouple model checkpoints from code * Remove model choice restrictions for BERT * Remove model/distill driver * Simplify checkpoint directory creation --- README.md | 54 +- __init__.py | 4 +- common/evaluators/bert_evaluator.py | 80 ++ common/evaluators/classification_evaluator.py | 9 +- common/trainers/bert_trainer.py | 118 +++ common/trainers/classification_trainer.py | 23 +- datasets/aapd.py | 10 +- datasets/bert_processors/__init__.py | 0 datasets/bert_processors/aapd_processor.py | 33 + .../bert_processors/abstract_processor.py | 193 ++++ datasets/bert_processors/agnews_processor.py | 34 + datasets/bert_processors/imdb_processor.py | 34 + datasets/bert_processors/reuters_processor.py | 33 + datasets/bert_processors/sogou_processor.py | 34 + datasets/bert_processors/sst_processor.py | 39 + .../bert_processors/yelp2014_processor.py | 34 + datasets/imdb.py | 6 +- datasets/reuters.py | 17 +- datasets/sst.py | 92 ++ datasets/yelp2014.py | 8 +- docs/hedwig.png | Bin 0 -> 23564 bytes models/args.py | 2 +- models/bert/__init__.py | 0 models/bert/__main__.py | 169 ++++ models/bert/args.py | 43 + models/bert/model.py | 851 ++++++++++++++++++ models/char_cnn/README.md | 44 +- models/char_cnn/__main__.py | 5 + models/char_cnn/args.py | 4 +- models/han/README.md | 46 +- models/han/__main__.py | 7 +- models/han/args.py | 4 +- models/kim_cnn/README.md | 130 +-- models/kim_cnn/__main__.py | 5 + models/kim_cnn/args.py | 4 +- models/reg_lstm/README.md | 54 +- models/reg_lstm/__main__.py | 5 + models/reg_lstm/args.py | 4 +- models/xml_cnn/README.md | 32 +- models/xml_cnn/__main__.py | 11 +- models/xml_cnn/args.py | 5 +- requirements.txt | 1 + utils/io.py | 257 ++++++ utils/optimization.py | 179 ++++ utils/tokenization.py | 387 ++++++++ 45 files changed, 2808 insertions(+), 296 deletions(-) create mode 100644 common/evaluators/bert_evaluator.py create mode 100644 common/trainers/bert_trainer.py create mode 100644 datasets/bert_processors/__init__.py create mode 100644 datasets/bert_processors/aapd_processor.py create mode 100644 datasets/bert_processors/abstract_processor.py create mode 100644 datasets/bert_processors/agnews_processor.py create mode 100644 datasets/bert_processors/imdb_processor.py create mode 100644 datasets/bert_processors/reuters_processor.py create mode 100644 datasets/bert_processors/sogou_processor.py create mode 100644 datasets/bert_processors/sst_processor.py create mode 100644 datasets/bert_processors/yelp2014_processor.py create mode 100644 datasets/sst.py create mode 100644 docs/hedwig.png create mode 100644 models/bert/__init__.py create mode 100644 models/bert/__main__.py create mode 100644 models/bert/args.py create mode 100644 models/bert/model.py create mode 100644 utils/io.py create mode 100644 utils/optimization.py create mode 100644 utils/tokenization.py diff --git a/README.md b/README.md index e7048f6..3530ba6 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,21 @@ -# Hedwig +
+
+
@Hx=WA|!sr^XDV+lb
zjOHDDpZD{7-rxR!?c8(DJ@?c-w<2}4l*vgLNB{r;xvI)bT>yZP2>`%py+eqFaEw!a
z!~Uc9Rx #u`MqCb_|#nI~D
z%^Ao6bZbq*41f?4$((xv;h9`r;c1%pyi *(#gb()&(k
zEPqqzOXQE?{t?3uHejD&SloiFpIo2|&SXyZepE>JH-5hs#z~Kmi>DGPR~37$0kn;-
z5JESqtZ_;zcjR%)=)2N%X>JEI0p!R`9>avyyH7npX@%#f3~soO
zwZnt-gR>I(%}6!@)
L(e0``(c%$k)Sps)Z2tZ=yjcrkcy6)IU@vBN|56&e(-}c@SrOEkJ>}5aw>D3j
zt18ia74Lk~zyKpay*gvA(;!?G1#Z0l4%2?NLh*5FqL(6~)zfvQN@7O4fB10hjA1Jz
zOX`fr+-@1W8g@k#fkzESjzbR4KU?>E8ThU!ajjGXQKO#zCd6rR7%}y_2&oo|BoIYUc9~
z#C_{?-X7ZCNqUJm>ti$j!18jkkQv*ROT7`6e{2sG3=pTuDLKmB>+D3LCl+}6U_
z8xMyh;^c>Jvw$D%9H7lz(Q$%&AA@#5d%z3b$pD>00@{09tCEvVEoqE(s2AppVn=5|
zj=MdPxiUhpMsUs=7>aR%lQWDKOB(w5>(19t