Skip to content

Commit

Permalink
update leaderboard across 8 tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
mkshing committed Aug 10, 2023
1 parent b7b2ede commit d4bdbfa
Show file tree
Hide file tree
Showing 50 changed files with 671 additions and 18 deletions.
36 changes: 18 additions & 18 deletions README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions models/llama2/llama2-13b-chat/harness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Runs main.py with the hf-causal-experimental model type against
# meta-llama/Llama-2-13b-chat-hf on the tasks listed in TASK.
# main.py and all output paths are relative, so run this from the directory
# that contains main.py.
set -eu

MODEL_ARGS="pretrained=meta-llama/Llama-2-13b-chat-hf,use_accelerate=True"
# Comma-separated task names. --num_fewshot below carries one value per task
# (presumably matched by position -- confirm against main.py).
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
# Directory that receives result.json plus the captured stdout/stderr.
OUT_DIR="models/llama2/llama2-13b-chat"

# Expansions are quoted so each value reaches main.py as exactly one argument.
python main.py \
  --model hf-causal-experimental \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASK" \
  --num_fewshot "2,3,3,3,1,1,0,5" \
  --device "cuda" \
  --output_path "${OUT_DIR}/result.json" \
  --batch_size 2 \
  > "${OUT_DIR}/harness.out" 2> "${OUT_DIR}/harness.err"

71 changes: 71 additions & 0 deletions models/llama2/llama2-13b-chat/result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"results": {
"jsquad-1.1-0.3": {
"exact_match": 67.69473210265646,
"f1": 82.68867939081463
},
"jcommonsenseqa-1.1-0.3": {
"acc": 0.7256478999106345,
"acc_stderr": 0.01334431281465833,
"acc_norm": 0.3967828418230563,
"acc_norm_stderr": 0.01463161897855815
},
"jnli-1.1-0.3": {
"acc": 0.3562037797863599,
"acc_stderr": 0.009708506341194316,
"acc_norm": 0.3648315529991783,
"acc_norm_stderr": 0.00975932091977734
},
"marc_ja-1.1-0.3": {
"acc": 0.5992217898832685,
"acc_stderr": 0.006517879943818406,
"acc_norm": 0.5992217898832685,
"acc_norm_stderr": 0.006517879943818406
},
"jaqket_v2-0.1-0.3": {
"exact_match": 48.1958762886598,
"f1": 63.75233331776556
},
"xlsum_ja-1.0-0.3": {
"rouge2": 15.14282905950018
},
"mgsm-1.0-0.3": {
"acc": 0.132,
"acc_stderr": 0.021450980824038107
},
"xwinograd_ja": {
"acc": 0.6381647549530761,
"acc_stderr": 0.015525267319875928
}
},
"versions": {
"jsquad-1.1-0.3": 1.1,
"jcommonsenseqa-1.1-0.3": 1.1,
"jnli-1.1-0.3": 1.1,
"marc_ja-1.1-0.3": 1.1,
"jaqket_v2-0.1-0.3": 0.1,
"xlsum_ja-1.0-0.3": 1.0,
"xwinograd_ja": 1.0,
"mgsm-1.0-0.3": 1.0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=meta-llama/Llama-2-13b-chat-hf,use_accelerate=True",
"num_fewshot": [
2,
3,
3,
3,
1,
1,
0,
5
],
"batch_size": 2,
"device": "cuda",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
4 changes: 4 additions & 0 deletions models/llama2/llama2-13b/harness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Runs main.py with the hf-causal-experimental model type against
# meta-llama/Llama-2-13b-hf on the tasks listed in TASK.
# main.py and all output paths are relative, so run this from the directory
# that contains main.py.
set -eu

MODEL_ARGS="pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True"
# Comma-separated task names. --num_fewshot below carries one value per task
# (presumably matched by position -- confirm against main.py).
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
# Directory that receives result.json plus the captured stdout/stderr.
OUT_DIR="models/llama2/llama2-13b"

# Expansions are quoted so each value reaches main.py as exactly one argument.
python main.py \
  --model hf-causal-experimental \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASK" \
  --num_fewshot "2,3,3,3,1,1,0,5" \
  --device "cuda" \
  --output_path "${OUT_DIR}/result.json" \
  --batch_size 2 \
  > "${OUT_DIR}/harness.out" 2> "${OUT_DIR}/harness.err"
71 changes: 71 additions & 0 deletions models/llama2/llama2-13b/result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"results": {
"jsquad-1.1-0.3": {
"exact_match": 76.13687528140477,
"f1": 86.24170446058177
},
"jcommonsenseqa-1.1-0.3": {
"acc": 0.7488829311885612,
"acc_stderr": 0.012969528294765333,
"acc_norm": 0.40035746201966044,
"acc_norm_stderr": 0.014653766897279888
},
"jnli-1.1-0.3": {
"acc": 0.2198027937551356,
"acc_stderr": 0.008395522792803168,
"acc_norm": 0.30156121610517667,
"acc_norm_stderr": 0.009304239098715018
},
"marc_ja-1.1-0.3": {
"acc": 0.38892819243013793,
"acc_stderr": 0.006483975178620039,
"acc_norm": 0.38892819243013793,
"acc_norm_stderr": 0.006483975178620039
},
"jaqket_v2-0.1-0.3": {
"exact_match": 67.69759450171821,
"f1": 74.62526066907506
},
"xlsum_ja-1.0-0.3": {
"rouge2": 18.110069857141642
},
"mgsm-1.0-0.3": {
"acc": 0.1,
"acc_stderr": 0.01901172751573437
},
"xwinograd_ja": {
"acc": 0.6287799791449427,
"acc_stderr": 0.015609259235278878
}
},
"versions": {
"jsquad-1.1-0.3": 1.1,
"jcommonsenseqa-1.1-0.3": 1.1,
"jnli-1.1-0.3": 1.1,
"marc_ja-1.1-0.3": 1.1,
"jaqket_v2-0.1-0.3": 0.1,
"xlsum_ja-1.0-0.3": 1.0,
"xwinograd_ja": 1.0,
"mgsm-1.0-0.3": 1.0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True",
"num_fewshot": [
2,
3,
3,
3,
1,
1,
0,
5
],
"batch_size": 2,
"device": "cuda",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
5 changes: 5 additions & 0 deletions models/llama2/llama2-2.7b/harness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs main.py with the hf-causal-experimental model type against
# meta-llama/Llama-2-7b-hf on the four tasks listed in TASK.
# main.py and all output paths are relative, so run this from the directory
# that contains main.py.
#
# NOTE(review): the output directory is named "llama2-2.7b" while the model is
# Llama-2-7b-hf, the same model that models/llama2/llama2-7b/harness.sh
# evaluates. Confirm whether this directory is a stale or misnamed copy.
set -eu

MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True"
# Comma-separated task names. --num_fewshot below carries one value per task
# (presumably matched by position -- confirm against main.py).
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
# Directory that receives result.json plus the captured stdout/stderr.
OUT_DIR="models/llama2/llama2-2.7b"

# Expansions are quoted so each value reaches main.py as exactly one argument.
python main.py \
  --model hf-causal-experimental \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASK" \
  --num_fewshot "2,3,3,3" \
  --device "cuda" \
  --output_path "${OUT_DIR}/result.json" \
  --batch_size 2 \
  > "${OUT_DIR}/harness.out" 2> "${OUT_DIR}/harness.err"


48 changes: 48 additions & 0 deletions models/llama2/llama2-2.7b/result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"results": {
"jsquad-1.1-0.3": {
"exact_match": 58.37460603331832,
"f1": 69.51836154287909
},
"jcommonsenseqa-1.1-0.3": {
"acc": 0.5263628239499554,
"acc_stderr": 0.014932915029029303,
"acc_norm": 0.291331546023235,
"acc_norm_stderr": 0.013589216112682911
},
"jnli-1.1-0.3": {
"acc": 0.28225143796220215,
"acc_stderr": 0.009125006713744669,
"acc_norm": 0.30156121610517667,
"acc_norm_stderr": 0.009304239098715018
},
"marc_ja-1.1-0.3": {
"acc": 0.8604527767951893,
"acc_stderr": 0.004608765667738413,
"acc_norm": 0.8604527767951893,
"acc_norm_stderr": 0.004608765667738413
}
},
"versions": {
"jsquad-1.1-0.3": 1.1,
"jcommonsenseqa-1.1-0.3": 1.1,
"jnli-1.1-0.3": 1.1,
"marc_ja-1.1-0.3": 1.1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True",
"num_fewshot": [
2,
3,
3,
3
],
"batch_size": 2,
"device": "cuda",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
5 changes: 5 additions & 0 deletions models/llama2/llama2-7b-chat/harness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs main.py with the hf-causal-experimental model type against
# meta-llama/Llama-2-7b-chat-hf on the tasks listed in TASK.
# main.py and all output paths are relative, so run this from the directory
# that contains main.py.
set -eu

MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True"
# Comma-separated task names. --num_fewshot below carries one value per task
# (presumably matched by position -- confirm against main.py).
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
# Directory that receives result.json plus the captured stdout/stderr.
OUT_DIR="models/llama2/llama2-7b-chat"

# Expansions are quoted so each value reaches main.py as exactly one argument.
python main.py \
  --model hf-causal-experimental \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASK" \
  --num_fewshot "2,3,3,3,1,1,0,5" \
  --device "cuda" \
  --output_path "${OUT_DIR}/result.json" \
  --batch_size 2 \
  > "${OUT_DIR}/harness.out" 2> "${OUT_DIR}/harness.err"


71 changes: 71 additions & 0 deletions models/llama2/llama2-7b-chat/result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"results": {
"jsquad-1.1-0.3": {
"exact_match": 59.34263845114813,
"f1": 73.13860295063034
},
"jcommonsenseqa-1.1-0.3": {
"acc": 0.5558534405719392,
"acc_stderr": 0.014860122802670312,
"acc_norm": 0.30831099195710454,
"acc_norm_stderr": 0.013811124479483027
},
"jnli-1.1-0.3": {
"acc": 0.2953985209531635,
"acc_stderr": 0.00924921508921067,
"acc_norm": 0.3175842235004108,
"acc_norm_stderr": 0.009438064365860652
},
"marc_ja-1.1-0.3": {
"acc": 0.9041386628935267,
"acc_stderr": 0.00391561306533889,
"acc_norm": 0.9041386628935267,
"acc_norm_stderr": 0.00391561306533889
},
"jaqket_v2-0.1-0.3": {
"exact_match": 17.9553264604811,
"f1": 31.006768969536488
},
"xlsum_ja-1.0-0.3": {
"rouge2": 2.339856054050597
},
"mgsm-1.0-0.3": {
"acc": 0.092,
"acc_stderr": 0.018316275379429644
},
"xwinograd_ja": {
"acc": 0.6611053180396246,
"acc_stderr": 0.015292727421996942
}
},
"versions": {
"jsquad-1.1-0.3": 1.1,
"jcommonsenseqa-1.1-0.3": 1.1,
"jnli-1.1-0.3": 1.1,
"marc_ja-1.1-0.3": 1.1,
"jaqket_v2-0.1-0.3": 0.1,
"xlsum_ja-1.0-0.3": 1.0,
"xwinograd_ja": 1.0,
"mgsm-1.0-0.3": 1.0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True",
"num_fewshot": [
2,
3,
3,
3,
1,
1,
0,
5
],
"batch_size": 2,
"device": "cuda",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
5 changes: 5 additions & 0 deletions models/llama2/llama2-7b/harness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs main.py with the hf-causal-experimental model type against
# meta-llama/Llama-2-7b-hf on the tasks listed in TASK.
# main.py and all output paths are relative, so run this from the directory
# that contains main.py.
set -eu

MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True"
# Comma-separated task names. --num_fewshot below carries one value per task
# (presumably matched by position -- confirm against main.py).
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
# Directory that receives result.json plus the captured stdout/stderr.
OUT_DIR="models/llama2/llama2-7b"

# Expansions are quoted so each value reaches main.py as exactly one argument.
python main.py \
  --model hf-causal-experimental \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASK" \
  --num_fewshot "2,3,3,3,1,1,0,5" \
  --device "cuda" \
  --output_path "${OUT_DIR}/result.json" \
  --batch_size 2 \
  > "${OUT_DIR}/harness.out" 2> "${OUT_DIR}/harness.err"


71 changes: 71 additions & 0 deletions models/llama2/llama2-7b/result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"results": {
"jsquad-1.1-0.3": {
"exact_match": 58.39711841512832,
"f1": 69.52916111780529
},
"jcommonsenseqa-1.1-0.3": {
"acc": 0.5263628239499554,
"acc_stderr": 0.014932915029029303,
"acc_norm": 0.29222520107238603,
"acc_norm_stderr": 0.013601458439195222
},
"jnli-1.1-0.3": {
"acc": 0.28225143796220215,
"acc_stderr": 0.009125006713744669,
"acc_norm": 0.30156121610517667,
"acc_norm_stderr": 0.009304239098715018
},
"marc_ja-1.1-0.3": {
"acc": 0.8604527767951893,
"acc_stderr": 0.004608765667738413,
"acc_norm": 0.8604527767951893,
"acc_norm_stderr": 0.004608765667738413
},
"jaqket_v2-0.1-0.3": {
"exact_match": 38.83161512027491,
"f1": 43.653527171568406
},
"xlsum_ja-1.0-0.3": {
"rouge2": 9.32010216666052
},
"mgsm-1.0-0.3": {
"acc": 0.056,
"acc_stderr": 0.014570697336899597
},
"xwinograd_ja": {
"acc": 0.6465067778936392,
"acc_stderr": 0.015445228301221376
}
},
"versions": {
"jsquad-1.1-0.3": 1.1,
"jcommonsenseqa-1.1-0.3": 1.1,
"jnli-1.1-0.3": 1.1,
"marc_ja-1.1-0.3": 1.1,
"jaqket_v2-0.1-0.3": 0.1,
"xlsum_ja-1.0-0.3": 1.0,
"xwinograd_ja": 1.0,
"mgsm-1.0-0.3": 1.0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True",
"num_fewshot": [
2,
3,
3,
3,
1,
1,
0,
5
],
"batch_size": 2,
"device": "cuda",
"no_cache": false,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Runs main.py with the hf-causal model type against a local
# stablelm-ja-base-alpha-7b checkpoint and tokenizer found under PROJECT_DIR.
# main.py and the output path are relative, so run this from the directory
# that contains main.py.
set -eu

# XXX set your own project dir: either edit the assignment below or export
# PROJECT_DIR before running the script.
PROJECT_DIR="${PROJECT_DIR:-}"
# Fail fast: an empty PROJECT_DIR would silently yield paths like /hf_model/...
: "${PROJECT_DIR:?set PROJECT_DIR to your project dir}"

MODEL_ARGS="pretrained=${PROJECT_DIR}/hf_model/stablelm-ja-base-alpha-7b,tokenizer=${PROJECT_DIR}/tokenizers/nai-hf-tokenizer/,use_fast=False,trust_remote_code=True"
# Comma-separated task names. NUM_FEW_SHOTS carries one value per task
# (presumably matched by position -- confirm against main.py).
TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
NUM_FEW_SHOTS="3,3,3,2,1,1,0,5"

# Expansions are quoted so each value reaches main.py as exactly one argument,
# even when PROJECT_DIR contains spaces.
python main.py \
  --model hf-causal \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASK" \
  --num_fewshot "$NUM_FEW_SHOTS" \
  --device "cuda" \
  --output_path "models/stablelm/stablelm-ja-base-alpha-7b/result.json"
Loading

0 comments on commit d4bdbfa

Please sign in to comment.