-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnlp_db.py
131 lines (131 loc) · 5.95 KB
/
nlp_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Catalogue of Chinese NLP datasets and corpora.
#
# ``nlp_db`` is a list of dicts. Every key is optional; the keys used below are:
#   name          - display name of the dataset / collection
#   description   - free-text description ("desc" on the last entry)
#   homepage      - project home page
#   github_link   - GitHub repository URL
#   local_path    - path (or directory name) of the local copy
#   citation      - how to cite the resource
#   paper         - URL of the accompanying paper
#   download      - direct download URL
#   download_pwd  - password that goes with the download link
#   corpus        - nested dict with an optional "page" URL and a "data" list
#                   of per-dataset dicts that reuse the keys above
nlp_db = [
    {
        "name": "SMP2017ECDT-DATA",
        "github_link": "https://github.com/HITlilingzhi/SMP2017ECDT-DATA",
        "local_path": "~/Studio/nlp_db/SMP2017ECDT-DATA-master/",
        "citation": "Zhang W N, Chen Z, Che W, et al. The First Evaluation of Chinese Human-Computer Dialogue Technology[J]. 2017.",
        "paper": "https://arxiv.org/abs/1709.10217",
    },
    {
        "homepage": "http://www.nlpir.org/wordpress/",
        "corpus": {
            "page": "http://www.nlpir.org/wordpress/category/corpus%e8%af%ad%e6%96%99%e5%ba%93/",
            "data": [
                {
                    "name": "文本分类语料库(复旦)测试语料",
                    "description": "由复旦大学李荣陆提供。answer.rar为测试语料,共9833篇文档;train.rar为训练语料,共9804篇文档,分为20个类别。训练语料和测试语料基本按照1:1的比例来划分。收集工作花费了不少人力和物力,所以请大家在使用时尽量注明来源(复旦大学计算机信息与技术系国际数据库中心自然语言处理小组)。",
                    "download": "http://www.nlpir.org/wordpress/download/tc-corpus-answer.rar",
                },
                {
                    "name": "中文新闻分类语料库",
                    # NOTE(review): the e-mail address in this citation looks
                    # like a redaction placeholder; restore it if the original
                    # address is known.
                    "citation": "语料库作者: 刘禹 中国科学院自动化研究所综合信息中心 电子邮件[email protected]",
                    "download": "http://download.cnblogs.com/finallyliuyu/corpus.rar",
                },
            ],
        },
    },
    {
        # This entry previously declared "name" twice, so the Chinese title was
        # silently discarded (the last duplicate key in a dict display wins).
        # "name" keeps the value that was effective at runtime; the Chinese
        # title is preserved under "description".
        "name": "CLUE benchmark",
        "description": "中文任务基准测评",
        "homepage": "https://www.cluebenchmarks.com/index.html",
        "github_link": "https://github.com/CLUEbenchmark/CLUE",
        "local_path": "nlp_db",
        "corpus": {
            "data": [
                {
                    "name": "AFQMC 蚂蚁金融语义相似度",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/afqmc_public.zip",
                    "local_path": "afqmc_public",
                },
                {
                    "name": "TNEWS' 今日头条中文新闻(短文)分类",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/tnews_public.zip",
                    "local_path": "tnews_public",
                },
                {
                    "name": "IFLYTEK' 长文本分类",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/iflytek_public.zip",
                    "local_path": "iflytek_public",
                },
                {
                    "name": "CMNLI 语言推理任务",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/cmnli_public.zip",
                    "local_path": "cmnli_public",
                },
                {
                    "name": "WSC Winograd模式挑战中文版",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/cluewsc2020_public.zip",
                    "local_path": "cluewsc2020_public",
                },
                {
                    "name": "CSL 论文关键词识别",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/csl_public.zip",
                    "local_path": "csl_public",
                },
                {
                    "name": "CMRC2018 简体中文阅读理解任务",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/cmrc2018_public.zip",
                    "local_path": "cmrc2018_public",
                },
                {
                    # The "name" key and the comma after the title were
                    # missing, so the title was concatenated with
                    # "github_link" into a single bogus key.
                    "name": "DRCD 繁体阅读理解任务 Reading Comprehension for Traditional Chinese",
                    "github_link": "",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/drcd_public.zip",
                    "local_path": "drcd_public",
                },
                {
                    "name": "CHID 成语阅读理解填空",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/chid_public.zip",
                    "local_path": "chid_public",
                },
                {
                    "name": "C3 中文多选阅读理解",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/c3_public.zip",
                    "local_path": "c3_public",
                },
                {
                    "name": "诊断集 CLUE_diagnostics test_set",
                    "local_path": "clue_diagnostics_public",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/clue_diagnostics_public.zip",
                },
                {
                    "name": "CLUENER 细粒度命名实体识别",
                    "github_link": "https://github.com/CLUEbenchmark/CLUE",
                    "download": "https://storage.googleapis.com/cluebenchmark/tasks/cluener_public.zip",
                    "local_path": "cluener_public",
                },
            ],
        },
    },
    {
        # NOTE(review): this entry has no "name" and its "desc" text ends
        # mid-sentence - presumably truncated; confirm against the original
        # source before relying on it.
        "desc": "本语料库由复旦大学李荣陆提供。test_corpus.rar为测试语料,共9833篇文档;train_corpus.rar为训练语料,共9804篇文档,两个预料各分为20个相同类别。训练语料和测试语料基本按照1:1的比例来划分。使用时尽量注明来源(复旦大学计算机信息与技术系国",
        "download": "https://pan.baidu.com/s/1833mT2rhL6gBMlM0KnmyKg",
        "download_pwd": "zyxa",
    },
]