word2vec_train.py
#coding:utf8
import os
import re
import jieba
import logging
from gensim.models import word2vec
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # number of worker processes used when training the word2vec model
    workers = 5
    # base directory (the directory containing this script)
    baseDir = os.path.dirname(os.path.abspath(__file__))
    modelDir = baseDir + '/word2vec_model'
    if not os.path.exists(modelDir):
        os.makedirs(modelDir)
    ################################################ segment the raw text with jieba and build the corpus ################################################
    # custom jieba dictionary (optional)
    #jieba.set_dictionary(baseDir+'/extra_dict/dict.txt.big')
    # stop words
    stop_word_set = set()
    # strict punctuation set (full-width / CJK punctuation)
    strict_punctuation = '。,、':∶;?‘’“”〝〞ˆˇ﹕︰﹔﹖﹑·¨….¸;!´?!~—ˉ|‖"〃`@﹫¡¿﹏﹋﹌︴々﹟#﹩$﹠&﹪%*﹡﹢﹦﹤‐ ̄¯―﹨ˆ˜﹍﹎+=<__-\ˇ~﹉﹊()〈〉‹›﹛﹜『』〖〗[]《》〔〕{}「」【】︵︷︿︹︽_﹁﹃︻︶︸﹀︺︾ˉ﹂﹄︼'
    # basic punctuation set (ASCII punctuation)
    simple_punctuation = '’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # full punctuation set to strip from the raw text
    punctuation = simple_punctuation + strict_punctuation
    # with open(baseDir+'/extra_dict/stop_words.txt', 'r', encoding='utf-8') as sw:
    #     for line in sw:
    #         stop_word_set.add(line.strip('\n'))
    texts_num = 0
    # output path for the segmented corpus
    output = open(baseDir + '/data/corpus.txt', 'w', encoding='utf-8')
    with open(baseDir + '/data/text.txt', 'r', encoding='utf-8') as content:
        for line in content:
            # strip punctuation (optional, commented out)
            #line = re.sub('[{0}]+'.format(punctuation), '', line.strip('\n'))
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            for word in words:
                #if word not in stop_word_set:
                output.write(word + ' ')
            texts_num += 1
            if texts_num % 1000000 == 0:
                logging.info("finished segmenting the first %d lines" % texts_num)
    output.close()
    logging.info("corpus construction finished!")
    ################################################ load the processed corpus and train the word2vec model ################################################
    logging.info("training word2vec ...")
    sentences = word2vec.Text8Corpus(baseDir + '/data/corpus.txt')
    # skip-gram, 300-dim vectors; note: gensim >= 4.0 renames `size` to `vector_size`
    model = word2vec.Word2Vec(sentences, sg=1, size=300, window=5, min_count=10, negative=5, sample=1e-4, workers=workers)
    model.wv.save_word2vec_format(modelDir + '/word2vec.txt', binary=False)
    logging.info("training finished, model saved!")
if __name__ == '__main__':
    main()
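
A quick way to sanity-check the trained vectors is to load the exported text-format file back with gensim's KeyedVectors and query a few nearest neighbours. The sketch below is not part of the original repository: it assumes the output path used above (word2vec_model/word2vec.txt) and uses '中国' purely as a placeholder query word; substitute any token that actually occurs in your corpus.

# sanity_check.py -- illustrative only, not part of the original script
from gensim.models import KeyedVectors

# load the plain-text vectors written by word2vec_train.py
wv = KeyedVectors.load_word2vec_format('word2vec_model/word2vec.txt', binary=False)

# '中国' is a placeholder; pick any word present in your training corpus
for word, score in wv.most_similar('中国', topn=5):
    print(word, score)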