1. Download the Wikipedia dump
https://dumps.wikimedia.org/zhwiki/20190820/
zhwiki-20190820-pages-articles.xml.bz2
You can also download one of the smaller partial dumps listed on the same page.
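If you prefer fetching the dump from the command line, a one-liner works (assuming wget is installed; the URL is the one above):
wget https://dumps.wikimedia.org/zhwiki/20190820/zhwiki-20190820-pages-articles.xml.bz2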
2. Extract the text from the bz2 archive
Make sure gensim is installed: pip install gensim
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# process.py: convert the XML wiki dump to plain text
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])  # script name, used as the logger name
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    if len(sys.argv) < 3:
        print("usage: python process.py <input.xml.bz2> <output.txt>")
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    output = open(outp, 'w', encoding='utf-8')
    # gensim's Wikipedia-dump reader; dictionary={} skips building a
    # vocabulary, which this conversion does not need
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    # get_texts() yields each article as a list of tokens with markup and
    # punctuation already stripped; write one article per line
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles.")
    output.close()
    logger.info("Finished. Saved " + str(i) + " articles.")
# python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt
cd into that directory and run (rename the dump first if necessary so the filename matches):
python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt
This produces wiki.zh.txt.
You can check the result with a small test script (the file is large, so opening it directly in an editor is slow):
# test.py: print the first line of the extracted corpus
import codecs
f = codecs.open('wiki.zh.txt', 'r', encoding="utf-8")
line = f.readline()
print(line)
# python test.py
3. Convert traditional Chinese to simplified
The extracted text is largely in traditional Chinese; convert it to simplified with OpenCC, e.g. the opencc-1.0.1 Windows build:
Link: https://pan.baidu.com/s/1e_qPxWzAZS74s4aIEEQWSA
Extraction code: by1b
After unpacking, put wiki.zh.txt into the OpenCC directory and cd into it:
cd /d X:\opencc-1.0.1-win64
Then run:
opencc -i wiki.zh.txt -o wiki.zh.simp.txt -c t2s.json
This yields the simplified-Chinese wiki.zh.simp.txt, which you can again check with test.py.
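If you would rather stay in Python than fetch the Windows binary, the OpenCC bindings on PyPI (e.g. opencc-python-reimplemented; this package choice is an assumption, not part of the original walkthrough) do the same t2s conversion. A minimal sketch:
# t2s.py: traditional-to-simplified conversion in Python
# (sketch; assumes `pip install opencc-python-reimplemented`)
from opencc import OpenCC

cc = OpenCC('t2s')  # traditional -> simplified
with open('wiki.zh.txt', encoding='utf-8') as fin, \
     open('wiki.zh.simp.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(cc.convert(line))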
4. Word segmentation
Use the jieba segmenter: pip install jieba
cd into the directory containing wiki.zh.simp.txt.
# segment.py: segment each article line with jieba
# (do not name this file jieba.py, or `import jieba` would import the script itself)
import codecs
import jieba

f = codecs.open('wiki.zh.simp.txt', 'r', encoding='utf-8')
target = codecs.open('wiki.zh.simp.seg.txt', 'w', encoding='utf-8')
print('open files')
line_num = 1
line = f.readline()
while line:
    print('----processing', line_num, 'article--------------------')
    # join the tokens with spaces so word2vec can later split on whitespace
    line_seg = " ".join(jieba.cut(line.strip())) + "\n"
    target.write(line_seg)
    line_num = line_num + 1
    line = f.readline()
f.close()
target.close()
# python segment.py
This produces the segmented corpus wiki.zh.simp.seg.txt.
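If the corpus contains domain terms that jieba splits apart, you can load a custom dictionary before cutting via jieba.load_userdict. A short sketch (userdict.txt is a hypothetical file you would create yourself, one "word freq pos" entry per line):
import jieba

# extra entries keep multi-character terms from being split
jieba.load_userdict('userdict.txt')  # hypothetical custom dictionary
print(" ".join(jieba.cut("自然语言处理实验室")))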
5. Train the model
# word2vec_model.py: train word2vec on the segmented corpus
import logging
import multiprocessing
import os.path
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 4:
        print("usage: python word2vec_model.py <input> <model_out> <vectors_out>")
        sys.exit(1)
    input_dir, outp1, outp2 = sys.argv[1:4]
    # PathLineSentences streams the corpus line by line, so the whole file
    # never has to fit in memory (it also accepts a single file path)
    model = Word2Vec(PathLineSentences(input_dir),
                     size=256, window=10, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
# python word2vec_model.py wiki.zh.simp.seg.txt wiki.zh.text.model wiki.zh.text.vector
This produces four files: wiki.zh.text.model with its accompanying .npy array files, plus the plain-text wiki.zh.text.vector. (The parameter names size and iter are from gensim 3.x; gensim 4.x renamed them to vector_size and epochs.)
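The plain-text .vector file can also be loaded on its own, without the full model, through gensim's KeyedVectors. A minimal sketch, assuming the file names from the command above:
from gensim.models import KeyedVectors

# load only the word vectors (no training state) from the text export
wv = KeyedVectors.load_word2vec_format('wiki.zh.text.vector', binary=False)
print(wv['牛奶'][:10])  # first 10 dimensions of one embedding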
6. Test the model
from gensim.models import Word2Vec

en_wiki_word2vec_model = Word2Vec.load('wiki.zh.text.model')
testwords = ['鼠标', '编程', '杯子', '实验室', '牛奶']
for word in testwords:
    # query nearest neighbours through the model's KeyedVectors (.wv)
    res = en_wiki_word2vec_model.wv.most_similar(word)
    print(word)
    print(res)
For each test word this prints its nearest neighbours together with their similarity scores.
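The same .wv interface also answers other queries, such as pairwise similarity or odd-one-out. A short illustrative sketch (the word choices here are placeholders, not from the original):
# cosine similarity between two of the test words
print(en_wiki_word2vec_model.wv.similarity('牛奶', '杯子'))
# pick the word that fits least with the others
print(en_wiki_word2vec_model.wv.doesnt_match(['牛奶', '杯子', '编程']))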