澈-python3利用skip-gram实现词的相关性查找

2020年09月24日 13:14:03
python3利用skip-gram实现词的相关性查找

n-gram 原理参考：https://blog.csdn.net/baimafujinji/article/details/51281816
#! -*- coding:utf-8 -*-
# 此函数作用是对初始语料进行分词处理后，作为训练模型的语料
import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
from gensim.models import word2vec
import gensim
import logging
import jieba
import os
import codecs

def cut_txt(old_file):
    """Segment a UTF-8 text file with jieba and save the tokens to a new file.

    The tokens are joined with single spaces and written, UTF-8 encoded, to
    ``old_file + '_cut.txt'``. Punctuation is kept in the output.

    Args:
        old_file: Path of the raw corpus; it must be UTF-8 encoded.

    Raises:
        OSError: If the input file cannot be opened or read. The error is
            printed and re-raised so the caller does not carry on without a
            segmented corpus. Errors while writing the output propagate too.
    """
    cut_file = old_file + '_cut.txt'  # name of the segmented output file

    try:
        # The context manager closes the handle even when reading fails.
        with codecs.open(old_file, 'r', encoding='utf-8') as fi:
            text = fi.read()
    except OSError as e:
        # Report the concrete error type, then propagate instead of falling
        # through to code that needs the (never assigned) file contents.
        print(type(e).__name__, ":", e)
        raise

    new_text = jieba.cut(text, cut_all=False)  # accurate (non-full) mode

    with codecs.open(cut_file, 'w', encoding='utf-8') as fo:
        fo.write(' '.join(new_text))


def model_train(train_file_name, save_model_file):
    """Train a skip-gram word2vec model on a segmented corpus and save it.

    Args:
        train_file_name: Path of the training corpus (tokens separated by
            whitespace, as loaded by ``word2vec.Text8Corpus``).
        save_model_file: Path the full model is saved to. The word vectors
            are additionally exported in binary word2vec format to
            ``save_model_file + ".bin"`` so they can be reused.
    """
    # Show gensim's training progress on the console.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.Text8Corpus(train_file_name)  # load the corpus

    # sg=1 selects skip-gram; without it gensim trains CBOW (sg=0 default).
    # window=3 narrows the context from gensim's default of 5.
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size` -- confirm the installed version before upgrading.
    model = gensim.models.Word2Vec(sentences, size=200, min_count=1, window=3, sg=1)
    model.save(save_model_file)
    # Derive the export path from the parameter, not from a caller's global.
    model.wv.save_word2vec_format(save_model_file + ".bin", binary=True)





if __name__ == '__main__':
    # hsds.txt          raw text
    # hsds.txt_cut.txt  the same text after word segmentation
    cut_file = 'hsds.txt_cut.txt'
    if not os.path.exists(cut_file):  # segment only when no cut file exists yet
        cut_txt('hsds.txt')  # the source file must be saved as UTF-8 first

    # word2vec1.model is the trained model; train only when it is missing.
    save_model_name = 'word2vec1.model'
    if not os.path.exists(save_model_name):
        model_train(cut_file, save_model_name)
    else:
        print('此训练模型已经存在，不用再次训练')

    # Load the trained model and query through its KeyedVectors (`.wv`);
    # the same-named methods on the model itself are deprecated shims.
    model_1 = word2vec.Word2Vec.load(save_model_name)
    word_vectors = model_1.wv

    # Similarity between two words.
    y1 = word_vectors.similarity("赵敏", "韦一笑")
    print(u"赵敏和韦一笑的相似度为：", y1)

    # The 5 words most related to the query word, with their scores.
    y2 = word_vectors.most_similar(u"赵敏", topn=5)
    for word, score in y2:
        print(word, score)

    print("-------------------------------\n")
一	二	三	四	五	六	日
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30