Python 3: implementing simple intelligent Q&A with jieba word segmentation
This post uses jieba word segmentation to compute the TF-IDF cosine similarity between two sentences, which a customer-service bot can use to match an incoming question against a list of frequently asked questions and reply automatically.
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.linalg import norm

# full-width punctuation to strip before tokenizing
punc = "！？。？＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# stop-word list, one word per line
stoplist = {}.fromkeys([line.strip() for line in open("stoplist.txt", errors="ignore")])

def index():
    query = "借书的一般程序是什么"
    # strip punctuation
    query_del_punc = re.sub(u"[%s]+" % punc, "", query)
    # tokenize with jieba
    query_cut = jieba.cut(query_del_punc, cut_all=False)
    # drop stop words
    query_segs = [word for word in query_cut if word not in stoplist]
    qu_str = ' '.join(query_segs)
    getQuestion(qu_str)

def cosine_similarity_tfidf(s1, s2):
    """
    Compute the TF-IDF cosine similarity of two pre-tokenized, space-joined sentences.
    :param s1:
    :param s2:
    :return:
    """
    vectorizer = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = vectorizer.fit_transform(corpus).toarray()
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

def getQuestion(qu_str):
    # candidate FAQ questions
    data = ["借书的一般程序是什么", "怎样借书", "怎样追女神"]
    list_smi_score = []
    i = 0
    for line in data:
        i = i + 1
        # preprocess each candidate the same way as the query
        line_del_punc = re.sub(u"[%s]+" % punc, "", line)
        line_cut = jieba.cut(line_del_punc, cut_all=False)
        an_segs = [word for word in line_cut if word not in stoplist]
        an_str = ' '.join(an_segs)
        score = cosine_similarity_tfidf(qu_str, an_str)
        list_smi_score.append((score, i))
    print(list_smi_score)
    # [(1.0000000000000002, 1), (0.26055567105626243, 2), (0.0, 3)]

index()
As the printed output shows, the first candidate question has the highest similarity to the query.
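The script stops at printing the raw score list. For the customer-service scenario mentioned at the top, the natural last step is to return the canned reply of the best-matching question. The code below is only a hypothetical extension sketched on top of the functions above; preprocess, best_answer, the answers list and the 0.3 threshold are illustrative assumptions, not part of the original post.

# Hypothetical extension: reuses punc, stoplist and cosine_similarity_tfidf defined above
def preprocess(text):
    # same pipeline as above: strip punctuation, tokenize with jieba, drop stop words
    text = re.sub(u"[%s]+" % punc, "", text)
    return ' '.join(w for w in jieba.cut(text, cut_all=False) if w not in stoplist)

def best_answer(query, data, answers, threshold=0.3):
    qu_str = preprocess(query)
    # score the query against every known FAQ question
    scores = [cosine_similarity_tfidf(qu_str, preprocess(q)) for q in data]
    best = int(np.argmax(scores))
    # fall back to a default reply when no known question is similar enough
    if scores[best] < threshold:
        return "抱歉，这个问题我还回答不了"
    return answers[best]

# illustrative FAQ answers aligned with the questions in data
data = ["借书的一般程序是什么", "怎样借书", "怎样追女神"]
answers = ["先办理读者证，再到借阅台扫码借书即可", "带上读者证到服务台办理", "这个问题超出了我的能力范围"]
print(best_answer("请问怎么借书", data, answers))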