Implementing simple question answering in Python 3 with jieba word segmentation
This post uses jieba word segmentation to compute the TF-IDF cosine similarity between two sentences, which can be used to let a customer-service bot answer frequently asked questions automatically.
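As a quick refresher (this toy example is not part of the original post), cosine similarity is the dot product of the two TF-IDF vectors divided by the product of their norms: it is 1 for identical sentences and 0 for sentences that share no terms. With two hand-made vectors:

import numpy as np
from scipy.linalg import norm

a = np.array([1.0, 0.0, 1.0])                # toy TF-IDF vector for sentence A
b = np.array([1.0, 1.0, 0.0])                # toy TF-IDF vector for sentence B
print(np.dot(a, b) / (norm(a) * norm(b)))    # prints 0.5

The full script below applies exactly this computation to real sentences segmented by jieba.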
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.linalg import norm
# Chinese/fullwidth punctuation to strip before segmentation
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# load the stop-word list into a dict for fast membership tests
stoplist = {}.fromkeys([line.strip() for line in open("stoplist.txt", errors="ignore")])
def index():
    query = "借书的一般程序是什么"
    # strip punctuation
    query_del_punc = re.sub(u"[%s]+" % punc, "", query)
    # segment with jieba (precise mode)
    query_cut = jieba.cut(query_del_punc, cut_all=False)
    # drop stop words
    query_segs = [word for word in query_cut if word not in stoplist]
    qu_str = ' '.join(query_segs)
    getQuestion(qu_str)
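# Note (added remark, not in the original post): with a typical Chinese stop-word list,
# the qu_str built above ends up as a space-separated token string such as
# "借书 一般 程序"; the exact tokens depend on what stoplist.txt contains.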
def cosine_similarity_tfidf(s1, s2):
    """
    Compute the TF-IDF cosine similarity of two sentences.
    :param s1: first sentence as a space-separated token string
    :param s2: second sentence as a space-separated token string
    :return: similarity score in [0, 1]
    """
    vectorizer = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = vectorizer.fit_transform(corpus).toarray()
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))
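# Alternative (not used in the original post): scikit-learn ships a ready-made helper,
# sklearn.metrics.pairwise.cosine_similarity, which yields the same score without the
# manual dot-product/norm step, e.g.:
#
#     from sklearn.metrics.pairwise import cosine_similarity
#     vectors = vectorizer.fit_transform([s1, s2])
#     score = cosine_similarity(vectors[0], vectors[1])[0][0]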
def getQuestion(qu_str):
    # candidate questions to match the query against
    data = ["借书的一般程序是什么", "怎样借书", "怎样追女神"]
    list_smi_score = []
    i = 0
    for line in data:
        i = i + 1
        # same preprocessing as for the query: strip punctuation, segment, drop stop words
        line_del_punc = re.sub(u"[%s]+" % punc, "", line)
        line_cut = jieba.cut(line_del_punc, cut_all=False)
        an_segs = [word for word in line_cut if word not in stoplist]
        an_str = ' '.join(an_segs)
        score = cosine_similarity_tfidf(qu_str, an_str)
        list_smi_score.append((score, i))
    print(list_smi_score)
    # [(1.0000000000000002, 1), (0.26055567105626243, 2), (0.0, 3)]
index()

As the printed scores show, the first candidate question has the highest similarity to the query, i.e. it is the best match.
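To turn this into an actual FAQ answerer rather than a score printer, one possible extension (a minimal sketch, not part of the original code; the get_best_answer name, the placeholder answer strings, and the 0.3 threshold are all made up for illustration) is to keep a canned answer for each candidate question, reuse punc, stoplist and cosine_similarity_tfidf from the script above, and return the answer with the highest score:

def get_best_answer(qu_str):
    questions = ["借书的一般程序是什么", "怎样借书", "怎样追女神"]
    answers = ["（问题1的答案）", "（问题2的答案）", "（问题3的答案）"]   # placeholder answers
    best_score, best_idx = 0.0, 0
    for idx, question in enumerate(questions):
        # same preprocessing as getQuestion: strip punctuation, segment, drop stop words
        cleaned = re.sub(u"[%s]+" % punc, "", question)
        segs = [w for w in jieba.cut(cleaned, cut_all=False) if w not in stoplist]
        score = cosine_similarity_tfidf(qu_str, ' '.join(segs))
        if score > best_score:
            best_score, best_idx = score, idx
    # 0.3 is an arbitrary illustrative threshold; fall back to a default reply below it
    return answers[best_idx] if best_score > 0.3 else "抱歉，我暂时无法回答这个问题。"

Called with the segmented query produced in index(), this would return the answer attached to the first question, since its similarity score is the highest.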