Text similarity analysis with TF-IDF in Python 3

This example uses jieba word segmentation and the TF-IDF algorithm (via gensim) to score how similar a test document is to two source documents.
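For reference, gensim's TfidfModel weights a term t in a document d roughly as follows (this matches gensim's defaults as I recall them, so verify against the version you use):

    w(t, d) = tf(t, d) * log2(N / df(t))

where tf(t, d) is the count of t in d, N is the number of documents in the corpus, and df(t) is the number of documents containing t. The weighted vectors are then L2-normalized, so the scores returned by the similarity index below are cosine similarities.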
import jieba
from gensim import corpora, models, similarities
def do():
    # The two source documents
    f1 = "one.txt"
    f2 = "two.txt"
    # The document to compare (test is scored against f1 and f2 separately)
    test = "three.txt"
    # Read the file contents
    c1 = readFile(f1)
    c2 = readFile(f2)
    doc_test = readFile(test)
    all_doc = [c1, c2]
    # Segment each source document with jieba and drop stopwords
    all_doc_list = []
    for doc in all_doc:
        doc_list = list(jieba.cut(doc))
        doc_list = delStopWord(doc_list)
        all_doc_list.append(doc_list)
    # Build the dictionary and the bag-of-words corpus from the sources
    dictionary = corpora.Dictionary(all_doc_list)
    corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
    # Segment the test document the same way and convert it to a BOW vector
    doc_test_list = delStopWord(list(jieba.cut(doc_test)))
    doc_test_vec = dictionary.doc2bow(doc_test_list)
    # Train the TF-IDF model on the corpus and build a similarity index over it
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    # Similarity between the test document and each source, ranked high to low
    sim = index[tfidf[doc_test_vec]]
    dd = sorted(enumerate(sim), key=lambda item: -item[1])
    print(dd)
    # Sample output: [(1, 0.79372525), (0, 0.0)], where the first number is the
    # document index and the second is the similarity score
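To make the intermediate representations above concrete, here is a minimal standalone sketch of the same gensim pipeline on a toy corpus; the token lists are hypothetical stand-ins for jieba's output:

# Toy example (hypothetical tokens), separate from the script above
from gensim import corpora, models
toy_docs = [["apple", "banana", "apple"], ["banana", "cherry"]]
d = corpora.Dictionary(toy_docs)            # maps each token to an integer id
bow = [d.doc2bow(doc) for doc in toy_docs]  # sparse (token_id, count) pairs
print(bow)                                  # e.g. [[(0, 2), (1, 1)], [(1, 1), (2, 1)]]
model = models.TfidfModel(bow)
print(model[bow[0]])                        # (token_id, weight) pairs; a term appearing
                                            # in every document gets idf 0 and drops out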
# Remove stopwords from a token list
def delStopWord(data):
    # Stopword lists are widely available online; the file is expected to hold
    # one word per line (adjust the encoding below if yours is not GBK)
    stopWordPath = "stopword.txt"
    content = readFile(stopWordPath, 'GBK')
    stopWordData = set(content.split('\n'))
    return [word for word in data if word not in stopWordData]
# Read a whole file into a string (UTF-8 by default)
def readFile(fileName, encoding='utf-8'):
    with open(fileName, encoding=encoding) as f:
        return f.read()
if __name__ == '__main__':
    do()
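If you only need the single closest document rather than the full ranking, a small helper along these lines could be added (bestMatch is a hypothetical name, not part of the original script):

# Hypothetical helper: return the name and score of the closest source document
def bestMatch(sim_scores, names):
    ranked = sorted(enumerate(sim_scores), key=lambda item: -item[1])
    idx, score = ranked[0]
    return names[idx], score

# Usage inside do(), after computing sim:
#     name, score = bestMatch(sim, [f1, f2])
#     print("Most similar: %s (%.4f)" % (name, score))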