Text similarity analysis with TF-IDF in Python 3
This post uses jieba word segmentation together with the TF-IDF algorithm (via gensim) to implement text similarity analysis.
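Before the full script, a quick sketch of the segmentation step: jieba.cut returns a generator of tokens, which is the input the rest of the pipeline works on. The sample sentence is just an illustration, and the exact segmentation can vary with the jieba version and dictionary.

import jieba

# jieba.cut returns a generator of tokens; list() materializes it
tokens = list(jieba.cut("今天天气很好"))
print(tokens)  # e.g. ['今天', '天气', '很', '好'] (segmentation may vary)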
import jieba
from gensim import corpora, models, similarities

def do():
    # The two source files
    f1 = "one.txt"
    f2 = "two.txt"
    # The file to compare (test is compared against f1 and f2 separately)
    test = "three.txt"

    # Read the file contents
    c1 = readFile(f1)
    c2 = readFile(f2)
    doc_test = readFile(test)

    # Segment each source document with jieba and remove stopwords
    all_doc = [c1, c2]
    all_doc_list = []
    for doc in all_doc:
        doc_list = [word for word in jieba.cut(doc)]
        doc_list = delStopWord(doc_list)
        all_doc_list.append(doc_list)

    # Build the dictionary and the bag-of-words corpus
    dictionary = corpora.Dictionary(all_doc_list)
    corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]

    # Process the test document the same way
    doc_test_list = [word for word in jieba.cut(doc_test)]
    doc_test_list = delStopWord(doc_test_list)
    doc_test_vec = dictionary.doc2bow(doc_test_list)

    # Train the TF-IDF model and build the similarity index
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))

    # Compute the similarity of the test document against each source document
    sim = index[tfidf[doc_test_vec]]
    dd = sorted(enumerate(sim), key=lambda item: -item[1])
    print(dd)
    # Example output: [(1, 0.79372525), (0, 0.0)]
    # 1 is the document index, 0.79372525 is the similarity score

# Remove stopwords
def delStopWord(data):
    # Stopword lists are easy to find online; one word per line
    stopWordPath = "stopword.txt"
    content = readFile(stopWordPath, 'GBK')
    stopWordData = content.split('\n')
    return [word for word in data if word not in stopWordData]

# Read a file's full contents
def readFile(fileName, encoding='utf-8'):
    with open(fileName, encoding=encoding) as f:
        return f.read()

if __name__ == '__main__':
    do()
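To try the pipeline without creating one.txt, two.txt, and stopword.txt, here is a minimal self-contained sketch of the same steps using hypothetical in-memory sample strings (stopword filtering is omitted for brevity):

import jieba
from gensim import corpora, models, similarities

# Hypothetical sample texts standing in for the files
docs = ["今天天气很好,我们去公园散步", "这篇文章讲的是机器学习算法"]
query = "天气不错,一起去公园散散步"

# Segment, build the dictionary and bag-of-words corpus
tokenized = [list(jieba.cut(d)) for d in docs]
dictionary = corpora.Dictionary(tokenized)
corpus = [dictionary.doc2bow(d) for d in tokenized]

# Train TF-IDF, index the corpus, and score the query against each document
tfidf = models.TfidfModel(corpus)
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
sims = index[tfidf[dictionary.doc2bow(list(jieba.cut(query)))]]
print(sorted(enumerate(sims), key=lambda item: -item[1]))

By default, gensim's TfidfModel weights each term by its raw frequency times log2(N/df) and L2-normalizes the vectors, so the scores returned by the index are cosine similarities between 0 and 1.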