Implementing RAKE, an English Keyword Extraction Algorithm, in Python 3
References
Algorithm walkthrough: https://blog.csdn.net/qq_29003925/article/details/80943689 (the score calculation there differs slightly from this article; most of the process is the same).
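In short, RAKE splits the text into candidate phrases at stop words and punctuation, scores each word as score(w) = deg(w) / freq(w), and scores a phrase as the sum of its word scores. Before the full script, here is a minimal, self-contained sketch of just that scoring step, on a hand-picked toy phrase list (the phrases are illustrative, not from the script below):

# Minimal sketch of RAKE's word scoring, score(w) = deg(w) / freq(w),
# on an assumed toy phrase list.
phrases = [["natural", "language"], ["language", "model"], ["model"]]

freq, deg = {}, {}
for phrase in phrases:
    for word in phrase:
        freq[word] = freq.get(word, 0) + 1               # occurrences across phrases
        deg[word] = deg.get(word, 0) + len(phrase) - 1   # co-occurring words in this phrase
for word in freq:
    deg[word] += freq[word]  # a word also counts as co-occurring with itself

scores = {w: deg[w] / freq[w] for w in freq}
print(scores)  # {'natural': 2.0, 'language': 2.0, 'model': 1.5}

# A phrase then scores the sum of its word scores:
# 'natural language' -> 4.0, 'language model' -> 3.5, 'model' -> 1.5

Note how 'model' scores lower than 'language' despite equal frequency: it appears once as a single-word phrase, so its degree is smaller. This is the same deg/freq logic the script below implements.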
#!/usr/bin/python3.6
# coding: utf-8
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.
# John Wiley and Sons, Ltd.

import re
import operator


class rake():
    def __init__(self):
        pass

    def is_number(self, s):
        try:
            float(s) if '.' in s else int(s)
            return True
        except ValueError:
            return False

    def load_stop_words(self, stop_word_file):
        """
        Utility function to load stop words from a file and return as a list of words
        @param stop_word_file Path and file name of a file containing stop words.
        @return list A list of stop words.
        """
        stop_words = []
        for line in open(stop_word_file):
            if line.strip()[0:1] != "#":
                for word in line.split():  # in case more than one per line
                    stop_words.append(word)
        return stop_words

    def separate_words(self, text, min_word_return_size):
        """
        Utility function to return a list of all words that have a length
        greater than a specified number of characters.
        @param text The text that must be split into words.
        @param min_word_return_size The minimum number of characters a word must have to be included.
        """
        splitter = re.compile(r'[^a-zA-Z0-9_\+\-/]')
        words = []
        for single_word in splitter.split(text):
            current_word = single_word.strip().lower()
            # leave numbers in phrase, but don't count as words,
            # since they tend to invalidate scores of their phrases
            if len(current_word) > min_word_return_size and current_word != '' and not self.is_number(current_word):
                words.append(current_word)
        return words

    def split_sentences(self, text):
        """
        Utility function to return a list of sentences.
        @param text The text that must be split into sentences.
        """
        sentence_delimiters = re.compile('[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
        sentences = sentence_delimiters.split(text)
        return sentences

    def build_stop_word_regex(self, stop_word_file_path):
        stop_word_list = self.load_stop_words(stop_word_file_path)
        stop_word_regex_list = []
        for word in stop_word_list:
            word_regex = r'\b' + word + r'(?![\w-])'  # added lookahead for hyphen
            stop_word_regex_list.append(word_regex)
        stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
        return stop_word_pattern

    def generate_candidate_keywords(self, sentence_list, stopword_pattern):
        phrase_list = []
        for s in sentence_list:
            # replace every stop-word match with a | separator
            tmp = re.sub(stopword_pattern, '|', s.strip())
            phrases = tmp.split("|")
            for phrase in phrases:
                phrase = phrase.strip().lower()
                if phrase != "":
                    phrase_list.append(phrase)
        return phrase_list

    def calculate_word_scores(self, phraseList):
        word_frequency = {}
        word_degree = {}
        for phrase in phraseList:
            # split the phrase into single words
            word_list = self.separate_words(phrase, 0)
            word_list_length = len(word_list)
            word_list_degree = word_list_length - 1
            # if word_list_degree > 3: word_list_degree = 3  # exp.
            for word in word_list:
                # frequency: add 1 each time the word appears in any phrase
                word_frequency.setdefault(word, 0)
                word_frequency[word] += 1
                # degree: count the other words that co-occur with this word in the phrase
                word_degree.setdefault(word, 0)
                word_degree[word] += word_list_degree  # orig.
                # word_degree[word] += 1 / (word_list_length * 1.0)  # exp.
        for item in word_frequency:
            word_degree[item] = word_degree[item] + word_frequency[item]

        # Calculate word scores = deg(w) / freq(w)
        word_score = {}
        for item in word_frequency:
            word_score.setdefault(item, 0)
            word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  # orig.
            # word_score[item] = word_frequency[item] / (word_degree[item] * 1.0)  # exp.
        return word_score

    def generate_candidate_keyword_scores(self, phrase_list, word_score):
        keyword_candidates = {}
        for phrase in phrase_list:
            keyword_candidates.setdefault(phrase, 0)
            # split the phrase into single words
            word_list = self.separate_words(phrase, 0)
            candidate_score = 0
            for word in word_list:
                candidate_score += word_score[word]
            keyword_candidates[phrase] = candidate_score
        return keyword_candidates

    def get_keyword(self, text):
        # Split text into sentences (at , ! ? and similar delimiters)
        sentenceList = self.split_sentences(text)
        # stoppath = "FoxStoplist.txt"  # Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
        stoppath = "SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
        stopwordpattern = self.build_stop_word_regex(stoppath)
        # generate candidate keywords (split sentences into phrases at stop words)
        phraseList = self.generate_candidate_keywords(sentenceList, stopwordpattern)
        # calculate individual word scores (from frequency freq and degree deg)
        wordscores = self.calculate_word_scores(phraseList)
        # generate candidate keyword scores (sum the word scores of each phrase)
        keywordcandidates = self.generate_candidate_keyword_scores(phraseList, wordscores)
        # sort the results by score, highest first
        sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
        return sortedKeywords

    def index(self):
        text = ("The Chinese mainland's popular hot pot chain Haidilao will stand at a listing hearing "
                "for formal approval of an initial public offering on the Hong Kong stock exchange "
                "next Thursday, Beijing Business Today reported.")
        # keywords extracted from the title
        title_keyword_list = self.get_keyword(text)
        print(title_keyword_list)
        # Result: [('popular hot pot chain haidilao', 25.0), ('hong kong stock exchange', 16.0),
        #          ('beijing business today reported', 16.0), ('initial public offering', 9.0),
        #          ('chinese mainland', 4.0), ('listing hearing', 4.0), ('formal approval', 4.0),
        #          ('stand', 1.0), ('thursday', 1.0)]


if __name__ == '__main__':
    rake = rake()
    rake.index()
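Assuming the script above is saved as rake.py (the file name is mine, not the original post's) and SmartStoplist.txt sits in the working directory, the extractor can be reused on any English text:

from rake import rake  # the class defined in the script above

extractor = rake()
keywords = extractor.get_keyword(
    "Compatibility of systems of linear constraints over the set of natural numbers"
)
# keywords is a list of (phrase, score) tuples, highest score first
for phrase, score in keywords[:3]:
    print(phrase, score)

Note that get_keyword reads the stop-word file via a relative path, so the script must be run from the directory containing SmartStoplist.txt.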
SmartStoplist.txt is the stop-word list. Its format is as follows:
#stop word list from SMART (Salton, 1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
a
a's
able
about
above
according
accordingly
across
actually
after
afterwards
again
against
ain't
all
allow
allows
almost
alone
along
already
also
although
always
am
.....
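Since load_stop_words skips lines starting with # and splits every remaining line on whitespace, one word per line and several words per line both work. A quick sanity check, reusing the rake class from above (the temp file name is illustrative):

# Writing a tiny stop list and loading it back: comment lines are ignored,
# and any whitespace-separated layout is accepted.
with open("tiny_stoplist.txt", "w") as f:
    f.write("# comment lines are ignored\n")
    f.write("a a's able\n")   # several words on one line
    f.write("about\n")        # or one word per line

print(rake().load_stop_words("tiny_stoplist.txt"))
# ['a', "a's", 'able', 'about']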