Implementing RAKE, an English Keyword Extraction Algorithm, in Python 3
References
Algorithm walkthrough: https://blog.csdn.net/qq_29003925/article/details/80943689 (the score calculation there differs slightly from this article; most of the process is the same).
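In short, RAKE splits the text into candidate phrases at stop words and punctuation, scores each word as score(w) = deg(w) / freq(w), and scores a phrase as the sum of its word scores. Before the full script, here is a minimal, self-contained sketch of just that scoring step, on a hand-picked toy phrase list (the phrases are illustrative, not from the script below):

# Minimal sketch of RAKE's word scoring, score(w) = deg(w) / freq(w),
# on an assumed toy phrase list.
phrases = [["natural", "language"], ["language", "model"], ["model"]]

freq, deg = {}, {}
for phrase in phrases:
    for word in phrase:
        freq[word] = freq.get(word, 0) + 1               # occurrences across phrases
        deg[word] = deg.get(word, 0) + len(phrase) - 1   # co-occurring words in this phrase
for word in freq:
    deg[word] += freq[word]  # a word also counts as co-occurring with itself

scores = {w: deg[w] / freq[w] for w in freq}
print(scores)  # {'natural': 2.0, 'language': 2.0, 'model': 1.5}

# A phrase then scores the sum of its word scores:
# 'natural language' -> 4.0, 'language model' -> 3.5, 'model' -> 1.5

Note how 'model' scores lower than 'language' despite equal frequency: it appears once as a single-word phrase, so its degree is smaller. This is the same deg/freq logic the script below implements.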
#!/usr/bin/python3.6
# coding: utf-8
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.
# John Wiley and Sons, Ltd.

import re
import operator


class rake():
    def __init__(self):
        pass

    def is_number(self, s):
        try:
            float(s) if '.' in s else int(s)
            return True
        except ValueError:
            return False

    def load_stop_words(self, stop_word_file):
        """
        Utility function to load stop words from a file and return as a list of words
        @param stop_word_file Path and file name of a file containing stop words.
        @return list A list of stop words.
        """
        stop_words = []
        for line in open(stop_word_file):
            if line.strip()[0:1] != "#":
                for word in line.split():  # in case more than one per line
                    stop_words.append(word)
        return stop_words

    def separate_words(self, text, min_word_return_size):
        """
        Utility function to return a list of all words that have a length
        greater than a specified number of characters.
        @param text The text that must be split into words.
        @param min_word_return_size The minimum number of characters a word must have to be included.
        """
        splitter = re.compile(r'[^a-zA-Z0-9_\+\-/]')
        words = []
        for single_word in splitter.split(text):
            current_word = single_word.strip().lower()
            # leave numbers in phrase, but don't count as words,
            # since they tend to invalidate scores of their phrases
            if len(current_word) > min_word_return_size and current_word != '' and not self.is_number(current_word):
                words.append(current_word)
        return words

    def split_sentences(self, text):
        """
        Utility function to return a list of sentences.
        @param text The text that must be split into sentences.
        """
        sentence_delimiters = re.compile('[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
        sentences = sentence_delimiters.split(text)
        return sentences

    def build_stop_word_regex(self, stop_word_file_path):
        stop_word_list = self.load_stop_words(stop_word_file_path)
        stop_word_regex_list = []
        for word in stop_word_list:
            word_regex = r'\b' + word + r'(?![\w-])'  # added lookahead for hyphen
            stop_word_regex_list.append(word_regex)
        stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
        return stop_word_pattern

    def generate_candidate_keywords(self, sentence_list, stopword_pattern):
        phrase_list = []
        for s in sentence_list:
            # replace every stop-word match with a | separator
            tmp = re.sub(stopword_pattern, '|', s.strip())
            phrases = tmp.split("|")
            for phrase in phrases:
                phrase = phrase.strip().lower()
                if phrase != "":
                    phrase_list.append(phrase)
        return phrase_list

    def calculate_word_scores(self, phraseList):
        word_frequency = {}
        word_degree = {}
        for phrase in phraseList:
            # split the phrase into single words
            word_list = self.separate_words(phrase, 0)
            word_list_length = len(word_list)
            word_list_degree = word_list_length - 1
            # if word_list_degree > 3: word_list_degree = 3  # exp.
            for word in word_list:
                # frequency: add 1 each time the word appears in any phrase
                word_frequency.setdefault(word, 0)
                word_frequency[word] += 1
                # degree: count the other words that co-occur with this word in the phrase
                word_degree.setdefault(word, 0)
                word_degree[word] += word_list_degree  # orig.
                # word_degree[word] += 1 / (word_list_length * 1.0)  # exp.
        for item in word_frequency:
            word_degree[item] = word_degree[item] + word_frequency[item]

        # Calculate word scores = deg(w) / freq(w)
        word_score = {}
        for item in word_frequency:
            word_score.setdefault(item, 0)
            word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  # orig.
            # word_score[item] = word_frequency[item] / (word_degree[item] * 1.0)  # exp.
        return word_score

    def generate_candidate_keyword_scores(self, phrase_list, word_score):
        keyword_candidates = {}
        for phrase in phrase_list:
            keyword_candidates.setdefault(phrase, 0)
            # split the phrase into single words
            word_list = self.separate_words(phrase, 0)
            candidate_score = 0
            for word in word_list:
                candidate_score += word_score[word]
            keyword_candidates[phrase] = candidate_score
        return keyword_candidates

    def get_keyword(self, text):
        # Split text into sentences (at , ! ? and similar delimiters)
        sentenceList = self.split_sentences(text)
        # stoppath = "FoxStoplist.txt"  # Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
        stoppath = "SmartStoplist.txt"  # SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
        stopwordpattern = self.build_stop_word_regex(stoppath)
        # generate candidate keywords (split sentences into phrases at stop words)
        phraseList = self.generate_candidate_keywords(sentenceList, stopwordpattern)
        # calculate individual word scores (from frequency freq and degree deg)
        wordscores = self.calculate_word_scores(phraseList)
        # generate candidate keyword scores (sum the word scores of each phrase)
        keywordcandidates = self.generate_candidate_keyword_scores(phraseList, wordscores)
        # sort the results by score, highest first
        sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
        return sortedKeywords

    def index(self):
        text = ("The Chinese mainland's popular hot pot chain Haidilao will stand at a listing hearing "
                "for formal approval of an initial public offering on the Hong Kong stock exchange "
                "next Thursday, Beijing Business Today reported.")
        # keywords extracted from the title
        title_keyword_list = self.get_keyword(text)
        print(title_keyword_list)
        # Result: [('popular hot pot chain haidilao', 25.0), ('hong kong stock exchange', 16.0),
        #          ('beijing business today reported', 16.0), ('initial public offering', 9.0),
        #          ('chinese mainland', 4.0), ('listing hearing', 4.0), ('formal approval', 4.0),
        #          ('stand', 1.0), ('thursday', 1.0)]


if __name__ == '__main__':
    rake = rake()
    rake.index()
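Assuming the script above is saved as rake.py (the file name is mine, not the original post's) and SmartStoplist.txt sits in the working directory, the extractor can be reused on any English text:

from rake import rake  # the class defined in the script above

extractor = rake()
keywords = extractor.get_keyword(
    "Compatibility of systems of linear constraints over the set of natural numbers"
)
# keywords is a list of (phrase, score) tuples, highest score first
for phrase, score in keywords[:3]:
    print(phrase, score)

Note that get_keyword reads the stop-word file via a relative path, so the script must be run from the directory containing SmartStoplist.txt.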
SmartStoplist.txt is the stop-word list. Its format is as follows:
#stop word list from SMART (Salton, 1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
a
a's
able
about
above
according
accordingly
across
actually
after
afterwards
again
against
ain't
all
allow
allows
almost
alone
along
already
also
although
always
am
.....
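Since load_stop_words skips lines starting with # and splits every remaining line on whitespace, one word per line and several words per line both work. A quick sanity check, reusing the rake class from above (the temp file name is illustrative):

# Writing a tiny stop list and loading it back: comment lines are ignored,
# and any whitespace-separated layout is accepted.
with open("tiny_stoplist.txt", "w") as f:
    f.write("# comment lines are ignored\n")
    f.write("a a's able\n")   # several words on one line
    f.write("about\n")        # or one word per line

print(rake().load_stop_words("tiny_stoplist.txt"))
# ['a', "a's", 'able', 'about']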