Crawling Baidu Tieba posts with Python and generating a word cloud from keyword analysis
The code is as follows:
import requests
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


class Tieba():
    def __init__(self):
        # Thread-list pages of the "仁寿" forum; {} is filled with the paging offset.
        self.url = "http://tieba.baidu.com/f?kw=%E4%BB%81%E5%AF%BF&ie=utf-8&pn={}"
        self.txt = "./data.txt"
        self.fontPath = "fangsong.ttf"  # a Chinese font; download one if you don't have it

    def do(self):
        # Crawl the first 10 pages of thread titles (Tieba pages in steps of 50).
        for i in range(0, 10):
            url = self.url.format(i * 50)
            content = requests.get(url).text
            soup = BeautifulSoup(content, 'html.parser')
            div = soup.select_one('#thread_list')
            if div is None:  # skip pages where the thread list failed to load
                continue
            titleLinks = div.select('div.threadlist_title > a.j_th_tit')
            self.formData(titleLinks)

    def formData(self, data):
        # Append each thread title to the data file, one title per line.
        # UTF-8 avoids silently dropping characters that GBK cannot encode.
        with open(self.txt, 'a', encoding='utf-8', errors='ignore') as f:
            for a in data:
                f.write(a['title'] + '\n')

    def delStopWord(self, data):
        # Stop-word lists are easy to find online; save one word per line.
        stopWordPath = "stopword.txt"
        with open(stopWordPath, 'r', encoding='utf-8', errors='ignore') as r1:
            stopWordData = r1.read().split('\n')
        temp = []
        for dd in data:
            dd = list(dd)  # (word, weight) tuple -> list so the word can be edited
            if dd[0] == '龙滩':
                dd[0] = '黑龙滩'  # jieba truncates the place name, so merge it back
            if dd[0] not in stopWordData:
                temp.append(dd)
        return temp

    def listToDict(self, result):
        # Turn [[word, weight], ...] into {word: weight} for the word cloud.
        keywords = dict()
        for word, weight in result:
            keywords[word] = weight
        return keywords

    def analysis(self):
        with open(self.txt, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        # Extract the top 200 keywords with their TF-IDF weights.
        result = jieba.analyse.extract_tags(text, topK=200, withWeight=True)
        # Drop the stop words.
        result = self.delStopWord(result)
        # Convert the list into a dict.
        keywords = self.listToDict(result)
        # test.png defines the cloud's shape; any image found online works, or drop
        # the mask to use the default rectangle. font_path must be a Chinese font.
        graph = np.array(Image.open("test.png"))
        wc = WordCloud(font_path=self.fontPath, background_color='White',
                       max_words=200, mask=graph)
        wc.generate_from_frequencies(keywords)
        # Recolor the words from the mask image and display the result.
        image_color = ImageColorGenerator(graph)
        plt.figure()
        plt.imshow(wc.recolor(color_func=image_color))
        plt.axis("off")
        plt.show()
        # wc.to_file('dream.png')


if __name__ == '__main__':
    tieba = Tieba()
    # Crawl the data (run this once first, then comment it back out).
    # tieba.do()
    # Analyse the keywords.
    tieba.analysis()
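To use it, uncomment tieba.do() in the __main__ block and run it once to crawl the thread titles into data.txt, then run tieba.analysis() to build the word cloud. The script depends on requests, beautifulsoup4, jieba, wordcloud, Pillow, NumPy and Matplotlib, all installable with pip.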
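If you would rather not maintain the delStopWord filter by hand, jieba can also load a stop-word file itself via jieba.analyse.set_stop_words. A minimal sketch, assuming the same one-word-per-line stopword.txt and data.txt as above:

import jieba.analyse

# Register the stop-word file once; extract_tags then skips those words
# automatically, so no manual filtering pass is needed.
jieba.analyse.set_stop_words("stopword.txt")

with open("./data.txt", "r", encoding="utf-8", errors="ignore") as f:
    text = f.read()

# Top 200 keywords with TF-IDF weights, already free of stop words.
keywords = dict(jieba.analyse.extract_tags(text, topK=200, withWeight=True))

Note that merging '龙滩' into '黑龙滩' is a data fix rather than stop-word removal, so that step would still have to be applied to the resulting dict separately.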