Crawling Tieba data with Python, analysing the keywords, and generating a word cloud
The code is as follows:
import requests
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


class Tieba:
    def __init__(self):
        # List pages of the 仁寿 tieba; {} is filled with the pn (offset) parameter
        self.url = "http://tieba.baidu.com/f?kw=%E4%BB%81%E5%AF%BF&ie=utf-8&pn={}"
        self.txt = "./data.txt"
        self.fontPath = "fangsong.ttf"

    def do(self):
        # Crawl the first 10 list pages; pn advances in steps of 50 per page
        for i in range(0, 10):
            url = self.url.format(i * 50)
            content = requests.get(url).text
            soup = BeautifulSoup(content, 'html.parser')
            thread_list = soup.select_one('#thread_list')
            titles = thread_list.select('div.threadlist_title > a.j_th_tit')
            self.formData(titles)

    def formData(self, data):
        # Append one thread title per line
        with open(self.txt, 'a', encoding='utf-8') as f:
            for a in data:
                f.write(a['title'] + '\n')

    def delStopWord(self, data):
        # Stopword lists are easy to find online; the file holds one word per line
        stopWordPath = "stopword.txt"
        with open(stopWordPath, 'r', encoding='utf-8', errors='ignore') as r1:
            stopWordData = r1.read().split('\n')
        temp = []
        for dd in data:
            dd = list(dd)  # (word, weight) tuple -> mutable [word, weight] list
            if dd[0] == '龙滩':
                dd[0] = '黑龙滩'  # jieba splits the place name 黑龙滩; restore it
            if dd[0] not in stopWordData:
                temp.append(dd)
        return temp

    def listToDict(self, result):
        # Turn the [[word, weight], ...] list into {word: weight}
        keywords = dict()
        for i in result:
            keywords[i[0]] = i[1]
        return keywords

    def analysis(self):
        with open(self.txt, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        # Extract the top 200 keywords together with their TF-IDF weights
        result = jieba.analyse.extract_tags(text, topK=200, withWeight=True)
        # Drop the stopwords
        result = self.delStopWord(result)
        # Convert the list into the dict format WordCloud expects
        keywords = self.listToDict(result)
        # Word-cloud settings: test.png is the mask that shapes the cloud (any
        # picture found online will do, or drop the mask for the default shape);
        # font_path must point to a font that contains Chinese glyphs
        graph = np.array(Image.open("test.png"))
        wc = WordCloud(font_path=self.fontPath, background_color='white',
                       max_words=200, mask=graph)
        wc.generate_from_frequencies(keywords)
        # Recolor the words from the mask image and display the result
        image_color = ImageColorGenerator(graph)
        plt.figure()
        plt.imshow(wc.recolor(color_func=image_color))
        plt.axis("off")
        plt.show()
        # wc.to_file('dream.png')


if __name__ == '__main__':
    tieba = Tieba()
    # Step 1: crawl the data (uncomment on the first run to create data.txt)
    # tieba.do()
    # Step 2: analyse the keywords and draw the word cloud
    tieba.analysis()
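
A couple of notes on running it: do() has to run once first so that data.txt exists before analysis() is called, and three files are expected next to the script: stopword.txt (the stopword list), test.png (the mask image) and fangsong.ttf (a font with Chinese glyphs). stopword.txt is plain text with one word per line; the entries below are only illustrative:

的
了
是
一个
吧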
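
For reference, jieba.analyse.extract_tags with withWeight=True returns a list of (word, weight) tuples sorted by weight, which is why delStopWord and listToDict index into each item. A minimal standalone check (the sample sentence is made up):

import jieba.analyse

sample = "周末去黑龙滩看风景,黑龙滩的风景很美"
# Each item is a (keyword, TF-IDF weight) tuple, highest weight first
for word, weight in jieba.analyse.extract_tags(sample, topK=5, withWeight=True):
    print(word, round(weight, 4))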
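
One scraping caveat: Tieba list pages have at times served the thread list wrapped inside HTML comments, in which case select_one('#thread_list') comes back as None. A hedged workaround sketch, assuming the markup is merely comment-wrapped (fetch_thread_list is a hypothetical helper, not part of the original code):

import requests
from bs4 import BeautifulSoup

def fetch_thread_list(url):
    # Unhide markup that the page may wrap in <!-- --> comments, then parse
    html = requests.get(url).text.replace("<!--", "").replace("-->", "")
    return BeautifulSoup(html, "html.parser").select_one("#thread_list")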