import threading

import requests
from bs4 import BeautifulSoup


class Spider():

    def getType(self):
        proxies = {"http": "http://127.0.0.1:8888", "https": "http://10.10.1.10:1080"}
        content = requests.get('http://so.gushiwen.org/gushi/tangshi.aspx', proxies=proxies).text
        soup = BeautifulSoup(content, 'html.parser')
        # Collect the category links we need; the last link is not a category, so drop it
        dd = soup.select('.cont')[1].select('a')
        del dd[-1]
        data = []
        for link in dd:
            data.append(["http://so.gushiwen.org/" + link['href'], link.getText()])
        return data

    def run(self, data):
        self.data = data
        result = []
        for i in range(len(self.data)):
            # Only crawl the first three categories
            if i == 3:
                break
            url = self.data[i][0]
            nameList = self.getName(url)
            urlList = self.formData(nameList)
            if len(urlList) != 0:
                result.append(urlList)
        return result

    def getName(self, url):
        proxies = {"http": "http://127.0.0.1:8888", "https": "http://10.10.1.10:1080"}
        content = requests.get(url, proxies=proxies).text
        soup = BeautifulSoup(content, 'html.parser')
        return soup.select('.sons')[0].select('span a')

    def formData(self, data):
        # Turn the relative poem links into absolute URLs
        result = []
        for link in data:
            try:
                if 'so.gushiwen.org' not in link['href']:
                    result.append('http://so.gushiwen.org' + link['href'])
            except KeyError:
                continue
        return result


class Worker():

    def dothread(self, data):
        self.run(data)

    def run(self, data):
        urlList = data
        self.getContents(urlList)

    def getContents(self, urlList):
        content = []
        for url in urlList:
            try:
                content.append(self.doContent(url))
            except Exception:
                continue
        self.writeFile(content)

    def writeFile(self, content):
        with open('C:\\Users\\Administrator.HD-20151127IFSG\\Desktop\\text.txt', 'a', encoding='utf-8') as file_object:
            for string in content:
                file_object.write(string + '\n')

    def doContent(self, url):
        proxies = {"http": "http://127.0.0.1:8888", "https": "http://10.10.1.10:1080"}
        content = requests.get(url, proxies=proxies).text
        soup = BeautifulSoup(content, 'html.parser')
        title = soup.select('h1')[0].string
        zt = soup.select('.contson')[0].getText()
        return title + "||" + zt
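

# A minimal sketch, not part of the original script: if the caller should wait for
# every worker thread to finish writing before the program exits, keep the Thread
# objects and join them. `run_with_join` is a hypothetical helper name.
def run_with_join(data, worker):
    threads = [threading.Thread(target=worker.dothread, args=(dd,)) for dd in data]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until this category's poems have been written to the file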


if __name__ == '__main__':
    spider = Spider()
    data = spider.getType()
    data = spider.run(data)
    worker = Worker()
    # One thread per category's URL list
    for dd in data:
        thread1 = threading.Thread(target=worker.dothread, args=(dd,))
        thread1.start()