Multiprocessing and multithreading are used in much the same way, but for a crawler multithreading feels like the better fit; multiprocessing shines where heavy CPU computation is involved. Below is a simple example.
import multiprocessing
from bs4 import BeautifulSoup
import requests

# Fetch the article links on one page
def pageUrls(url):
    web_data = requests.get(url).text
    soup = BeautifulSoup(web_data, 'lxml')
    linkList = soup.select('article > header > h2 > a')
    return linkList

def detailPage(myurl):
    linkList = pageUrls(myurl)
    data = []
    for link in linkList:
        temp = []
        temp.append(link['href'])
        temp.append(link.getText())
        # content = getDetails(temp[0])
        # temp.append(content)
        data.append(temp)
    return data

# Fetch the body of a single article
def getDetails(url):
    web_data = requests.get(url).text
    soup = BeautifulSoup(web_data, 'lxml')
    return soup.select('article > section')[0]

# Callback that handles each result
def doFrom(data):
    print(data)

def main(urls):
    # Size the pool to the number of CPU cores
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:
        # Submit the task asynchronously; an idle worker process picks it
        # up, and doFrom is called with the result when it completes
        pool.apply_async(detailPage, (url, ), callback=doFrom)
    pool.close()
    # close() must be called before join(), otherwise join() raises an
    # error; after close() no new tasks can be submitted to the pool, and
    # join() waits for all the worker processes to finish
    pool.join()

if __name__ == "__main__":
    urls = [
        'http://phpindex.win/?page=1',
        'http://phpindex.win/?page=2',
        'http://phpindex.win/?page=3',
        'http://phpindex.win/?page=4',
        'http://phpindex.win/?page=5',
        'http://phpindex.win/?page=6',
    ]
    main(urls)
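
Since the note at the top suggests threads are usually the better fit for I/O-bound crawling, here is a minimal sketch of the same pattern using multiprocessing.pool.ThreadPool, the thread-based counterpart to Pool with the same API. It reuses the detailPage and doFrom functions above; the pool size of 8 is an arbitrary choice for illustration, not something from the original example.

from multiprocessing.pool import ThreadPool

def main_threaded(urls):
    # Same interface as multiprocessing.Pool, but tasks run in threads
    # within a single process; the GIL is released while waiting on
    # network I/O, so this suits a crawler well
    pool = ThreadPool(8)  # pool size of 8 is an arbitrary assumption
    for url in urls:
        pool.apply_async(detailPage, (url, ), callback=doFrom)
    pool.close()
    pool.join()

Switching between the two is essentially a one-line change (Pool vs ThreadPool), which makes it easy to benchmark both against your own workload.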