This article and the previous one demonstrate the two different ways of starting threads for a multithreaded crawler.
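For quick reference, here is a minimal sketch of the two usual ways to start a thread with the standard threading module (the names worker and MyThread are purely illustrative); the spider below uses the first approach, passing a target callable to threading.Thread.

import threading

# Approach 1: pass a target callable to Thread (the approach used by the spider below)
def worker(begin, finish):
    print('crawling', begin, finish)

t1 = threading.Thread(target=worker, args=(0, 10))
t1.start()

# Approach 2: subclass Thread and override run()
class MyThread(threading.Thread):
    def __init__(self, begin, finish):
        super().__init__()
        self.begin = begin
        self.finish = finish

    def run(self):
        print('crawling', self.begin, self.finish)

t2 = MyThread(11, 20)
t2.start()

t1.join()
t2.join()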
import requests
import threading
import re
import json
import queue
import time
import sys
import pymysql
class Spider():
    def __init__(self):
        self.header = {
            'Host': 'c.y.qq.com',
            'Referer': 'https://y.qq.com/portal/playlist.html',
            'Cookie': 'pgv_pvi=5542305792; RK=M7ODxkMrt7; tvfe_boss_uuid=f778171756061da6; cuid=2447167134; ptcz=4d215dcf0fe364a2c9af2872d38114ff36365671be1f10649bb3ea0f1c23b21a; pgv_pvid=4647908816; o_cookie=1430062586; pac_uid=1_1430062586; pt2gguin=o1430062586; ptui_loginuin=1430062586; sd_userid=60831516095629228; sd_cookie_crttime=1516095629228; pgv_info=ssid=s5788633701; pgv_si=s9584917504',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        }
        self.url = [
            # Playlist list for a category page; sin/ein give the paging range
            'https://c.y.qq.com/splcloud/fcgi-bin/fcg_get_diss_by_tag.fcg?picmid=1&rnd=0.7388335059695934&g_tk=5381&jsonpCallback=getPlaylist&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0&categoryId=10000000&sortId=5&sin={}&ein={}',
            # Songs contained in a single playlist, identified by disstid
            'https://c.y.qq.com/qzone/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid={}&format=jsonp&g_tk=5381&jsonpCallback=playlistinfoCallback&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0',
            # Song detail (vkey), needed to build the download URL
            'https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?g_tk=5381&jsonpCallback=MusicJsonCallback38981350824624017&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0&cid=205361747&callback=MusicJsonCallback38981350824624017&uin=0&songmid={}&filename={}&guid=5807293828'
        ]
        self.musicUrl = 'http://dl.stream.qqmusic.qq.com/{}?vkey={}&guid=5807293828&uin=0&fromtag=66'
    # Crawl one paging range: fetch the playlists, their songs, then the song details
    def doCurl(self, start, end):
        need = []
        url = self.url[0].format(start, end)
        typeList = self.getTypeList(url)
        need.append(typeList[0]['dissname'])
        musicList = self.getMusiList(typeList)
        self.getMusicDetail(musicList, need)
    # Get all playlists on the current page
    def getTypeList(self, url):
        content = self.getRequest(url, self.header)
        return self.loads_jsonp(content)['data']['list']
    # Get every song under the playlists on the current page and return them all in a list
    def getMusiList(self, typeList):
        data = []
        for dd in typeList:
            dissid = dd['dissid']
            url = self.url[1].format(dissid)
            content = self.loads_jsonp(self.getRequest(url, self.header))
            MusiList = content['cdlist'][0]['songlist']
            data.append(MusiList)
        return data
    # Get song details (all songs of all playlists on the current page in one pass)
    def getMusicDetail(self, MusicList, need):
        data = []
        for dd in MusicList:
            for dds in dd:
                temp = []
                try:
                    songName = dds['songname']
                    songmid = dds['songmid']
                except KeyError:
                    # Skip entries that are missing the required fields
                    continue
                filename = 'C400' + songmid + '.m4a'
                url = self.url[2].format(songmid, filename)
                detailsJson = self.loads_jsonp(self.getRequest(url, self.header))['data']['items'][0]
                vkey = detailsJson['vkey']
                musicUrl = self.musicUrl.format(filename, vkey)
                temp.append(need[0])
                temp.append(songName)
                temp.append(musicUrl)
                data.append(temp)
        self.insertData(data)
    # Wrapper around a GET request
    def getRequest(self, url, header):
        return requests.get(url, headers=header).text
    # Connect to the database (the spider database; replace the XXX placeholders with real credentials)
    def connectSpider(self):
        conn = pymysql.connect(host='XXX', user='XXX', passwd='XXX',
                               db='XXX', charset='utf8')
        cur = conn.cursor()
        return [conn, cur]
    # Insert the scraped rows into the database
    def insertData(self, data):
        conn, cur = self.connectSpider()
        sql = "insert into one_dov(classify,title,musicUrl) values(%s,%s,%s)"
        try:
            cur.executemany(sql, data)
            conn.commit()
        finally:
            # Always release the cursor and connection, even if the insert fails
            cur.close()
            conn.close()
    # Strip the JSONP wrapper and parse the enclosed JSON
    def loads_jsonp(self, _jsonp):
        try:
            return json.loads(re.match(".*?({.*}).*", _jsonp, re.S).group(1))
        except Exception:
            raise ValueError('Invalid Input')
if __name__ == '__main__':
    spider = Spider()
    # Total number of playlists to crawl
    all = 100
    # Paging step (playlists per page)
    offset = 10
    for i in range(int(all / offset)):
        start = i * offset
        end = start + 10
        if start != 0:
            start += 1
        thread1 = threading.Thread(target=spider.doCurl, args=(start, end))
        thread1.start()
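The loop above starts one thread per paging range and then falls through without waiting. If you want the main program to block until every page has been crawled, one option is to keep the Thread objects and join them; a minimal sketch of that variation, using the same paging logic:

if __name__ == '__main__':
    spider = Spider()
    all = 100
    offset = 10
    threads = []
    for i in range(int(all / offset)):
        start = i * offset
        end = start + 10
        if start != 0:
            start += 1
        t = threading.Thread(target=spider.doCurl, args=(start, end))
        t.start()
        threads.append(t)
    # Block until every worker thread has finished before the program exits
    for t in threads:
        t.join()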