通过影片id获取豆瓣影片信息
通过豆瓣的影片id获取详情 演职人员 海报等信息接口
放到本地直接python3 运行
访问形式:http://127.0.0.1:8888/do_GET?douban_id=33404601
使用了代理ip和超时处理(timeout-decorator 模块)
#--coding:utf-8--
import io
import json
import random
import re
import shutil
import urllib
import urllib.parse
from http.server import HTTPServer, BaseHTTPRequestHandler

import requests
import timeout_decorator
from bs4 import BeautifulSoup
#跑列表
class MyHttpHandler(BaseHTTPRequestHandler):
    """HTTP handler that scrapes Douban movie pages for one subject id.

    A GET request whose query string carries ``douban_id=<int>`` is answered
    with a JSON document of the form ``{"msg", "status", "data"}``. On success
    ``status`` is 200 and ``data`` holds ``details``, ``celebrity`` and
    ``trailer``. On failure ``status`` is one of the negative codes listed in
    ``_ERROR_MESSAGES`` and ``data`` is an empty list.

    Internally the scraping methods signal failure by returning a negative
    integer; ``dealCode`` converts those integers into response dicts.
    """

    # Internal error code -> user-facing message (emitted verbatim in JSON).
    _ERROR_MESSAGES = {
        -1: "对应数据不存在",
        -2: "IP被提取光了",
        -3: "解析script失败",
        -4: "解析json失败",
        -5: "参数错误",
        -6: "请求超时请重试",
    }

    # Statuses that abort the whole request in main(). -5 (bad parameter) is
    # produced only by do_GET, so it is not part of this set.
    _FATAL_STATUSES = (-1, -2, -3, -4, -6)

    # Endpoint of the proxy provider used to obtain fresh proxy addresses.
    _PROXY_API_URL = "http://api.shenlongip.com/ip?key=XXXXXXX&pattern=json&count=1&need=1000&protocol=2&area=220500,220400,371200,371600,370900,440300,421100,420100,211200"

    # Captures the body of the first JSON-LD <script> element of a page.
    _LD_JSON_RE = re.compile(r'<script type="application/ld\+json">([\s\S]*?)</script>')

    def do_GET(self):
        """Handle a GET request and write the JSON result to the client.

        The response is always sent with HTTP status 200; the outcome is
        reported through the ``status`` field of the JSON body.
        """
        # Proxy state is (re)initialised on every request.
        self.proxies = {'https': 'https://180.109.127.173:51946'}
        self.proxiesAll = []

        _, has_query, raw_query = self.path.partition('?')
        params = {}
        if has_query:
            # Decoded query string, kept as an attribute as before.
            self.queryString = urllib.parse.unquote(raw_query)
            # parse_qs percent-decodes on its own, so it must be given the
            # raw query; decoding beforehand would decode values twice.
            params = urllib.parse.parse_qs(raw_query)

        try:
            douban_id = int(params["douban_id"][0])
        except (KeyError, IndexError, ValueError):
            # Missing or non-integer douban_id.
            r_str = self.dealCode(-5)
        else:
            try:
                r_str = self.main(douban_id)
            except timeout_decorator.TimeoutError:
                # The overall time budget of main() expired somewhere that
                # getHtmlResponse could not intercept (e.g. while parsing).
                r_str = self.dealCode(-6)

        # Not read anywhere in the visible code; retained for compatibility.
        self.time_out = 1

        body = json.dumps(r_str).encode()
        self.send_response(200)
        self.send_header('Content-type', 'application/json')
        self.send_header("Content-Length", str(len(body)))
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(body)

    def getIp(self, type):
        """Return a proxy address as ``"ip:port"``, fetching new ones if needed.

        Args:
            type: 1 returns a random proxy and keeps it in the pool; 2 returns
                a random proxy and removes it from the pool (used after a
                proxy failed). The name shadows the builtin but is kept
                because it is part of the method's signature.

        Returns:
            The proxy address, ``-2`` when the provider cannot supply any
            proxy, or ``None`` for an unknown ``type`` with a non-empty pool.
        """
        freshly_fetched = False
        if not self.proxiesAll:
            try:
                reply = requests.get(self._PROXY_API_URL, timeout=6).json()
            except (requests.RequestException, ValueError):
                # Provider unreachable or returned something that is not JSON.
                return -2
            if not isinstance(reply, dict) or reply.get("code") != 200 or not reply.get("data"):
                return -2
            try:
                fetched = [str(item["ip"]) + ":" + str(item["port"]) for item in reply["data"]]
            except (KeyError, TypeError):
                return -2
            self.proxiesAll.extend(fetched)
            freshly_fetched = True

        if type == 2:
            # Consume the proxy so a failing one is never handed out twice,
            # including when it has just been fetched.
            random.shuffle(self.proxiesAll)
            return self.proxiesAll.pop()
        if type == 1 or freshly_fetched:
            random.shuffle(self.proxiesAll)
            return self.proxiesAll[-1]
        return None

    def getDomData(self, soup, rule, value):
        """Return the first match of regex ``rule`` in ``str(soup)``.

        Args:
            soup: Parsed document or plain string to search.
            rule: Regular expression; the result has the shape produced by
                ``re.findall`` (the group text when there is one group).
            value: Fallback returned when nothing matches or ``rule`` is not
                a valid pattern.
        """
        try:
            matches = re.findall(rule, str(soup))
        except re.error:
            return value
        return matches[0] if matches else value

    def _rotateProxy(self):
        """Switch to another proxy; return False when none can be obtained."""
        proxiesIp = self.getIp(2)
        if proxiesIp == -2:
            return False
        self.proxies["https"] = "https://" + proxiesIp
        return True

    def getHtmlResponse(self, url):
        """Fetch ``url`` through the current proxy, rotating proxies on failure.

        The loop retries until a usable page is received, the proxy pool is
        exhausted, or the time budget of ``main`` runs out.

        Returns:
            The ``requests`` response on success, ``-1`` when the page says
            the entry does not exist, ``-2`` when no proxy is available, or
            ``-6`` when the overall timeout fired during the request.
        """
        headers = {
            'Accept-Encoding': '',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'https://movie.douban.com/tag/',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        while True:
            try:
                response = requests.get(url, headers=headers, proxies=self.proxies, timeout=6)
                page = response.text
            except Exception as exc:
                # The alarm raised by the timeout decorator can surface here;
                # report it right away instead of first asking for a proxy.
                if isinstance(exc, timeout_decorator.TimeoutError) or "Timed Out" in str(exc):
                    return -6
                if not self._rotateProxy():
                    return -2
                continue

            # Anti-crawler page: the current proxy is blocked, try another.
            if "测到有异常" in page or "navigator.platform,navigator.userAgent,navigator.vendor" in page:
                print(page)
                if not self._rotateProxy():
                    return -2
                continue

            if "页面不存在" in page or "条目不存在" in page:
                return -1
            return response

    def dealCode(self, dataDetails):
        """Translate an internal error code into a response dict.

        Args:
            dataDetails: Either a negative error code or an already built
                result (dict or list).

        Returns:
            ``{"msg", "status", "data": []}`` for a known error code,
            otherwise ``dataDetails`` unchanged.
        """
        if isinstance(dataDetails, int) and dataDetails in self._ERROR_MESSAGES:
            return {"msg": self._ERROR_MESSAGES[dataDetails], "status": dataDetails, "data": []}
        return dataDetails

    def _isFatal(self, result):
        """Return True when ``result`` is an error dict that ends the request."""
        return isinstance(result, dict) and result.get("status") in self._FATAL_STATUSES

    @timeout_decorator.timeout(15)
    def main(self, douban_id):
        """Collect details, cast/crew and trailer data for ``douban_id``.

        The whole call is limited to 15 seconds by the timeout decorator.

        Returns:
            A success dict with ``status`` 200, or the error dict of the
            first step that failed.
        """
        # Pick a proxy for this request; later failures rotate it.
        proxiesIp = self.getIp(1)
        if proxiesIp == -2:
            return self.dealCode(-2)
        self.proxies["https"] = "https://" + proxiesIp

        # Subject details.
        dataDetails = self.dealCode(self.dataDetails(douban_id))
        if self._isFatal(dataDetails):
            return dataDetails

        # Cast and crew. Each step checks its own result; the original code
        # re-checked the details result here, so failures went unnoticed.
        dataCelebrity = self.dealCode(self.dataCelebrity(douban_id))
        if self._isFatal(dataCelebrity):
            return dataCelebrity

        # Trailer.
        dataTrailer = self.dealCode(self.dataTrailer(dataDetails["trailer_url"]))
        if self._isFatal(dataTrailer):
            return dataTrailer

        data = {"details": dataDetails, "celebrity": dataCelebrity, "trailer": dataTrailer}
        return {"msg": "获取成功", "status": 200, "data": data}

    def dataDetails(self, douban_id):
        """Download and parse the subject page; return details or an error code."""
        url = 'https://movie.douban.com/subject/{}'.format(douban_id)
        response = self.getHtmlResponse(url)
        if isinstance(response, int):
            # -1, -2 or -6 from getHtmlResponse.
            return response
        return self.dealDataDetails(response)

    @staticmethod
    def _loadLdJson(raw):
        """Parse the text of a JSON-LD script element.

        Backslashes and line breaks are stripped first, as the pages are not
        always valid JSON; ``strict=False`` additionally tolerates remaining
        control characters inside strings.

        Raises:
            ValueError: If the cleaned text is still not valid JSON.
        """
        cleaned = raw.replace("\\", "").replace("\n", "").replace("\r", "")
        return json.loads(cleaned, strict=False)

    def dealDataDetails(self, response):
        """Extract the detail fields from a subject page response.

        Returns:
            A dict of detail fields, ``-3`` when the JSON-LD script element
            is missing, or ``-4`` when its content cannot be parsed into a
            JSON object.
        """
        Soup = BeautifulSoup(response.text, "html.parser")
        # Serialise once; every regex lookup below works on this string.
        markup = str(Soup)

        found = self._LD_JSON_RE.search(markup)
        if found is None:
            return -3
        try:
            jsonData = self._loadLdJson(found.group(1))
        except ValueError:
            return -4
        if not isinstance(jsonData, dict):
            return -4

        runtime_nodes = Soup.select('span[property="v:runtime"]')
        runtime = runtime_nodes[0].get("content") if runtime_nodes else 0

        # Keys are inserted in the same order as before so the emitted JSON
        # keeps its field order.
        details = {
            "on_year": 0,
            "on_date": "",
            "name_en": "",
            "name_alias": "",
            "subject_id": 0,
        }
        name = jsonData.get("name", "")
        details["name_alias"] = self.getDomData(markup, r'又名:</span> (.*?)<', "").replace("'", "")
        details["country"] = self.getDomData(markup, r'地区:</span> (.*?)<', "")
        details["language"] = self.getDomData(markup, r'语言:</span> (.*?)<', "")
        details["imdbid"] = self.getDomData(markup, r'IMDb链接:</span> <a href="(.*?)"', "").split("/")[-1]
        details["tags"] = [tag.get_text() for tag in Soup.select('span[property="v:genre"]')]
        details["on_date"] = jsonData.get("datePublished", "")
        details["name_zh"] = name
        details["on_year"] = details["on_date"].split("-")[0]
        details["name_en"] = name.replace("'", "")

        # The url looks like "/subject/<id>/"; fall back to 0 if it does not.
        url_parts = jsonData.get("url", "").split("/")
        details["subject_id"] = url_parts[2] if len(url_parts) > 2 else 0

        # Subjects without a rating get 0.
        rating = jsonData.get("aggregateRating")
        rank = rating.get("ratingValue", 0) if isinstance(rating, dict) else 0
        details["rank"] = 0 if rank == "" else rank

        details["min"] = runtime
        details["lcover"] = jsonData.get("image", "")
        details["summary"] = jsonData.get("description", "").replace("'", "")

        trailer_links = Soup.select(".related-pic-video")
        details["trailer_url"] = (trailer_links[0].get("href") or "") if trailer_links else ""

        # When the JSON carries no year, take it from the HTML instead.
        if details["on_year"] == "":
            release_nodes = Soup.select('span[property="v:initialReleaseDate"]')
            details["on_year"] = release_nodes[0].get_text()[0:4] if release_nodes else 0

        details["stills"] = self.getStills(Soup)
        details["type"] = jsonData.get("@type", "")
        return details

    def getStills(self, Soup):
        """Return the ``src`` URLs of the still images in ``#related-pic``.

        Images without a ``src`` are skipped; an empty list is returned when
        there are none.
        """
        return [img.get("src") for img in Soup.select("#related-pic img") if img.get("src")]

    def dataCelebrity(self, douban_id):
        """Download and parse the cast/crew page.

        Returns:
            The parsed data, ``[]`` when the page does not exist, or the
            error code ``-2`` / ``-6``.
        """
        url = 'https://movie.douban.com/subject/{}/celebrities'.format(douban_id)
        response = self.getHtmlResponse(url)
        if response == -1:
            return []
        if isinstance(response, int):
            return response
        return self.dealDataCelebrity(response)

    def dealDataCelebrity(self, response):
        """Extract directors, actors and writers from a cast/crew page.

        Returns:
            A dict with the keys ``direcotr`` (spelling kept because it is
            part of the emitted JSON), ``actor`` and ``writer``, or ``[]``
            when the page has no cast sections.
        """
        Soup = BeautifulSoup(response.text, "html.parser")
        sections = Soup.select("#celebrities > .list-wrapper")
        if not sections:
            return []
        return {
            "direcotr": self.getCelebrity(sections, "导演"),
            "actor": self.getCelebrity(sections, "演员"),
            "writer": self.getCelebrity(sections, "编剧"),
        }

    def getCelebrity(self, lists, category):
        """Return the people listed under the section whose heading has ``category``.

        Args:
            lists: Section elements of the cast/crew page.
            category: Text looked for in each section's ``h2`` heading.

        Returns:
            A list of ``{"category", "name", "avatar"}`` dicts. The list is
            empty when no section matches or when a section without a
            heading is reached before a match.
        """
        section = None
        for candidate in lists:
            headings = candidate.select("h2")
            if not headings:
                # A section without a heading means "no data".
                return []
            if category in headings[0].get_text():
                section = candidate
                break
        if section is None:
            return []

        data = []
        for celebrity in section.select("li.celebrity"):
            try:
                name = celebrity.select("a")[0]["title"]
            except (IndexError, KeyError):
                # Entries without a name are skipped.
                continue
            try:
                style = celebrity.select("div.avatar")[0]["style"]
                avatar = re.findall(r"url\((.*?)\)", style)[0]
            except (IndexError, KeyError):
                avatar = ""
            data.append({"category": category, "name": name, "avatar": avatar})
        return data

    def dataTrailer(self, url):
        """Download and parse the trailer page at ``url``.

        Returns:
            ``{"mp4_url", "lcover"}`` (both empty when there is no trailer
            url), ``[]`` when the page does not exist or cannot be parsed,
            or the error code ``-2`` / ``-6``.
        """
        if not url:
            return {"mp4_url": "", "lcover": ""}
        response = self.getHtmlResponse(url)
        if response == -1:
            return []
        if isinstance(response, int):
            return response
        return self.dealTrailer(response)

    def dealTrailer(self, response):
        """Extract the video URL and cover image from a trailer page.

        Returns:
            ``{"mp4_url", "lcover"}``, or ``[]`` when the JSON-LD data is
            missing or cannot be parsed into a JSON object.
        """
        Soup = BeautifulSoup(response.text, "html.parser")
        found = self._LD_JSON_RE.search(str(Soup))
        if found is None:
            return []
        try:
            jsonData = self._loadLdJson(found.group(1))
        except ValueError:
            return []
        if not isinstance(jsonData, dict):
            return []
        return {
            "mp4_url": jsonData.get("embedUrl", ""),
            "lcover": jsonData.get("thumbnailUrl", ""),
        }
if __name__ == '__main__':
    # Serve on all interfaces, port 8888. Using the server as a context
    # manager closes the listening socket when serve_forever() exits,
    # including on Ctrl+C.
    with HTTPServer(('', 8888), MyHttpHandler) as server:
        print("Starting server, listen at: 8888")
        server.serve_forever()
# baidu.newToOld(4)
# 返回数据格式:
{
"msg": "获取成功",
"status": 200,
"data": {
"details": {
"on_year": "2020",
"on_date": "2020-11-11",
"name_en": "一日成交",
"name_alias": "A Day Deal",
"subject_id": "33404601",
"country": "中国大陆",
"language": "汉语普通话",
"imdbid": "",
"tags": [
"剧情"
],
"name_zh": "一日成交",
"rank": 0,
"min": "93",
"lcover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2623292918.webp",
"summary": "一天之内一群人围绕古玩发生的一连串啼笑皆非的故事。",
"trailer_url": "",
"stills": [
"https://img9.doubanio.com/view/photo/sqxs/public/p2623323446.webp",
"https://img3.doubanio.com/view/photo/sqxs/public/p2624362500.webp",
"https://img3.doubanio.com/view/photo/sqxs/public/p2625179011.webp",
"https://img3.doubanio.com/view/photo/sqxs/public/p2625179010.webp",
"https://img1.doubanio.com/view/photo/sqxs/public/p2625179009.webp"
],
"type": "Movie"
},
"celebrity": {
"direcotr": [
{
"category": "导演",
"name": "王挺 Ting Wang",
"avatar": "https://img3.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24410.webp"
}
],
"actor": [
{
"category": "演员",
"name": "刘小宁 Xiaoning Liu",
"avatar": "https://img1.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24447.webp"
},
{
"category": "演员",
"name": "郎月婷 Yueting Lang",
"avatar": "https://img1.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1372053646.57.webp"
},
{
"category": "演员",
"name": "王挺 Ting Wang",
"avatar": "https://img3.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24410.webp"
},
{
"category": "演员",
"name": "钱冬旎 Dongni Qian",
"avatar": "https://img2.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1523418793.43.webp"
},
{
"category": "演员",
"name": "倪大红 Dahong Ni",
"avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1368597792.05.webp"
},
{
"category": "演员",
"name": "温峥嵘 Zhengrong Wen",
"avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p7935.webp"
},
{
"category": "演员",
"name": "许文广 Wenguang Xu",
"avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1452816153.34.webp"
},
{
"category": "演员",
"name": "曹操 Jonathan Kos-Read",
"avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1381557337.34.webp"
}
],
"writer": [
{
"category": "编剧",
"name": "王挺 Ting Wang",
"avatar": "https://img3.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24410.webp"
}
]
},
"trailer": {
"mp4_url": "",
"lcover": ""
}
}
}