通过影片id获取豆瓣影片信息
通过豆瓣的影片id获取详情 演职人员 海报等信息接口
放到本地直接python3 运行
访问形式:http://127.0.0.1:8888/do_GET?douban_id=33404601
使用了代理ip和超时处理(timeout-decorator 模块)
# -*- coding: utf-8 -*-
# Small HTTP service: GET /...?douban_id=<id> scrapes the Douban movie page,
# its cast/crew page and its trailer page, and answers with one JSON document.
from http.server import HTTPServer, BaseHTTPRequestHandler
import io
import json
import random
import re
import shutil
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup
import timeout_decorator

# Endpoint of the proxy provider; every call asks for one proxy ("count=1").
_PROXY_API_URL = "http://api.shenlongip.com/ip?key=XXXXXXX&pattern=json&count=1&need=1000&protocol=2&area=220500,220400,371200,371600,370900,440300,421100,420100,211200"

# Headers sent with every request to Douban.
_REQUEST_HEADERS = {
    'Accept-Encoding': '',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'https://movie.douban.com/tag/',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}

# Internal error code -> message returned to the client (see dealCode).
_ERROR_MESSAGES = {
    -1: "对应数据不存在",
    -2: "IP被提取光了",
    -3: "解析script失败",
    -4: "解析json失败",
    -5: "参数错误",
    -6: "请求超时请重试",
}

# Statuses that make main() stop and return the error payload immediately.
_FATAL_STATUSES = (-1, -2, -3, -4, -6)

# Captures the body of the page's JSON-LD <script> element.
_LD_JSON_RE = re.compile(r'<script type="application/ld\+json">([\s\S]*?)</script>')


class MyHttpHandler(BaseHTTPRequestHandler):
    """Request handler that turns a ``douban_id`` query into scraped JSON."""

    def do_GET(self):
        """Handle a GET request and always answer 200 with a JSON payload.

        The payload is either the success document built by ``main`` or one
        of the error documents produced by ``dealCode``.
        """
        self.proxies = {'https': 'https://180.109.127.173:51946'}
        self.proxiesAll = []

        # Parse the query string, if any.
        params = {}
        if '?' in self.path:
            self.queryString = urllib.parse.unquote(self.path.split('?', 1)[1])
            params = urllib.parse.parse_qs(self.queryString)

        try:
            douban_id = int(params["douban_id"][0])
        except (KeyError, IndexError, ValueError):
            # Missing or non-numeric douban_id.
            r_str = self.dealCode(-5)
        else:
            try:
                r_str = self.main(douban_id)
            except timeout_decorator.TimeoutError:
                # The 15 s budget of main() expired somewhere that did not
                # translate it into -6 itself; still answer the client.
                r_str = self.dealCode(-6)

        # NOTE(review): this attribute is not read anywhere in this file.
        self.time_out = 1

        body = json.dumps(r_str).encode()
        self.send_response(200)
        self.send_header('Content-type', 'application/json')
        self.send_header("Content-Length", str(len(body)))
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(body)

    def getIp(self, type):
        """Return a proxy address as ``"ip:port"``, or -2 when none is available.

        When the local pool is empty it is refilled from the proxy API and the
        last fetched entry is returned without being removed. Otherwise the
        pool is shuffled and ``type == 2`` removes and returns an entry, while
        any other value returns an entry and leaves it in the pool.
        """
        if not self.proxiesAll:
            try:
                r = requests.get(_PROXY_API_URL, timeout=6).json()
            except (requests.RequestException, ValueError):
                # Provider unreachable or returned non-JSON: treat as "no proxy".
                return -2
            if r.get("code") != 200 or not r.get("data"):
                return -2
            for dd in r["data"]:
                self.proxiesAll.append(str(dd["ip"]) + ":" + str(dd["port"]))
            return self.proxiesAll[-1]

        random.shuffle(self.proxiesAll)
        if type == 2:
            return self.proxiesAll.pop()
        return self.proxiesAll[-1]

    def getDomData(self, soup, rule, value):
        """Return the first match of regex ``rule`` in ``str(soup)``, else ``value``."""
        found = re.findall(rule, str(soup))
        return found[0] if found else value

    def _rotate_proxy(self):
        """Point ``self.proxies`` at another proxy; return False if none is left."""
        proxiesIp = self.getIp(2)
        if proxiesIp == -2:
            return False
        self.proxies["https"] = "https://" + proxiesIp
        return True

    def getHtmlResponse(self, url):
        """Fetch ``url`` through the current proxy, rotating proxies on failure.

        Returns the ``requests`` response on success, or an int code:
        -1 when the page says the entry does not exist, -2 when no proxy is
        left, -6 when the failure message contains "Timed Out".

        The loop has no retry limit of its own; it relies on running inside
        ``main``, which is wrapped in a 15 second timeout.
        """
        while True:
            try:
                response = requests.get(url, headers=_REQUEST_HEADERS,
                                        proxies=self.proxies, timeout=6)
            except Exception as ret:
                # Check for the overall timeout first so that no further
                # proxy is requested once the time budget is already spent.
                if "Timed Out" in str(ret):
                    return -6
                if not self._rotate_proxy():
                    return -2
                continue

            text = response.text
            if "测到有异常" in text or "navigator.platform,navigator.userAgent,navigator.vendor" in text:
                # Douban served its anti-crawler page: retry with another proxy.
                print("Douban anti-crawler page detected; rotating proxy")
                if not self._rotate_proxy():
                    return -2
                continue
            if "页面不存在" in text or "条目不存在" in text:
                return -1
            return response

    def dealCode(self, dataDetails):
        """Map an int error code to its error payload; pass anything else through."""
        if isinstance(dataDetails, int) and dataDetails in _ERROR_MESSAGES:
            return {"msg": _ERROR_MESSAGES[dataDetails], "status": dataDetails, "data": []}
        return dataDetails

    def _is_fatal(self, result):
        """Return True when ``result`` is an error payload that must abort main()."""
        return isinstance(result, dict) and result.get("status") in _FATAL_STATUSES

    @timeout_decorator.timeout(15)
    def main(self, douban_id):
        """Collect details, cast/crew and trailer data for ``douban_id``.

        Returns the success document, or the first error payload produced by
        any of the three steps.
        """
        # Pick a proxy for this request; later failures rotate to new ones.
        proxiesIp = self.getIp(1)
        if proxiesIp == -2:
            return self.dealCode(-2)
        self.proxies["https"] = "https://" + proxiesIp

        # Movie details.
        dataDetails = self.dealCode(self.dataDetails(douban_id))
        if self._is_fatal(dataDetails):
            return dataDetails

        # Cast and crew. The result may be a list, so the check is type-safe.
        dataCelebrity = self.dealCode(self.dataCelebrity(douban_id))
        if self._is_fatal(dataCelebrity):
            return dataCelebrity

        # Trailer.
        dataTrailer = self.dealCode(self.dataTrailer(dataDetails["trailer_url"]))
        if self._is_fatal(dataTrailer):
            return dataTrailer

        data = {"details": dataDetails, "celebrity": dataCelebrity, "trailer": dataTrailer}
        return {"msg": "获取成功", "status": 200, "data": data}

    # ---- movie details ----------------------------------------------------

    def dataDetails(self, douban_id):
        """Fetch the subject page and return its parsed details or an int code."""
        url = 'https://movie.douban.com/subject/{}'.format(douban_id)
        response = self.getHtmlResponse(url)
        if isinstance(response, int):
            # -1, -2 or -6 from getHtmlResponse.
            return response
        return self.dealDataDetails(response)

    def _clean_ld_json(self, raw):
        """Strip backslashes and line breaks from JSON-LD text before parsing."""
        # NOTE(review): as written the last replace swaps a space for a space;
        # the original probably targeted another whitespace character — confirm.
        return raw.replace("\\", "").replace("\n", "").replace("\r", "").replace(" ", " ")

    def dealDataDetails(self, response):
        """Parse the subject page into the ``details`` dict.

        Returns -3 when the page has no JSON-LD script and -4 when that
        script cannot be parsed into a JSON object.
        """
        Soup = BeautifulSoup(response.text, "html.parser")
        details = {
            "on_year": 0,
            "on_date": "",
            "name_en": "",
            "name_alias": "",
            "subject_id": 0,
        }

        found = _LD_JSON_RE.search(str(Soup))
        if found is None:
            return -3

        runtime_nodes = Soup.select('span[property="v:runtime"]')
        runtime = runtime_nodes[0].get("content") if runtime_nodes else 0

        try:
            jsonData = json.loads(self._clean_ld_json(found.group(1)))
        except ValueError:
            return -4
        if not isinstance(jsonData, dict):
            return -4

        details["name_alias"] = self.getDomData(Soup, r'又名:</span> (.*?)<', "").replace("'", "")
        details["country"] = self.getDomData(Soup, r'地区:</span> (.*?)<', "")
        details["language"] = self.getDomData(Soup, r'语言:</span> (.*?)<', "")
        details["imdbid"] = self.getDomData(Soup, r'IMDb链接:</span> <a href="(.*?)"', "").split("/")[-1]
        # Genre labels as plain text.
        details["tags"] = [tag.get_text() for tag in Soup.select('span[property="v:genre"]')]
        details["on_date"] = jsonData.get("datePublished", "")
        details["name_zh"] = jsonData.get("name", "")
        details["on_year"] = details["on_date"].split("-")[0]
        details["name_en"] = jsonData["name"].replace("'", "") if "name" in jsonData else ""
        details["subject_id"] = jsonData["url"].split("/")[2] if "url" in jsonData else 0
        details["rank"] = jsonData["aggregateRating"]["ratingValue"] if "aggregateRating" in jsonData else 0
        details["min"] = runtime
        details["lcover"] = jsonData.get("image", "")
        details["summary"] = jsonData["description"].replace("'", "") if "description" in jsonData else ""

        trailer_links = Soup.select(".related-pic-video")
        # Fall back to "" so dataTrailer never receives None as a URL.
        details["trailer_url"] = (trailer_links[0].get("href") or "") if trailer_links else ""

        # Entries without a rating get 0.
        if details["rank"] == "":
            details["rank"] = 0

        # If the JSON-LD carries no year, take it from the HTML instead.
        if details["on_year"] == "":
            release = Soup.select('span[property="v:initialReleaseDate"]')
            details["on_year"] = release[0].get_text()[0:4] if release else 0

        details["stills"] = self.getStills(Soup)
        details["type"] = jsonData.get("@type", "")
        return details

    def getStills(self, Soup):
        """Return the ``src`` of every image under ``#related-pic`` (may be empty)."""
        return [img.get("src") for img in Soup.select("#related-pic img") if img.get("src")]

    # ---- cast and crew ----------------------------------------------------

    def dataCelebrity(self, douban_id):
        """Fetch the celebrities page; return parsed data, ``[]`` or an int code."""
        url = 'https://movie.douban.com/subject/{}/celebrities'.format(douban_id)
        response = self.getHtmlResponse(url)
        if response == -1:
            return []
        if isinstance(response, int):
            # -2 or -6 from getHtmlResponse.
            return response
        return self.dealDataCelebrity(response)

    def dealDataCelebrity(self, response):
        """Parse directors, actors and writers; return ``[]`` if the page has none."""
        Soup = BeautifulSoup(response.text, "html.parser")
        wrappers = Soup.select("#celebrities > .list-wrapper")
        if not wrappers:
            return []
        direcotr = self.getCelebrity(wrappers, "导演")
        actor = self.getCelebrity(wrappers, "演员")
        writer = self.getCelebrity(wrappers, "编剧")
        # The misspelled key is part of the published response format.
        return {"direcotr": direcotr, "actor": actor, "writer": writer}

    def getCelebrity(self, lists, category):
        """Return the people listed under the section whose heading contains ``category``.

        Each entry is ``{"category", "name", "avatar"}``. Entries without a
        ``title`` on their first link are skipped. A section without an
        ``h2`` heading encountered before a match ends the search with ``[]``.
        """
        section = None
        for wrapper in lists:
            headings = wrapper.select("h2")
            if not headings:
                return []
            if category in headings[0].get_text():
                section = wrapper
                break
        if section is None:
            return []

        data = []
        for celebrity in section.select("li.celebrity"):
            links = celebrity.select("a")
            name = links[0].get("title") if links else None
            if name is None:
                continue

            avatar = ""
            avatar_nodes = celebrity.select("div.avatar")
            if avatar_nodes:
                # The picture URL lives in the inline style: url(...).
                urls = re.findall(r"url\((.*?)\)", avatar_nodes[0].get("style", ""))
                if urls:
                    avatar = urls[0]

            data.append({"category": category, "name": name, "avatar": avatar})
        return data

    # ---- trailer ----------------------------------------------------------

    def dataTrailer(self, url):
        """Fetch the trailer page at ``url``; return parsed data, ``[]`` or an int code."""
        if not url:
            return {"mp4_url": "", "lcover": ""}
        response = self.getHtmlResponse(url)
        if response == -1:
            return []
        if isinstance(response, int):
            # -2 or -6 from getHtmlResponse.
            return response
        return self.dealTrailer(response)

    def dealTrailer(self, response):
        """Extract ``mp4_url`` and ``lcover`` from the trailer page's JSON-LD.

        Returns ``[]`` when the script is missing or cannot be parsed.
        """
        Soup = BeautifulSoup(response.text, "html.parser")
        found = _LD_JSON_RE.search(str(Soup))
        if found is None:
            return []
        try:
            jsonData = json.loads(self._clean_ld_json(found.group(1)))
        except ValueError:
            return []
        if not isinstance(jsonData, dict):
            return []
        return {
            "mp4_url": jsonData.get("embedUrl", ""),
            "lcover": jsonData.get("thumbnailUrl", ""),
        }


if __name__ == '__main__':
    server = HTTPServer(('', 8888), MyHttpHandler)
    print("Starting server, listen at: 8888")
    server.serve_forever()
返回数据格式
{ "msg": "获取成功", "status": 200, "data": { "details": { "on_year": "2020", "on_date": "2020-11-11", "name_en": "一日成交", "name_alias": "A Day Deal", "subject_id": "33404601", "country": "中国大陆", "language": "汉语普通话", "imdbid": "", "tags": [ "剧情" ], "name_zh": "一日成交", "rank": 0, "min": "93", "lcover": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2623292918.webp", "summary": "一天之内一群人围绕古玩发生的一连串啼笑皆非的故事。", "trailer_url": "", "stills": [ "https://img9.doubanio.com/view/photo/sqxs/public/p2623323446.webp", "https://img3.doubanio.com/view/photo/sqxs/public/p2624362500.webp", "https://img3.doubanio.com/view/photo/sqxs/public/p2625179011.webp", "https://img3.doubanio.com/view/photo/sqxs/public/p2625179010.webp", "https://img1.doubanio.com/view/photo/sqxs/public/p2625179009.webp" ], "type": "Movie" }, "celebrity": { "direcotr": [ { "category": "导演", "name": "王挺 Ting Wang", "avatar": "https://img3.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24410.webp" } ], "actor": [ { "category": "演员", "name": "刘小宁 Xiaoning Liu", "avatar": "https://img1.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24447.webp" }, { "category": "演员", "name": "郎月婷 Yueting Lang", "avatar": "https://img1.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1372053646.57.webp" }, { "category": "演员", "name": "王挺 Ting Wang", "avatar": "https://img3.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24410.webp" }, { "category": "演员", "name": "钱冬旎 Dongni Qian", "avatar": "https://img2.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1523418793.43.webp" }, { "category": "演员", "name": "倪大红 Dahong Ni", "avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1368597792.05.webp" }, { "category": "演员", "name": "温峥嵘 Zhengrong Wen", "avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p7935.webp" }, { "category": "演员", "name": "许文广 Wenguang Xu", "avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1452816153.34.webp" 
}, { "category": "演员", "name": "曹操 Jonathan Kos-Read", "avatar": "https://img9.doubanio.com/view/celebrity/s_ratio_celebrity/public/p1381557337.34.webp" } ], "writer": [ { "category": "编剧", "name": "王挺 Ting Wang", "avatar": "https://img3.doubanio.com/view/celebrity/s_ratio_celebrity/public/p24410.webp" } ] }, "trailer": { "mp4_url": "", "lcover": "" } } }