# The chromedriver version must match the installed Google Chrome version.
import requests
import pymysql
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import json
class Pc():
    """Scrape patent detail pages with headless Chrome and store parsed fields in MySQL.

    Pending patent numbers are read from table ``soopat1`` (rows whose ``nums``
    column is empty); each page is rendered through Selenium, parsed with
    BeautifulSoup, and the extracted fields are written back with an UPDATE.
    """

    # Location of the Chrome binary and of the chromedriver executable.
    CHROME_BINARY = r'D:\chrome\Chrome\App\chrome.exe'
    CHROMEDRIVER_PATH = 'C:/Users/readline/Desktop/xutao/python/chromedriver.exe'
    # Prefix that the patent number and query string are appended to.
    PATENT_URL_PREFIX = "https://patents.glgoo.top/patent/"

    # Translation table used by transferContent: prefix ", ' and \ with a backslash.
    _SQL_ESCAPES = str.maketrans({'"': '\\"', "'": "\\'", "\\": "\\\\"})

    def __init__(self):
        # Browser-like request headers (not referenced by the methods of this class).
        self.headers = {
            'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            'sec-ch-ua-mobile': '?0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        }
        # MySQL connection settings used by get_conn().
        self.host = "127.0.0.1"
        self.port = 3306
        self.user = "xxx"
        self.pwd = "xxx"
        self.db = "xxx"

    def get_conn(self):
        """Open and return a new MySQL connection built from the instance settings."""
        return pymysql.connect(host=self.host, port=self.port, user=self.user,
                               passwd=self.pwd, db=self.db)

    def insertIntoMysqlMut(self, sql, data):
        """Run ``sql`` once per parameter row in ``data`` and commit.

        On failure the transaction is rolled back and the exception is
        re-raised. The connection is always closed.
        """
        conn = self.get_conn()
        try:
            with conn.cursor() as cursor:
                cursor.executemany(sql, data)
            conn.commit()
        except Exception:
            # Roll back BEFORE re-raising; code placed after `raise` never runs.
            conn.rollback()
            raise
        finally:
            conn.close()

    def update(self, sql, params=None):
        """Execute a single statement and commit.

        Args:
            sql: Statement text. May contain ``%s`` placeholders when
                ``params`` is given.
            params: Optional sequence of values bound to the placeholders by
                the driver. ``None`` (the default) executes ``sql`` verbatim.

        On failure the transaction is rolled back and the exception is
        re-raised. The connection is always closed.
        """
        conn = self.get_conn()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()

    def getNum(self):
        """Return up to 100 rows ``(num,)`` from ``soopat1`` whose ``nums`` is empty.

        A failing query prints ``query ERROR.`` and yields an empty tuple, so
        callers can always take ``len()`` of / iterate over the result.
        """
        sql = 'select num from soopat1 where nums = "" limit 100'
        conn = self.get_conn()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql)
                return cursor.fetchall()
        except Exception:
            print("query ERROR.")
            return ()
        finally:
            # A read-only query needs no commit/rollback; just release the connection.
            conn.close()

    def getContent(self, text, num):
        """Parse one rendered patent page into a dict of fields.

        Args:
            text: HTML source of the page.
            num: Patent number of the page (accepted for interface
                compatibility; not used by the parser).

        Returns:
            dict with keys ``title``, ``inventor``, ``apply_person``,
            ``category_num``, ``nums`` (strings) and ``legalArray`` (list of
            ``{"key", "value"}`` dicts). Fields that cannot be found stay empty.
        """
        soup = BeautifulSoup(text, "html.parser")
        allData = {
            "title": "",
            "inventor": "",
            "apply_person": "",
            "category_num": "",
            "nums": "",
            "legalArray": "",
        }

        title_tags = soup.select('meta[name="DC.title"]')
        if title_tags:
            allData["title"] = title_tags[0].get("content", "")

        # DC.contributor metas carry both inventors and the assignee,
        # distinguished by their "scheme" attribute.
        inventor_names = []
        for meta in soup.select('meta[name="DC.contributor"]'):
            scheme = meta.get("scheme")
            if scheme == "inventor":
                inventor_names.append(meta.get("content", ""))
            elif scheme == "assignee":
                allData["apply_person"] = meta.get("content", "")
        # Every entry is followed by a comma (trailing comma included).
        allData["inventor"] = "".join(name + "," for name in inventor_names)

        # Classification link texts shorter than 7 characters are skipped.
        category_texts = (tag.text for tag in
                          soup.select("#classifications state-modifier #link"))
        allData["category_num"] = "".join(
            code + "," for code in category_texts if len(code) >= 7)

        # The second timeline header holds the number as its second
        # space-separated token; leave "nums" empty when that shape is absent.
        timeline_headers = soup.select(
            'div[class="header style-scope application-timeline"]')
        if len(timeline_headers) > 1:
            tokens = timeline_headers[1].text.split(" ")
            if len(tokens) > 1:
                allData["nums"] = tokens[1]

        legalArray = []
        for event in soup.select(
                'div[class="event layout horizontal style-scope application-timeline"]'):
            cells = event.select("div")
            entry = {"key": "", "value": ""}
            if cells:
                entry["key"] = cells[0].text
            if len(cells) > 2:
                entry["value"] = cells[2].text
            legalArray.append(entry)
        allData["legalArray"] = legalArray

        return allData

    def transferContent(self, content):
        """Return ``content`` with ``"``, ``'`` and ``\\`` each prefixed by a backslash.

        ``None`` is passed through unchanged.
        """
        if content is None:
            return None
        return content.translate(self._SQL_ESCAPES)

    def _fetch_page(self, url):
        """Load ``url`` in a fresh headless Chrome and return the page source."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.CHROME_BINARY
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # chromedriver is not assumed to be on PATH, so its location is passed explicitly.
        client = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH,
                                  chrome_options=chrome_options)
        try:
            client.get(url)
            return client.page_source
        finally:
            # quit() ends the whole driver session; runs even if get() fails.
            client.quit()

    def main(self, keyword="Armyris"):
        """Process pending patent numbers until none are left.

        Args:
            keyword: Search keyword appended to each patent URL's query string.

        Each number is attempted at most once per call: a row whose page could
        not be parsed keeps an empty ``nums`` and would otherwise be selected
        again by getNum() forever.
        """
        # Scraped text is bound as parameters, never formatted into the SQL string.
        sql = ("update xxx set new_name=%s,person=%s,apply_person=%s,"
               "category_num=%s,nums=%s,legal=%s where num = %s")
        attempted = set()
        while True:
            pending = [row[0] for row in self.getNum() if row[0] not in attempted]
            if not pending:
                break
            for num in pending:
                attempted.add(num)
                urlParams = num + "/?=" + keyword + "&oq=" + keyword
                content = self._fetch_page(self.PATENT_URL_PREFIX + urlParams)
                ddd = self.getContent(content, num)
                print(ddd)
                params = (
                    ddd["title"],
                    ddd["inventor"],
                    ddd["apply_person"],
                    ddd["category_num"],
                    ddd["nums"],
                    json.dumps(ddd["legalArray"]),
                    num,
                )
                print(sql, params)
                # Write the parsed fields back to the database.
                self.update(sql, params)
if __name__ == '__main__':
    # Bind the instance to its own name so the class `Pc` is not shadowed.
    crawler = Pc()
    crawler.main()