澈-windows python3 selenium 谷歌无头浏览器爬取数据

2023年05月27日 14:06:17
windows python3 selenium 谷歌无头浏览器爬取数据

# The chromedriver version must match the installed Google Chrome version.
import requests
import pymysql
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import json

class Pc():
    def __init__(self):
        self.headers = {
            'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            'sec-ch-ua-mobile': '?0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        }



        self.host = "127.0.0.1"
        self.port = 3306
        self.user = "xxx"
        self.pwd = "xxx"
        self.db = "xxx"



    # 连接数据库
    def get_conn(self):
        conn = pymysql.connect(host=self.host, port=self.port, user=self.user, passwd=self.pwd, db=self.db)
        return conn

    # 批量插入数据
    def insertIntoMysqlMut(self, sql, data):

        conn = self.get_conn()

        try:
            with conn.cursor() as cursor:
                cursor.executemany(sql, data)
                conn.commit()
        except:
            raise
            conn.rollback()
        finally:
            conn.close()


    def update(self, sql):
        conn = self.get_conn()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql)
                conn.commit()
        except:
            raise
            conn.rollback()
        finally:
            conn.close()

    def getNum(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        try:
            sql = 'select num from soopat1 where nums = "" limit 100'
            count = cursor.execute(sql)

            result = cursor.fetchall()
            return result
            conn.commit()
        except:
            print("query ERROR.")
            conn.rollback()



    def getContent(self,text,num):

        allData = {
            "title":"",
            "inventor":"",
            "apply_person":"",
            "category_num":"",
            "nums":"",
            "legalArray":""
        }

        legalArray = []

        Soup = BeautifulSoup(text,"html.parser")
        try:
            allData["title"] = Soup.select('meta[name="DC.title"]')[0]["content"]
        except:
            pass

        try:
            invertors = Soup.select('meta[name="DC.contributor"]')
        except:
            pass

        try:
            category_nums = Soup.select("#classifications state-modifier #link")
        except:
            pass

        try:
            allData["nums"] = Soup.select('div[class="header style-scope application-timeline"]')[1].text
        except:
            pass

        try:
            legals = Soup.select('div[class="event layout horizontal style-scope application-timeline"]')
        except:
            pass

        for inver in invertors:
            if inver["scheme"] == "inventor":
                allData["inventor"] += inver["content"] + ","

            if inver["scheme"] == "assignee":
                allData["apply_person"] = inver["content"]


        for category in category_nums:
            tt = category.text
            if len(tt) < 7:
                continue
            allData["category_num"] += tt + ","


        for legal in legals:
            tempData = {
                "key":"",
                "value":""
            }
            try:
                tempData["key"] = legal.select("div")[0].text

                tempData["value"] = legal.select("div")[2].text
            except:
                pass
            legalArray.append(tempData)
        allData["legalArray"] = legalArray

        allData["nums"] = allData["nums"].split(" ")[1]

        return allData

        # sql = "insert into soopat1(name,num,page,keyword) values(%s,%s,%s,%s)"
        # self.insertIntoMysqlMut(sql,allData)

    def transferContent(self, content):
        if content is None:
            return None
        else:
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string

    def main(self):
        while 1:
            data = self.getNum()
            if len(data) == 0:
                break

            for dd in data:
                # time.sleep(5)
                keyword = "Armyris"
                num = dd[0]

                urlParams = num + "/?="+keyword+"&oq="+keyword

                chrome_options = webdriver.ChromeOptions()
                chrome_options.binary_location = r'D:\chrome\Chrome\App\chrome.exe'
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
                client = webdriver.Chrome(executable_path='C:/Users/readline/Desktop/xutao/python/chromedriver.exe',
                                          chrome_options=chrome_options)
                # 如果没有把chromedriver加入到PATH中,就需要指明路径 executable_path='/home/chromedriver'
                # client.get("https://patents.glgoo.top/patent/"+urlParams)
                client.get("https://patents.glgoo.top/patent/"+urlParams)
                # 获取页面所有内容
                content = client.page_source
                client.close()  # 关闭连接
                ddd = self.getContent(content,num)
                print(ddd)
                sql = "update xxx set new_name='{}',person='{}',apply_person='{}',category_num='{}',nums='{}',legal='{}'  where num = '{}'".format(
                    self.transferContent(ddd["title"]),self.transferContent(ddd["inventor"]),self.transferContent(ddd["apply_person"]),self.transferContent(ddd["category_num"]),self.transferContent(ddd["nums"]),self.transferContent(json.dumps(ddd["legalArray"])),num)
                print(sql)

                # 更新数据库
                self.update(sql)

        


if __name__ == '__main__':
    # Bind the instance to its own name so the class name Pc is not rebound.
    crawler = Pc()
    crawler.main()
一	二	三	四	五	六	日
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30