golang利用chromedp 模拟浏览器爬取数据
里面包含了 chromedp 常用的功能：
通过等待元素、在输入框中输入值、点击元素等操作进入指定页面，然后等待并获取当前页面的全部 HTML 代码，再利用 goquery 解析数据。
package main
import (
"context"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
"regexp"
"spider/service"
"strings"
"sync"
"time"
)
// mut is locked while each scraped SVG element is processed, so the colour
// extraction and the AddData call for one element run under the lock.
var mut sync.Mutex
// main scrapes logo colour schemes from mybrandnewlogo by driving a visible
// (non-headless) Chrome instance through chromedp. The whole run is bounded
// by a 600 second timeout.
func main() {
	options := []chromedp.ExecAllocatorOption{
		chromedp.Flag("headless", false),
		// The Blink setting is spelled "imagesEnabled"; a misspelled key is
		// ignored by the browser and images would still be loaded.
		chromedp.Flag("blink-settings", "imagesEnabled=false"),
		chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko)`),
	}

	// Keep the allocator's cancel function so the browser process and its
	// resources are released when main returns.
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), options...)
	defer cancelAlloc()

	// Create the chrome instance.
	browserCtx, cancelBrowser := chromedp.NewContext(allocCtx)
	defer cancelBrowser()

	// Set the overall deadline for the scrape.
	runCtx, cancelRun := context.WithTimeout(browserCtx, 600*time.Second)
	defer cancelRun()

	// htmlContent is handed to the task builder by value; the task uses its
	// own copy as scratch storage for the page HTML.
	var htmlContent string

	// Navigate through the logo maker and parse the resulting page.
	if err := chromedp.Run(runCtx, mybrandnewlogoTask(htmlContent)); err != nil {
		// Report the failure instead of exiting silently.
		fmt.Println("chromedp run failed:", err)
		return
	}
	fmt.Println("---------------------over-----------------------")
}
// mybrandnewlogoTask builds the chromedp task list that walks through the
// mybrandnewlogo logo-maker wizard (company name, slogan, industry, keywords,
// submit), waits for the generated logos, captures the page body HTML and
// hands it to storeLogoColors.
//
// htmlContent is received by value and only serves as scratch storage for the
// captured HTML; the caller's variable is not updated.
func mybrandnewlogoTask(htmlContent string) chromedp.Tasks {
	const (
		companyNextBtn  = `header > div.container > div > form > div > div > div:nth-child(1) > div.b-user-input__action > button.c-button.shine.js-next-screen`
		sloganNextBtn   = `header > div.container > div > form > div > div > div:nth-child(3) > div.b-user-input__actions > button.c-button.js-next-screen`
		industryNextBtn = `body > div.js-scroll.l-main > header > div.container > div > form > div > div > div.b-user-input__screen.js-screen.js-screen-industry > div.b-user-input__actions > button.c-button.js-next-screen`
		keywordsNextBtn = `body > div.js-scroll.l-main > header > div.container > div > form > div > div > div.b-user-input__screen.js-screen.js-screen-keywords > div.b-user-input__actions > button.c-button.js-next-screen`
		submitBtn       = `body > div.js-scroll.l-main > header > div.container > div > form > div > div > div:nth-child(6) > div.b-user-input__actions > button.c-button.js-submit`
		lastLogoSVG     = `.b-logo-wrap:nth-child(20) > div > div:nth-child(3) > svg`
	)

	return chromedp.Tasks{
		// Open the logo maker.
		chromedp.Navigate(`https://mybrandnewlogo.com/logo-maker`),

		// Company name screen: wait for the input, type the name, continue.
		chromedp.WaitVisible(`#name`),
		chromedp.SendKeys(`#name`, "goland", chromedp.ByID),
		chromedp.Click(companyNextBtn),
		chromedp.Sleep(2 * time.Second),

		// Slogan screen: wait for the input, type the slogan, continue.
		chromedp.WaitVisible(`#slogan`),
		chromedp.SendKeys(`#slogan`, "goland slogan", chromedp.ByID),
		chromedp.Click(sloganNextBtn),
		chromedp.Sleep(1 * time.Second),

		// Industry screen: nothing is entered, just continue.
		chromedp.WaitVisible(`#industry`),
		chromedp.Click(industryNextBtn),
		chromedp.Sleep(2 * time.Second),

		// Keywords screen: nothing is entered, just continue.
		chromedp.WaitVisible(keywordsNextBtn),
		chromedp.Click(keywordsNextBtn),
		chromedp.Sleep(2 * time.Second),

		// Final screen: submit the form.
		chromedp.WaitVisible(submitBtn),
		chromedp.Sleep(2 * time.Second),
		chromedp.Click(submitBtn),
		chromedp.Sleep(2 * time.Second),

		// Wait until the SVG inside the 20th logo wrapper is visible.
		chromedp.WaitVisible(lastLogoSVG),
		chromedp.Sleep(2 * time.Second),

		// Grab the whole body HTML, then parse it.
		chromedp.OuterHTML(`document.querySelector("body")`, &htmlContent, chromedp.ByJSPath),
		chromedp.ActionFunc(func(ctx context.Context) error {
			return storeLogoColors(htmlContent)
		}),

		// Short trailing pause. The previous 2000 second sleep exceeded the
		// caller's 600 second timeout, so the run always ended with a
		// deadline error instead of completing.
		chromedp.Sleep(2 * time.Second),
	}
}

// storeLogoColors parses html and, for every "div > svg" element, reads the
// fill colours of its rects and passes them to AddData when all four values
// are valid hex colours. It returns an error only when the HTML cannot be
// parsed.
func storeLogoColors(html string) error {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return fmt.Errorf("parsing page html: %w", err)
	}

	doc.Find("div > svg").Each(func(_ int, s *goquery.Selection) {
		mut.Lock()
		defer mut.Unlock()

		rects := s.Find("rect")
		// Rects nested in <defs> are skipped by indexing from defsn onwards.
		// NOTE(review): this assumes the <defs> rects come first in document
		// order — confirm against the page markup.
		defsn := len(s.Find("defs rect").Nodes)
		n := len(rects.Nodes) - defsn
		fmt.Println("-----------------------------n:", n)

		// Offsets (relative to the first non-defs rect) of the logo, name and
		// slogan colours. With two rects the second one supplies all three.
		var logoOff, nameOff, sloganOff int
		switch n {
		case 2:
			logoOff, nameOff, sloganOff = 1, 1, 1
		case 4:
			logoOff, nameOff, sloganOff = 1, 2, 3
		default:
			// Other layouts are not handled.
			return
		}

		// fill returns the "fill" attribute of the rect at the given offset;
		// a missing attribute yields "", which VerifyRBG rejects below.
		fill := func(off int) string {
			v, _ := rects.Eq(defsn + off).Attr("fill")
			return v
		}

		bg := fill(0)
		fmt.Println("------------------------bg:", bg)
		logo := fill(logoOff)
		fmt.Println("------------------------logo:", logo)
		name := fill(nameOff)
		fmt.Println("------------------------name:", name)
		slogan := fill(sloganOff)
		fmt.Println("------------------------slogan:", slogan)

		if VerifyRBG(bg) && VerifyRBG(logo) && VerifyRBG(name) && VerifyRBG(slogan) {
			AddData(bg, logo, name, slogan)
		}
	})
	return nil
}
// hexColorPattern matches a "#" followed by exactly three or exactly six
// hexadecimal digits. It is compiled once at package initialisation rather
// than on every VerifyRBG call.
var hexColorPattern = regexp.MustCompile(`^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})$`)

// VerifyRBG reports whether rbg is a hex colour string of the form "#RGB" or
// "#RRGGBB" (case-insensitive digits). Any other input, including the empty
// string, yields false.
func VerifyRBG(rbg string) bool {
	return hexColorPattern.MatchString(rbg)
}