golang利用chromedp 模拟浏览器爬取数据
里面包含了 chromedp 常用的功能：
通过等待元素、在输入框中输入值、点击元素等操作进入指定页面，然后获取当前页面的全部 HTML 代码，再利用 goquery 解析数据。
package main import ( "context" "fmt" "github.com/PuerkitoBio/goquery" "github.com/chromedp/chromedp" "regexp" "spider/service" "strings" "sync" "time" ) var mut sync.Mutex // 爬取 mybrandnewlogo func main() { options := []chromedp.ExecAllocatorOption{ chromedp.Flag("headless", false), chromedp.Flag("blink-settings", "imageEnable=false"), chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko)`), } allocCtx, _ := chromedp.NewExecAllocator(context.Background(), options...) // create chrome instance task1Ctx, cancel := chromedp.NewContext( allocCtx, ) defer cancel() // 设置过期时间 task2Ctx, cancel := context.WithTimeout(task1Ctx, 600*time.Second) defer cancel() var htmlContent string // navigate to a page, wait for an element err := chromedp.Run(task2Ctx, mybrandnewlogoTask(htmlContent), ) if err != nil { return } fmt.Println("---------------------over-----------------------") } func mybrandnewlogoTask(htmlContent string) chromedp.Tasks { return chromedp.Tasks{ // 指定网页 chromedp.Navigate(`https://mybrandnewlogo.com/logo-maker`), //等待指定元素出现 chromedp.WaitVisible(`#name`), //输入公司名 chromedp.SendKeys(`#name`, "goland", chromedp.ByID), chromedp.Click(`header > div.container > div > form > div > div > div:nth-child(1) > div.b-user-input__action > button.c-button.shine.js-next-screen`), chromedp.Sleep(2 * time.Second), //等待指定元素出现 chromedp.WaitVisible(`#slogan`), //在某个输入框输入slogan chromedp.SendKeys(`#slogan`, "goland slogan", chromedp.ByID), //点击某个元素 chromedp.Click(`header > div.container > div > form > div > div > div:nth-child(3) > div.b-user-input__actions > button.c-button.js-next-screen`), chromedp.Sleep(1 * time.Second), chromedp.WaitVisible(`#industry`), chromedp.Click(`body > div.js-scroll.l-main > header > div.container > div > form > div > div > div.b-user-input__screen.js-screen.js-screen-industry > div.b-user-input__actions > button.c-button.js-next-screen`), chromedp.Sleep(2 * time.Second), chromedp.WaitVisible(`body > 
div.js-scroll.l-main > header > div.container > div > form > div > div > div.b-user-input__screen.js-screen.js-screen-keywords > div.b-user-input__actions > button.c-button.js-next-screen`), chromedp.Click(`body > div.js-scroll.l-main > header > div.container > div > form > div > div > div.b-user-input__screen.js-screen.js-screen-keywords > div.b-user-input__actions > button.c-button.js-next-screen`), chromedp.Sleep(2 * time.Second), chromedp.WaitVisible(`body > div.js-scroll.l-main > header > div.container > div > form > div > div > div:nth-child(6) > div.b-user-input__actions > button.c-button.js-submit`), chromedp.Sleep(2 * time.Second), chromedp.Click(`body > div.js-scroll.l-main > header > div.container > div > form > div > div > div:nth-child(6) > div.b-user-input__actions > button.c-button.js-submit`), chromedp.Sleep(2 * time.Second), chromedp.WaitVisible(`.b-logo-wrap:nth-child(20) > div > div:nth-child(3) > svg`), chromedp.Sleep(2 * time.Second), // 获取整个html 数据然后解析 chromedp.OuterHTML(`document.querySelector("body")`, &htmlContent, chromedp.ByJSPath), chromedp.ActionFunc(func(ctx context.Context) error { doc, err1 := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) if err1 != nil { return err1 } doc.Find("div > svg").Each(func(i int, s *goquery.Selection) { mut.Lock() defer mut.Unlock() n := len(s.Find("rect").Nodes) // 8 3 defsn := len(s.Find("defs rect").Nodes) // 4 1 n = n - defsn // 4 2 fmt.Println("-----------------------------n:", n) bg, logo, name, slogan := "", "", "", "" if n == 2 { bg, _ = s.Find("rect").Eq(defsn).Attr("fill") fmt.Println("------------------------bg:", bg) logo, _ = s.Find("rect").Eq(defsn + 1).Attr("fill") fmt.Println("------------------------logo:", logo) name, _ = s.Find("rect").Eq(defsn + 1).Attr("fill") fmt.Println("------------------------name:", name) slogan, _ = s.Find("rect").Eq(defsn + 1).Attr("fill") fmt.Println("------------------------slogan:", slogan) if VerifyRBG(bg) && VerifyRBG(logo) && 
VerifyRBG(name) && VerifyRBG(slogan) { AddData(bg, logo, name, slogan) } } if n == 4 { bg, _ = s.Find("rect").Eq(defsn).Attr("fill") fmt.Println("------------------------bg:", bg) logo, _ = s.Find("rect").Eq(defsn + 1).Attr("fill") fmt.Println("------------------------logo:", logo) name, _ = s.Find("rect").Eq(defsn + 2).Attr("fill") fmt.Println("------------------------name:", name) slogan, _ = s.Find("rect").Eq(defsn + 3).Attr("fill") fmt.Println("------------------------slogan:", slogan) if VerifyRBG(bg) && VerifyRBG(logo) && VerifyRBG(name) && VerifyRBG(slogan) { AddData(bg, logo, name, slogan) } } }) return nil }), chromedp.Sleep(2000 * time.Second), } } func VerifyRBG(rbg string) bool { match, _ := regexp.MatchString(`^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})$`, rbg) return match }