Skip to content

Commit

Permalink
Add -w flag to allow the websockets debugging endpoint of a running chromium instance to be specified
Browse files Browse the repository at this point in the history
  • Loading branch information
DanielIntruder committed Jun 30, 2023
1 parent 7d4f486 commit ec2eeec
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 12 deletions.
24 changes: 18 additions & 6 deletions cmd/crawlergo/flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ package main

import (
"fmt"

"github.com/Qianlitp/crawlergo/pkg/config"
"github.com/urfave/cli/v2"
)

var cliFlags = []cli.Flag{
SetChromePath(),
SetChromeWSUrl(),
SetCustomHeaders(),
SetPostData(),
SetMaxCrawledCount(),
Expand Down Expand Up @@ -45,6 +47,16 @@ func SetChromePath() *cli.PathFlag {
}
}

// SetChromeWSUrl builds the --chrome-ws-url (-w) flag, which lets the
// crawler attach to an already running chromium instance through its
// websocket debugger endpoint instead of launching a new browser.
// The value can also be supplied via the CRAWLERGO_CHROME_WS_URL
// environment variable and is stored in taskConfig.ChromiumWSUrl.
func SetChromeWSUrl() *cli.StringFlag {
	flag := cli.StringFlag{
		Name:        "chrome-ws-url",
		Aliases:     []string{"w"},
		EnvVars:     []string{"CRAWLERGO_CHROME_WS_URL"},
		Usage:       "`URL` of chromium websockets debugger. Please note that any chromium flags specified will not apply when using this option. Such as \"ws://127.0.0.1:9222/devtools/browser/94b947d7-2a08-4cee-8e21-d77055e8c465\"",
		Destination: &taskConfig.ChromiumWSUrl,
	}
	return &flag
}

func SetCustomHeaders() *cli.StringFlag {
return &cli.StringFlag{
Name: "custom-headers",
Expand Down Expand Up @@ -145,12 +157,12 @@ func SetRequestProxy() *cli.StringFlag {
}
}

// return &cli.BoolFlag{
// Name: "bypass",
// Value: false,
// Usage: "whether to encode url with detected charset.",
// Destination: &taskConfig.EncodeURLWithCharset,
//},
// return &cli.BoolFlag{
// Name: "bypass",
// Value: false,
// Usage: "whether to encode url with detected charset.",
// Destination: &taskConfig.EncodeURLWithCharset,
// },
func SetEncodeURL() *cli.BoolFlag {
return &cli.BoolFlag{
Name: "encode-url",
Expand Down
19 changes: 19 additions & 0 deletions pkg/engine/browser.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,25 @@ func InitBrowser(chromiumPath string, extraHeaders map[string]interface{}, proxy
return &bro
}

// ConnectBrowser attaches to an already running chromium instance via the
// given websocket debugger URL and wraps the resulting session in a Browser.
// extraHeaders are carried on the returned Browser for use by later requests.
// On a failed connection it logs fatally and exits the process.
func ConnectBrowser(wsUrl string, extraHeaders map[string]interface{}) *Browser {
	allocCtx, cancel := chromedp.NewRemoteAllocator(context.Background(), wsUrl)
	browserCtx, _ := chromedp.NewContext(allocCtx,
		chromedp.WithLogf(log.Printf),
	)

	// Establish the session eagerly so a bad/unreachable URL is caught now.
	if err := chromedp.Run(browserCtx); err != nil {
		// couldn't connect to the remote browser, need to exit
		logger.Logger.Fatal("chromedp run error: ", err.Error())
	}

	return &Browser{
		Ctx:          &browserCtx,
		Cancel:       &cancel,
		ExtraHeaders: extraHeaders,
	}
}

func (bro *Browser) NewTab(timeout time.Duration) (*context.Context, context.CancelFunc) {
bro.lock.Lock()
ctx, cancel := chromedp.NewContext(*bro.Ctx)
Expand Down
21 changes: 15 additions & 6 deletions pkg/task_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ type tabTask struct {
req *model.Request
}

/**
/*
*
新建爬虫任务
*/
func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask, error) {
Expand Down Expand Up @@ -95,7 +96,11 @@ func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask
}
}

crawlerTask.Browser = engine2.InitBrowser(taskConf.ChromiumPath, taskConf.ExtraHeaders, taskConf.Proxy, taskConf.NoHeadless)
if len(taskConf.ChromiumWSUrl) > 0 {
crawlerTask.Browser = engine2.ConnectBrowser(taskConf.ChromiumWSUrl, taskConf.ExtraHeaders)
} else {
crawlerTask.Browser = engine2.InitBrowser(taskConf.ChromiumPath, taskConf.ExtraHeaders, taskConf.Proxy, taskConf.NoHeadless)
}
crawlerTask.RootDomain = targets[0].URL.RootDomain()

crawlerTask.smartFilter.Init()
Expand All @@ -107,7 +112,8 @@ func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask
return &crawlerTask, nil
}

/**
/*
*
根据请求列表生成tabTask协程任务列表
*/
func (t *CrawlerTask) generateTabTask(req *model.Request) *tabTask {
Expand All @@ -119,7 +125,8 @@ func (t *CrawlerTask) generateTabTask(req *model.Request) *tabTask {
return &task
}

/**
/*
*
开始当前任务
*/
func (t *CrawlerTask) Run() {
Expand Down Expand Up @@ -183,7 +190,8 @@ func (t *CrawlerTask) Run() {
t.Result.SubDomainList = SubDomainCollect(t.Result.AllReqList, t.RootDomain)
}

/**
/*
*
添加任务到协程池
添加之前实时过滤
*/
Expand All @@ -208,7 +216,8 @@ func (t *CrawlerTask) addTask2Pool(req *model.Request) {
}()
}

/**
/*
*
单个运行的tab标签任务,实现了workpool的接口
*/
func (t *tabTask) Task() {
Expand Down
1 change: 1 addition & 0 deletions pkg/taskconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type TaskConfig struct {
PathFromRobots bool // 解析Robots文件找出路径
MaxTabsCount int // 允许开启的最大标签页数量 即同时爬取的数量
ChromiumPath string // Chromium的程序路径 `/home/zhusiyu1/chrome-linux/chrome`
ChromiumWSUrl string // Websocket debugging URL for a running chrome session
EventTriggerMode string // 事件触发的调用方式: 异步 或 顺序
EventTriggerInterval time.Duration // 事件触发的间隔
BeforeExitDelay time.Duration // 退出前的等待时间,等待DOM渲染,等待XHR发出捕获
Expand Down

0 comments on commit ec2eeec

Please sign in to comment.