Skip to content

Commit

Permalink
Merge pull request #145 from DanielIntruder/chrome-ws-url
Browse files Browse the repository at this point in the history
Allow running chrome instance to be used
  • Loading branch information
pengdaCN authored Jul 3, 2023
2 parents d31a1b4 + ec2eeec commit 89c0209
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 1 deletion.
11 changes: 11 additions & 0 deletions cmd/crawlergo/flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

var cliFlags = []cli.Flag{
SetChromePath(),
SetChromeWSUrl(),
SetCustomHeaders(),
SetPostData(),
SetMaxCrawledCount(),
Expand Down Expand Up @@ -47,6 +48,16 @@ func SetChromePath() *cli.PathFlag {
}
}

// SetChromeWSUrl builds the "chrome-ws-url" command-line flag (alias "w").
// The parsed value is written to taskConfig.ChromiumWSUrl and may also be
// supplied through the CRAWLERGO_CHROME_WS_URL environment variable.
func SetChromeWSUrl() *cli.StringFlag {
	wsFlag := new(cli.StringFlag)

	wsFlag.Name = "chrome-ws-url"
	wsFlag.Aliases = []string{"w"}
	wsFlag.EnvVars = []string{"CRAWLERGO_CHROME_WS_URL"}

	// The flag value is stored straight into the shared task configuration.
	wsFlag.Destination = &taskConfig.ChromiumWSUrl

	// The back-quoted word is the placeholder name shown in the help text.
	wsFlag.Usage = "`URL` of chromium websockets debugger. Please note that any chromium flags specified will not apply when using this option. Such as \"ws://127.0.0.1:9222/devtools/browser/94b947d7-2a08-4cee-8e21-d77055e8c465\""

	return wsFlag
}

func SetCustomHeaders() *cli.StringFlag {
return &cli.StringFlag{
Name: "custom-headers",
Expand Down
19 changes: 19 additions & 0 deletions pkg/engine/browser.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,25 @@ func InitBrowser(chromiumPath string, extraHeaders map[string]interface{}, proxy
return &bro
}

// ConnectBrowser attaches to an already-running browser through its
// websocket debugger URL instead of launching a new process.
//
// wsUrl is handed to chromedp.NewRemoteAllocator; extraHeaders is stored
// unchanged on the returned Browser. If the initial chromedp.Run fails, the
// error is reported through logger.Logger.Fatal, because the crawler cannot
// continue without a browser connection.
func ConnectBrowser(wsUrl string, extraHeaders map[string]interface{}) *Browser {
	allocatorCtx, stopAllocator := chromedp.NewRemoteAllocator(context.Background(), wsUrl)

	// The browser context's own cancel func is intentionally dropped here;
	// the allocator's cancel func (stored in Browser.Cancel below) is the
	// handle kept for tearing the connection down.
	browserCtx, _ := chromedp.NewContext(allocatorCtx, chromedp.WithLogf(log.Printf))

	// Run with no actions so the connection to the remote browser is
	// attempted now, and a bad URL is reported immediately.
	if runErr := chromedp.Run(browserCtx); runErr != nil {
		// couldn't connect to the remote browser, need to exit
		logger.Logger.Fatal("chromedp run error: ", runErr.Error())
	}

	return &Browser{
		Cancel:       &stopAllocator,
		Ctx:          &browserCtx,
		ExtraHeaders: extraHeaders,
	}
}

func (bro *Browser) NewTab(timeout time.Duration) (*context.Context, context.CancelFunc) {
bro.lock.Lock()
ctx, cancel := chromedp.NewContext(*bro.Ctx)
Expand Down
6 changes: 5 additions & 1 deletion pkg/task_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@ func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask
}
}

crawlerTask.Browser = engine2.InitBrowser(taskConf.ChromiumPath, taskConf.ExtraHeaders, taskConf.Proxy, taskConf.NoHeadless)
if len(taskConf.ChromiumWSUrl) > 0 {
crawlerTask.Browser = engine2.ConnectBrowser(taskConf.ChromiumWSUrl, taskConf.ExtraHeaders)
} else {
crawlerTask.Browser = engine2.InitBrowser(taskConf.ChromiumPath, taskConf.ExtraHeaders, taskConf.Proxy, taskConf.NoHeadless)
}
crawlerTask.RootDomain = targets[0].URL.RootDomain()

// 创建协程池
Expand Down
1 change: 1 addition & 0 deletions pkg/taskconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type TaskConfig struct {
PathFromRobots bool // 解析Robots文件找出路径
MaxTabsCount int // 允许开启的最大标签页数量 即同时爬取的数量
ChromiumPath string // Chromium的程序路径 `/home/zhusiyu1/chrome-linux/chrome`
ChromiumWSUrl string // Websocket debugging URL for a running chrome session
EventTriggerMode string // 事件触发的调用方式: 异步 或 顺序
EventTriggerInterval time.Duration // 事件触发的间隔
BeforeExitDelay time.Duration // 退出前的等待时间,等待DOM渲染,等待XHR发出捕获
Expand Down

0 comments on commit 89c0209

Please sign in to comment.