Skip to content

Commit

Permalink
add: 增加爬虫整体运行 最大超时时间
Browse files Browse the repository at this point in the history
  • Loading branch information
HeisenbergV committed Jan 6, 2023
1 parent 7d4f486 commit 675d4e3
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 18 deletions.
23 changes: 17 additions & 6 deletions cmd/crawlergo/flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"fmt"

"github.com/Qianlitp/crawlergo/pkg/config"
"github.com/urfave/cli/v2"
)
Expand Down Expand Up @@ -33,6 +34,7 @@ var cliFlags = []cli.Flag{
SetPushPoolMax(),
SetLogLevel(),
SetNoHeadless(),
SetMaxTime(),
}

func SetChromePath() *cli.PathFlag {
Expand Down Expand Up @@ -145,12 +147,12 @@ func SetRequestProxy() *cli.StringFlag {
}
}

// return &cli.BoolFlag{
// Name: "bypass",
// Value: false,
// Usage: "whether to encode url with detected charset.",
// Destination: &taskConfig.EncodeURLWithCharset,
//},
// return &cli.BoolFlag{
// Name: "bypass",
// Value: false,
// Usage: "whether to encode url with detected charset.",
// Destination: &taskConfig.EncodeURLWithCharset,
// },
func SetEncodeURL() *cli.BoolFlag {
return &cli.BoolFlag{
Name: "encode-url",
Expand Down Expand Up @@ -270,3 +272,12 @@ func SetNoHeadless() *cli.BoolFlag {
Destination: &taskConfig.NoHeadless,
}
}

// SetMaxTime builds the CLI flag that caps the crawler task's overall
// run time. The flag is named "max-run-time", defaults to
// config.MaxRunTime, and stores the parsed value (in seconds, per the
// TaskConfig.MaxRunTime field's documentation) into taskConfig.MaxRunTime.
func SetMaxTime() *cli.Int64Flag {
	maxRunTimeFlag := cli.Int64Flag{
		Name:        "max-run-time",
		Usage:       "the `Timeout` of the task.",
		Value:       config.MaxRunTime,
		Destination: &taskConfig.MaxRunTime,
	}
	return &maxRunTimeFlag
}
15 changes: 9 additions & 6 deletions cmd/crawlergo/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"
"sync"
"syscall"
"time"

"github.com/Qianlitp/crawlergo/pkg"
"github.com/Qianlitp/crawlergo/pkg/config"
Expand Down Expand Up @@ -152,8 +153,8 @@ func run(c *cli.Context) error {
os.Exit(-1)
}
if len(targets) != 0 {
logger.Logger.Info(fmt.Sprintf("Init crawler task, host: %s, max tab count: %d, max crawl count: %d.",
targets[0].URL.Host, taskConfig.MaxTabsCount, taskConfig.MaxCrawlCount))
logger.Logger.Infof("Init crawler task, host: %s, max tab count: %d, max crawl count: %d, max runtime: %ds",
targets[0].URL.Host, taskConfig.MaxTabsCount, taskConfig.MaxCrawlCount, taskConfig.MaxRunTime)
logger.Logger.Info("filter mode: ", taskConfig.FilterMode)
}

Expand All @@ -175,8 +176,8 @@ func run(c *cli.Context) error {
task.Run()
result := task.Result

logger.Logger.Info(fmt.Sprintf("Task finished, %d results, %d requests, %d subdomains, %d domains found.",
len(result.ReqList), len(result.AllReqList), len(result.SubDomainList), len(result.AllDomainList)))
logger.Logger.Infof("Task finished, %d results, %d requests, %d subdomains, %d domains found, runtime: %d",
len(result.ReqList), len(result.AllReqList), len(result.SubDomainList), len(result.AllDomainList), time.Now().Unix()-task.Start.Unix())

// 内置请求代理
if pushAddress != "" {
Expand Down Expand Up @@ -254,7 +255,8 @@ func outputResult(result *pkg.Result) {
}
}

/**
/*
*
原生被动代理推送支持
*/
func Push2Proxy(reqList []*model2.Request) {
Expand All @@ -277,7 +279,8 @@ func Push2Proxy(reqList []*model2.Request) {
pushProxyWG.Wait()
}

/**
/*
*
协程池请求的任务
*/
func (p *ProxyTask) doRequest() {
Expand Down
1 change: 1 addition & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ const (
BeforeExitDelay = 1 * time.Second
DefaultEventTriggerMode = EventTriggerAsync
MaxCrawlCount = 200
MaxRunTime = 60 * 60
)

// 请求方法
Expand Down
37 changes: 31 additions & 6 deletions pkg/task_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package pkg
import (
"encoding/json"
"sync"
"time"

"github.com/Qianlitp/crawlergo/pkg/config"
engine2 "github.com/Qianlitp/crawlergo/pkg/engine"
Expand All @@ -24,6 +25,7 @@ type CrawlerTask struct {
taskWG sync.WaitGroup // 等待协程池所有任务结束
crawledCount int // 爬取过的数量
taskCountLock sync.Mutex // 已爬取的任务总数锁
Start time.Time //开始时间
}

type Result struct {
Expand All @@ -40,7 +42,8 @@ type tabTask struct {
req *model.Request
}

/**
/*
*
新建爬虫任务
*/
func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask, error) {
Expand Down Expand Up @@ -107,7 +110,8 @@ func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask
return &crawlerTask, nil
}

/**
/*
*
根据请求列表生成tabTask协程任务列表
*/
func (t *CrawlerTask) generateTabTask(req *model.Request) *tabTask {
Expand All @@ -119,13 +123,15 @@ func (t *CrawlerTask) generateTabTask(req *model.Request) *tabTask {
return &task
}

/**
/*
*
开始当前任务
*/
func (t *CrawlerTask) Run() {
defer t.Pool.Release() // 释放协程池
defer t.Browser.Close() // 关闭浏览器

t.Start = time.Now()
if t.Config.PathFromRobots {
reqsFromRobots := GetPathsFromRobots(*t.Targets[0])
logger.Logger.Info("get paths from robots.txt: ", len(reqsFromRobots))
Expand Down Expand Up @@ -183,7 +189,8 @@ func (t *CrawlerTask) Run() {
t.Result.SubDomainList = SubDomainCollect(t.Result.AllReqList, t.RootDomain)
}

/**
/*
*
添加任务到协程池
添加之前实时过滤
*/
Expand All @@ -195,6 +202,11 @@ func (t *CrawlerTask) addTask2Pool(req *model.Request) {
} else {
t.crawledCount += 1
}

if t.Start.Add(time.Second * time.Duration(t.Config.MaxRunTime)).Before(time.Now()) {
t.taskCountLock.Unlock()
return
}
t.taskCountLock.Unlock()

t.taskWG.Add(1)
Expand All @@ -208,13 +220,26 @@ func (t *CrawlerTask) addTask2Pool(req *model.Request) {
}()
}

/**
/*
*
单个运行的tab标签任务,实现了workpool的接口
*/
func (t *tabTask) Task() {
defer t.crawlerTask.taskWG.Done()

// 设置tab超时时间,若设置了程序最大运行时间, tab超时时间和程序剩余时间取小
timeremaining := t.crawlerTask.Start.Add(time.Duration(t.crawlerTask.Config.MaxRunTime) * time.Second).Sub(time.Now())
tabTime := t.crawlerTask.Config.TabRunTimeout
if t.crawlerTask.Config.TabRunTimeout > timeremaining {
tabTime = timeremaining
}

if tabTime <= 0 {
return
}

tab := engine2.NewTab(t.browser, *t.req, engine2.TabConfig{
TabRunTimeout: t.crawlerTask.Config.TabRunTimeout,
TabRunTimeout: tabTime,
DomContentLoadedTimeout: t.crawlerTask.Config.DomContentLoadedTimeout,
EventTriggerMode: t.crawlerTask.Config.EventTriggerMode,
EventTriggerInterval: t.crawlerTask.Config.EventTriggerInterval,
Expand Down
1 change: 1 addition & 0 deletions pkg/taskconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ type TaskConfig struct {
Proxy string // 请求代理
CustomFormValues map[string]string // 自定义表单填充参数
CustomFormKeywordValues map[string]string // 自定义表单关键词填充内容
MaxRunTime int64 // 最大爬取时间(单位秒),超时则结束任务,平滑结束(比如某个url还未处理完不能结束,需要一次req完成后才可以结束整个任务)
}

type TaskConfigOptFunc func(*TaskConfig)
Expand Down

0 comments on commit 675d4e3

Please sign in to comment.