Skip to content

Commit

Permalink
fix(filter): url fragment deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
黄崇正 committed Jun 8, 2022
1 parent 551acb2 commit e057d54
Show file tree
Hide file tree
Showing 26 changed files with 767 additions and 362 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.vscode
.idea
bin
upload
result
.DS_Store
internal/.DS_Store
pkg/.DS_Store
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
.PHONY: build_all
# build_all: cross-compile static (CGO disabled) binaries for linux and darwin
# on both amd64 and arm64, placing each platform's output under ./bin/<os-arch>/.
build_all:
	rm -rf bin && mkdir bin bin/linux-amd64 bin/linux-arm64 bin/darwin-amd64 bin/darwin-arm64 \
	&& CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/darwin-arm64/ ./... \
	&& CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/darwin-amd64/ ./... \
	&& CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/linux-arm64/ ./... \
	&& CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/linux-amd64/ ./...

.PHONY: build
# build: compile for the current host platform only, output into ./bin/.
build:
	rm -rf bin && mkdir bin && go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/ ./...
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ crawlergo currently supports the following features:

**Build**

- compilation for current platform

```shell
make build
```

- compile for all platforms

```shell
cd crawlergo/cmd/crawlergo
go build crawlergo_cmd.go
make build_all
```

1. crawlergo relies only on the chrome environment to run, go to [download](https://www.chromium.org/getting-involved/download-chromium) for the new version of chromium.
Expand All @@ -45,14 +51,14 @@ go build crawlergo_cmd.go
Assuming your chromium installation directory is `/tmp/chromium/`, set up 10 tabs open at the same time and crawl the `testphp.vulnweb.com`:

```shell
./crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/
bin/crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/
```


### Using Proxy

```shell
./crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/
bin/crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/
```


Expand All @@ -70,7 +76,7 @@ import subprocess

def main():
target = "http://testphp.vulnweb.com/"
cmd = ["./crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target]
cmd = ["bin/crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target]
rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = rsp.communicate()
# "--[Mission Complete]--" is the end-of-task separator string
Expand Down
16 changes: 11 additions & 5 deletions README_zh-cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ crawlergo 目前支持以下特性:

**Build**

- 编译适用于当前机器的文件

```shell
make build
```

- 交叉编译所有平台的文件

```shell
cd crawlergo/cmd/crawlergo
go build crawlergo_cmd.go
make build_all
```

1. crawlergo 只依赖chrome运行即可,前往[下载](https://www.chromium.org/getting-involved/download-chromium)新版本的chromium。
Expand All @@ -46,15 +52,15 @@ go build crawlergo_cmd.go
假设你的chromium安装在 `/tmp/chromium/` ,开启最大10标签页,爬取AWVS靶场:

```shell
./crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/
bin/crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/
```



### 使用代理

```shell
./crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/
bin/crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/
```


Expand All @@ -73,7 +79,7 @@ import subprocess

def main():
target = "http://testphp.vulnweb.com/"
cmd = ["./crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target]
cmd = ["bin/crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target]
rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = rsp.communicate()
# "--[Mission Complete]--" 是任务结束的分隔字符串
Expand Down
284 changes: 284 additions & 0 deletions cmd/crawlergo/flag.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
package main

import (
"fmt"

"github.com/Qianlitp/crawlergo/pkg/config"

"github.com/urfave/cli/v2"
)

// cliFlags is the full set of command-line flags registered with the CLI app.
// Each entry is built by a Set* constructor below; most bind their value
// directly into taskConfig via the flag's Destination field.
var cliFlags = []cli.Flag{
	SetChromePath(),
	SetCustomHeaders(),
	SetPostData(),
	SetMaxCrawledCount(),
	SetFilterMod(),
	SetOutputMode(),
	SetOutputJSON(),
	SetIgcognitoContext(),
	SetMaxTabCount(),
	SetFuzzPath(),
	SetFuzzPathDict(),
	SetRobotsPath(),
	SetRequestProxy(),
	SetEncodeURL(),
	SetTabRunTTL(),
	SetWaitDomContentLoadedTTL(),
	SetEventTriggerMode(),
	SetEventTriggerInterval(),
	SetBeforeExitDelay(),
	SetIgnoreUrlKeywords(),
	SetFormValues(),
	SetFormKeywordValue(),
	SetPushToProxy(),
	SetPushPoolMax(),
	SetLogLevel(),
	SetNoHeadless(),
}

// SetChromePath builds the required --chromium-path (-c) flag. The chosen
// path is stored in taskConfig.ChromiumPath and may also be supplied via the
// CRAWLERGO_CHROMIUM_PATH environment variable.
func SetChromePath() *cli.PathFlag {
	f := cli.PathFlag{
		Name:        "chromium-path",
		Aliases:     []string{"c"},
		Usage:       "`Path` of chromium executable. Such as \"/home/test/chrome-linux/chrome\"",
		Required:    true,
		Destination: &taskConfig.ChromiumPath,
		EnvVars:     []string{"CRAWLERGO_CHROMIUM_PATH"},
	}
	return &f
}

// SetCustomHeaders builds the --custom-headers flag. The raw JSON string is
// bound to taskConfig.ExtraHeadersString; decoding happens later.
func SetCustomHeaders() *cli.StringFlag {
	defaultHeaders := fmt.Sprintf(`{"Spider-Name": "crawlergo", "User-Agent": "%s"}`, config.DefaultUA)
	return &cli.StringFlag{
		Name:        "custom-headers",
		Usage:       "add additional `Headers` to each request. The input string will be called json.Unmarshal",
		Value:       defaultHeaders,
		Destination: &taskConfig.ExtraHeadersString,
	}
}

// SetPostData builds the --post-data (-d) flag, bound to the file-level
// postData variable.
func SetPostData() *cli.StringFlag {
	f := cli.StringFlag{
		Name:        "post-data",
		Aliases:     []string{"d"},
		Usage:       "set `PostData` to target and use POST method.",
		Destination: &postData,
	}
	return &f
}

// SetMaxCrawledCount builds the --max-crawled-count (-m) flag, defaulting to
// config.MaxCrawlCount and bound to taskConfig.MaxCrawlCount.
func SetMaxCrawledCount() *cli.IntFlag {
	return &cli.IntFlag{
		Name:        "max-crawled-count",
		Aliases:     []string{"m"},
		Usage:       "the maximum `Number` of URLs visited by the crawler in this task.",
		Value:       config.MaxCrawlCount,
		Destination: &taskConfig.MaxCrawlCount,
	}
}

// SetFilterMod builds the --filter-mode (-f) flag (default "smart"), bound to
// taskConfig.FilterMode.
func SetFilterMod() *cli.StringFlag {
	f := cli.StringFlag{
		Name:        "filter-mode",
		Aliases:     []string{"f"},
		Usage:       "filtering `Mode` used for collected requests. Allowed mode:\"simple\", \"smart\" or \"strict\".",
		Value:       "smart",
		Destination: &taskConfig.FilterMode,
	}
	return &f
}

// SetOutputMode builds the --output-mode (-o) flag (default "console"), bound
// to the file-level outputMode variable.
func SetOutputMode() *cli.StringFlag {
	f := cli.StringFlag{
		Name:        "output-mode",
		Aliases:     []string{"o"},
		Usage:       "console print or serialize output. Allowed mode:\"console\" ,\"json\" or \"none\".",
		Value:       "console",
		Destination: &outputMode,
	}
	return &f
}

// SetOutputJSON builds the --output-dir flag, bound to outputJsonPath.
// NOTE(review): despite the flag name "output-dir", the usage text says it
// takes a JSON file path — confirm intent with callers before renaming.
func SetOutputJSON() *cli.StringFlag {
	return &cli.StringFlag{
		Name: "output-dir",
		// Fixed missing space after the period in the user-facing usage text.
		Usage:       "write output to a json file. Such as result_www_test_com.json",
		Destination: &outputJsonPath,
	}
}

// SetIgcognitoContext builds the --incognito-context (-i) flag (default true),
// bound to taskConfig.IncognitoContext.
// NOTE(review): the function name misspells "Incognito"; renaming would touch
// the cliFlags registration, so it is left as-is here.
func SetIgcognitoContext() *cli.BoolFlag {
	f := cli.BoolFlag{
		Name:        "incognito-context",
		Aliases:     []string{"i"},
		Usage:       "whether the browser is launched in incognito mode.",
		Value:       true,
		Destination: &taskConfig.IncognitoContext,
	}
	return &f
}

// SetMaxTabCount builds the --max-tab-count (-t) flag (default 8), bound to
// taskConfig.MaxTabsCount.
func SetMaxTabCount() *cli.IntFlag {
	return &cli.IntFlag{
		Name:        "max-tab-count",
		Aliases:     []string{"t"},
		Usage:       "maximum `Number` of tabs allowed.",
		Value:       8,
		Destination: &taskConfig.MaxTabsCount,
	}
}

// SetFuzzPath builds the --fuzz-path flag (default false), bound to
// taskConfig.PathByFuzz.
func SetFuzzPath() *cli.BoolFlag {
	f := cli.BoolFlag{
		Name:        "fuzz-path",
		Usage:       "whether to fuzz the target with common paths.",
		Value:       false,
		Destination: &taskConfig.PathByFuzz,
	}
	return &f
}

// SetFuzzPathDict builds the --fuzz-path-dict flag, bound to
// taskConfig.FuzzDictPath.
func SetFuzzPathDict() *cli.PathFlag {
	return &cli.PathFlag{
		Name:        "fuzz-path-dict",
		Usage:       "`Path` of fuzz dict. Such as \"/home/test/fuzz_path.txt\"",
		Destination: &taskConfig.FuzzDictPath,
	}
}

// SetRobotsPath builds the --robots-path flag (default false), bound to
// taskConfig.PathFromRobots.
func SetRobotsPath() *cli.BoolFlag {
	f := cli.BoolFlag{
		Name:        "robots-path",
		Usage:       "whether to resolve paths from /robots.txt.",
		Value:       false,
		Destination: &taskConfig.PathFromRobots,
	}
	return &f
}

// SetRequestProxy builds the --request-proxy flag, bound to taskConfig.Proxy.
func SetRequestProxy() *cli.StringFlag {
	return &cli.StringFlag{
		Name:        "request-proxy",
		Usage:       "all requests connect through defined proxy server.",
		Destination: &taskConfig.Proxy,
	}
}

// SetEncodeURL builds the --encode-url flag (default false), bound to
// taskConfig.EncodeURLWithCharset.
// (Removed a stale commented-out duplicate of this flag definition that was
// left above the function.)
func SetEncodeURL() *cli.BoolFlag {
	return &cli.BoolFlag{
		Name:        "encode-url",
		Value:       false,
		Usage:       "whether to encode url with detected charset.",
		Destination: &taskConfig.EncodeURLWithCharset,
	}
}

// SetTabRunTTL builds the --tab-run-timeout flag, defaulting to
// config.TabRunTimeout and bound to taskConfig.TabRunTimeout.
func SetTabRunTTL() *cli.DurationFlag {
	f := cli.DurationFlag{
		Name:        "tab-run-timeout",
		Usage:       "the `Timeout` of a single tab task.",
		Value:       config.TabRunTimeout,
		Destination: &taskConfig.TabRunTimeout,
	}
	return &f
}

// SetWaitDomContentLoadedTTL builds the --wait-dom-content-loaded-timeout
// flag, defaulting to config.DomContentLoadedTimeout and bound to
// taskConfig.DomContentLoadedTimeout.
func SetWaitDomContentLoadedTTL() *cli.DurationFlag {
	return &cli.DurationFlag{
		Name:        "wait-dom-content-loaded-timeout",
		Usage:       "the `Timeout` of waiting for a page dom ready.",
		Value:       config.DomContentLoadedTimeout,
		Destination: &taskConfig.DomContentLoadedTimeout,
	}
}

// SetEventTriggerMode builds the --event-trigger-mode flag, defaulting to
// config.EventTriggerAsync and bound to taskConfig.EventTriggerMode.
func SetEventTriggerMode() *cli.StringFlag {
	f := cli.StringFlag{
		Name:        "event-trigger-mode",
		Usage:       "this `Value` determines how the crawler automatically triggers events.Allowed mode:\"async\" or \"sync\".",
		Value:       config.EventTriggerAsync,
		Destination: &taskConfig.EventTriggerMode,
	}
	return &f
}

// SetEventTriggerInterval builds the --event-trigger-interval flag, defaulting
// to config.EventTriggerInterval and bound to taskConfig.EventTriggerInterval.
func SetEventTriggerInterval() *cli.DurationFlag {
	return &cli.DurationFlag{
		Name:        "event-trigger-interval",
		Usage:       "the `Interval` of triggering each event.",
		Value:       config.EventTriggerInterval,
		Destination: &taskConfig.EventTriggerInterval,
	}
}

// SetBeforeExitDelay builds the --before-exit-delay flag, defaulting to
// config.BeforeExitDelay and bound to taskConfig.BeforeExitDelay.
func SetBeforeExitDelay() *cli.DurationFlag {
	f := cli.DurationFlag{
		Name:        "before-exit-delay",
		Usage:       "the `Time` of waiting before crawler exit.",
		Value:       config.BeforeExitDelay,
		Destination: &taskConfig.BeforeExitDelay,
	}
	return &f
}

// SetIgnoreUrlKeywords builds the --ignore-url-keywords (-iuk) flag; its value
// is the file-level ignoreKeywords slice (no Destination binding).
func SetIgnoreUrlKeywords() *cli.StringSliceFlag {
	f := cli.StringSliceFlag{
		Name:        "ignore-url-keywords",
		Aliases:     []string{"iuk"},
		Usage:       "crawlergo will not crawl these URLs matched by `Keywords`. e.g.: -iuk logout -iuk quit -iuk exit",
		Value:       ignoreKeywords,
		DefaultText: "Default [logout quit exit]",
	}
	return &f
}

// SetFormValues builds the --form-values (-fv) flag; its value is the
// file-level customFormTypeValues slice.
func SetFormValues() *cli.StringSliceFlag {
	return &cli.StringSliceFlag{
		Name:    "form-values",
		Aliases: []string{"fv"},
		Usage:   "custom filling text for each form type. e.g.: -fv username=crawlergo_nice -fv password=admin123",
		Value:   customFormTypeValues,
	}
}

// SetFormKeywordValue builds the --form-keyword-values (-fkv) flag; filling
// text is chosen by fuzzy keyword match. Its value is the file-level
// customFormKeywordValues slice.
func SetFormKeywordValue() *cli.StringSliceFlag {
	f := cli.StringSliceFlag{
		Name:    "form-keyword-values",
		Aliases: []string{"fkv"},
		Usage:   "custom filling text, fuzzy matched by keyword. e.g.: -fkv user=crawlergo_nice -fkv pass=admin123",
		Value:   customFormKeywordValues,
	}
	return &f
}

// SetPushToProxy builds the --push-to-proxy flag, bound to the file-level
// pushAddress variable.
func SetPushToProxy() *cli.StringFlag {
	return &cli.StringFlag{
		Name:        "push-to-proxy",
		Usage:       "every request in 'req_list' will be pushed to the proxy `Address`. Such as \"http://127.0.0.1:8080/\"",
		Destination: &pushAddress,
	}
}

// SetPushPoolMax builds the --push-pool-max flag, defaulting to
// DefaultMaxPushProxyPoolMax and bound to pushProxyPoolMax.
func SetPushPoolMax() *cli.IntFlag {
	f := cli.IntFlag{
		Name:        "push-pool-max",
		Usage:       "maximum `Number` of concurrency when pushing results to proxy.",
		Value:       DefaultMaxPushProxyPoolMax,
		Destination: &pushProxyPoolMax,
	}
	return &f
}

// SetLogLevel builds the --log-level flag, defaulting to DefaultLogLevel and
// bound to the file-level logLevel variable.
func SetLogLevel() *cli.StringFlag {
	return &cli.StringFlag{
		Name:        "log-level",
		Usage:       "log print `Level`, options include debug, info, warn, error and fatal.",
		Value:       DefaultLogLevel,
		Destination: &logLevel,
	}
}

// SetNoHeadless builds the --no-headless flag (default false), bound to
// taskConfig.NoHeadless.
func SetNoHeadless() *cli.BoolFlag {
	f := cli.BoolFlag{
		Name:        "no-headless",
		Usage:       "no headless mode",
		Value:       false,
		Destination: &taskConfig.NoHeadless,
	}
	return &f
}
Loading

0 comments on commit e057d54

Please sign in to comment.