diff --git a/cmd/crawlergo/crawlergo_cmd b/cmd/crawlergo/crawlergo_cmd new file mode 100755 index 0000000..005b4d2 Binary files /dev/null and b/cmd/crawlergo/crawlergo_cmd differ diff --git a/cmd/crawlergo/crawlergo_cmd.go b/cmd/crawlergo/crawlergo_cmd.go new file mode 100755 index 0000000..6da1297 --- /dev/null +++ b/cmd/crawlergo/crawlergo_cmd.go @@ -0,0 +1,495 @@ +package main + +import ( + "crawlergo/pkg" + "crawlergo/pkg/config" + "crawlergo/pkg/logger" + model2 "crawlergo/pkg/model" + "crawlergo/pkg/tools" + "crawlergo/pkg/tools/requests" + "encoding/json" + "errors" + "fmt" + "github.com/panjf2000/ants/v2" + "github.com/sirupsen/logrus" + "github.com/urfave/cli/v2" + "log" + "os" + "os/signal" + "strings" + "sync" +) + +/** +命令行调用适配器 + +用于生成开源的二进制程序 +*/ + +type Result struct { + ReqList []Request `json:"req_list"` + AllReqList []Request `json:"all_req_list"` + AllDomainList []string `json:"all_domain_list"` + SubDomainList []string `json:"sub_domain_list"` +} + +type Request struct { + Url string `json:"url"` + Method string `json:"method"` + Headers map[string]interface{} `json:"headers"` + Data string `json:"data"` + Source string `json:"source"` +} + +type ProxyTask struct { + req *model2.Request + pushProxy string +} + +const DefaultMaxPushProxyPoolMax = 10 +const DefaultLogLevel = "Info" + +var taskConfig pkg.TaskConfig +var outputMode string +var postData string +var signalChan chan os.Signal +var ignoreKeywords *cli.StringSlice +var customFormTypeValues *cli.StringSlice +var customFormKeywordValues *cli.StringSlice +var pushAddress string +var pushProxyPoolMax int +var pushProxyWG sync.WaitGroup +var outputJsonPath string +var logLevel string + +func main() { + author := cli.Author{ + Name: "9ian1i", + Email: "9ian1itp@gmail.com", + } + + ignoreKeywords = cli.NewStringSlice(config.DefaultIgnoreKeywords...) 
+ customFormTypeValues = cli.NewStringSlice() + customFormKeywordValues = cli.NewStringSlice() + + app := &cli.App{ + Name: "crawlergo", + Usage: "A powerful dynamic crawler for web vulnerability scanners", + UsageText: "crawlergo [global options] url1 url2 url3 ... (must be same host)", + Description: "crawlergo is part of the 360 Skyphenomena. you can visit [https://skp.360.cn/] to learn more.", + Version: "v0.4.1", + Authors: []*cli.Author{&author}, + Flags: []cli.Flag{ + &cli.PathFlag{ + Name: "chromium-path", + Aliases: []string{"c"}, + Usage: "`Path` of chromium executable. Such as \"/home/test/chrome-linux/chrome\"", + Required: true, + Destination: &taskConfig.ChromiumPath, + EnvVars: []string{"CRAWLERGO_CHROMIUM_PATH"}, + }, + &cli.StringFlag{ + Name: "custom-headers", + Usage: "add additional `Headers` to each request. The input string will be called json.Unmarshal", + Value: fmt.Sprintf(`{"Spider-Name": "crawlergo", "User-Agent": "%s"}`, config.DefaultUA), + Destination: &taskConfig.ExtraHeadersString, + }, + &cli.StringFlag{ + Name: "post-data", + Aliases: []string{"d"}, + Usage: "set `PostData` to target and use POST method.", + Destination: &postData, + }, + &cli.IntFlag{ + Name: "max-crawled-count", + Aliases: []string{"m"}, + Value: config.MaxCrawlCount, + Usage: "the maximum `Number` of URLs visited by the crawler in this task.", + Destination: &taskConfig.MaxCrawlCount, + }, + &cli.StringFlag{ + Name: "filter-mode", + Aliases: []string{"f"}, + Value: "smart", + Usage: "filtering `Mode` used for collected requests. Allowed mode:\"simple\", \"smart\" or \"strict\".", + Destination: &taskConfig.FilterMode, + }, + &cli.StringFlag{ + Name: "output-mode", + Aliases: []string{"o"}, + Value: "console", + Usage: "console print or serialize output. 
Allowed mode:\"console\" ,\"json\" or \"none\".", + Destination: &outputMode, + }, + &cli.StringFlag{ + Name: "output-json", + Usage: "write output to a json file.Such as result_www_crawlergo_com.json", + Destination: &outputJsonPath, + }, + &cli.BoolFlag{ + Name: "incognito-context", + Aliases: []string{"i"}, + Value: true, + Usage: "whether the browser is launched in incognito mode.", + Destination: &taskConfig.IncognitoContext, + }, + &cli.IntFlag{ + Name: "max-tab-count", + Aliases: []string{"t"}, + Value: 8, + Usage: "maximum `Number` of tabs allowed.", + Destination: &taskConfig.MaxTabsCount, + }, + &cli.BoolFlag{ + Name: "fuzz-path", + Value: false, + Usage: "whether to fuzz the target with common paths.", + Destination: &taskConfig.PathByFuzz, + }, + &cli.PathFlag{ + Name: "fuzz-path-dict", + Usage: "`Path` of fuzz dict. Such as \"/home/test/fuzz_path.txt\"", + Destination: &taskConfig.FuzzDictPath, + }, + &cli.BoolFlag{ + Name: "robots-path", + Value: false, + Usage: "whether to resolve paths from /robots.txt.", + Destination: &taskConfig.PathFromRobots, + }, + &cli.StringFlag{ + Name: "request-proxy", + Usage: "all requests connect through defined proxy server.", + Destination: &taskConfig.Proxy, + }, + //&cli.BoolFlag{ + // Name: "bypass", + // Value: false, + // Usage: "whether to encode url with detected charset.", + // Destination: &taskConfig.EncodeURLWithCharset, + //}, + &cli.BoolFlag{ + Name: "encode-url", + Value: false, + Usage: "whether to encode url with detected charset.", + Destination: &taskConfig.EncodeURLWithCharset, + }, + &cli.DurationFlag{ + Name: "tab-run-timeout", + Value: config.TabRunTimeout, + Usage: "the `Timeout` of a single tab task.", + Destination: &taskConfig.TabRunTimeout, + }, + &cli.DurationFlag{ + Name: "wait-dom-content-loaded-timeout", + Value: config.DomContentLoadedTimeout, + Usage: "the `Timeout` of waiting for a page dom ready.", + Destination: &taskConfig.DomContentLoadedTimeout, + }, + &cli.StringFlag{ + Name: 
"event-trigger-mode", + Value: config.EventTriggerAsync, + Usage: "this `Value` determines how the crawler automatically triggers events.Allowed mode:\"async\" or \"sync\".", + Destination: &taskConfig.EventTriggerMode, + }, + &cli.DurationFlag{ + Name: "event-trigger-interval", + Value: config.EventTriggerInterval, + Usage: "the `Interval` of triggering each event.", + Destination: &taskConfig.EventTriggerInterval, + }, + &cli.DurationFlag{ + Name: "before-exit-delay", + Value: config.BeforeExitDelay, + Usage: "the `Time` of waiting before crawler exit.", + Destination: &taskConfig.BeforeExitDelay, + }, + &cli.StringSliceFlag{ + Name: "ignore-url-keywords", + Aliases: []string{"iuk"}, + Value: ignoreKeywords, + Usage: "crawlergo will not crawl these URLs matched by `Keywords`. e.g.: -iuk logout -iuk quit -iuk exit", + DefaultText: "Default [logout quit exit]", + }, + &cli.StringSliceFlag{ + Name: "form-values", + Aliases: []string{"fv"}, + Value: customFormTypeValues, + Usage: "custom filling text for each form type. e.g.: -fv username=crawlergo_nice -fv password=admin123", + }, + // 根据关键词自行选择填充文本 + &cli.StringSliceFlag{ + Name: "form-keyword-values", + Aliases: []string{"fkv"}, + Value: customFormKeywordValues, + Usage: "custom filling text, fuzzy matched by keyword. e.g.: -fkv user=crawlergo_nice -fkv pass=admin123", + }, + &cli.StringFlag{ + Name: "push-to-proxy", + Usage: "every request in 'req_list' will be pushed to the proxy `Address`. 
Such as \"http://127.0.0.1:8080/\"", + Destination: &pushAddress, + }, + &cli.IntFlag{ + Name: "push-pool-max", + Usage: "maximum `Number` of concurrency when pushing results to proxy.", + Value: DefaultMaxPushProxyPoolMax, + Destination: &pushProxyPoolMax, + }, + &cli.StringFlag{ + Name: "log-level", + Usage: "log print `Level`, options include debug, info, warn, error and fatal.", + Value: DefaultLogLevel, + Destination: &logLevel, + }, + &cli.BoolFlag{ + Name: "no-headless", + Value: false, + Usage: "no headless mode", + Destination: &taskConfig.NoHeadless, + }, + }, + Action: run, + } + + err := app.Run(os.Args) + if err != nil { + logger.Logger.Fatal(err) + } +} + +func run(c *cli.Context) error { + var req model2.Request + signalChan = make(chan os.Signal, 1) + signal.Notify(signalChan, os.Interrupt) + + if c.Args().Len() == 0 { + logger.Logger.Error("url must be set") + return errors.New("url must be set") + } + + // 设置日志输出级别 + level, err := logrus.ParseLevel(logLevel) + if err != nil { + logger.Logger.Fatal(err) + } + logger.Logger.SetLevel(level) + + var targets []*model2.Request + for _, _url := range c.Args().Slice() { + url, err := model2.GetUrl(_url) + if err != nil { + logger.Logger.Error("parse url failed, ", err) + continue + } + if postData != "" { + req = model2.GetRequest(config.POST, url, getOption()) + } else { + req = model2.GetRequest(config.GET, url, getOption()) + } + req.Proxy = taskConfig.Proxy + targets = append(targets, &req) + } + taskConfig.IgnoreKeywords = ignoreKeywords.Value() + if taskConfig.Proxy != "" { + logger.Logger.Info("request with proxy: ", taskConfig.Proxy) + } + + if len(targets) == 0 { + logger.Logger.Fatal("no validate target.") + } + + // 检查自定义的表单参数配置 + taskConfig.CustomFormValues, err = parseCustomFormValues(customFormTypeValues.Value()) + if err != nil { + logger.Logger.Fatal(err) + } + taskConfig.CustomFormKeywordValues, err = keywordStringToMap(customFormKeywordValues.Value()) + if err != nil { + 
logger.Logger.Fatal(err) + } + + // 开始爬虫任务 + task, err := pkg.NewCrawlerTask(targets, taskConfig) + if err != nil { + logger.Logger.Error("create crawler task failed.") + os.Exit(-1) + } + if len(targets) != 0 { + logger.Logger.Info(fmt.Sprintf("Init crawler task, host: %s, max tab count: %d, max crawl count: %d.", + targets[0].URL.Host, taskConfig.MaxTabsCount, taskConfig.MaxCrawlCount)) + logger.Logger.Info("filter mode: ", taskConfig.FilterMode) + } + + // 提示自定义表单填充参数 + if len(taskConfig.CustomFormValues) > 0 { + logger.Logger.Info("Custom form values, " + tools.MapStringFormat(taskConfig.CustomFormValues)) + } + // 提示自定义表单填充参数 + if len(taskConfig.CustomFormKeywordValues) > 0 { + logger.Logger.Info("Custom form keyword values, " + tools.MapStringFormat(taskConfig.CustomFormKeywordValues)) + } + if _, ok := taskConfig.CustomFormValues["default"]; !ok { + logger.Logger.Info("If no matches, default form input text: " + config.DefaultInputText) + taskConfig.CustomFormValues["default"] = config.DefaultInputText + } + + go handleExit(task) + logger.Logger.Info("Start crawling.") + task.Run() + result := task.Result + + logger.Logger.Info(fmt.Sprintf("Task finished, %d results, %d requests, %d subdomains, %d domains found.", + len(result.ReqList), len(result.AllReqList), len(result.SubDomainList), len(result.AllDomainList))) + + // 内置请求代理 + if pushAddress != "" { + logger.Logger.Info("pushing results to ", pushAddress, ", max pool number:", pushProxyPoolMax) + Push2Proxy(result.ReqList) + } + + // 输出结果 + outputResult(result) + + return nil +} + +func getOption() model2.Options { + var option model2.Options + if postData != "" { + option.PostData = postData + } + if taskConfig.ExtraHeadersString != "" { + err := json.Unmarshal([]byte(taskConfig.ExtraHeadersString), &taskConfig.ExtraHeaders) + if err != nil { + logger.Logger.Fatal("custom headers can't be Unmarshal.") + panic(err) + } + option.Headers = taskConfig.ExtraHeaders + } + return option +} + +func 
parseCustomFormValues(customData []string) (map[string]string, error) { + parsedData := map[string]string{} + for _, item := range customData { + keyValue := strings.Split(item, "=") + if len(keyValue) < 2 { + return nil, errors.New("invalid form item: " + item) + } + key := keyValue[0] + if !tools.StringSliceContain(config.AllowedFormName, key) { + return nil, errors.New("not allowed form key: " + key) + } + value := keyValue[1] + parsedData[key] = value + } + return parsedData, nil +} + +func keywordStringToMap(data []string) (map[string]string, error) { + parsedData := map[string]string{} + for _, item := range data { + keyValue := strings.Split(item, "=") + if len(keyValue) < 2 { + return nil, errors.New("invalid keyword format: " + item) + } + key := keyValue[0] + value := keyValue[1] + parsedData[key] = value + } + return parsedData, nil +} + +func outputResult(result *pkg.Result) { + // 输出结果 + if outputMode == "json" { + fmt.Println("--[Mission Complete]--") + resBytes := getJsonSerialize(result) + fmt.Println(string(resBytes)) + } else if outputMode == "console" { + for _, req := range result.ReqList { + req.FormatPrint() + } + } + if len(outputJsonPath) != 0 { + resBytes := getJsonSerialize(result) + tools.WriteFile(outputJsonPath, resBytes) + } +} + +/** +原生被动代理推送支持 +*/ +func Push2Proxy(reqList []*model2.Request) { + pool, _ := ants.NewPool(pushProxyPoolMax) + defer pool.Release() + for _, req := range reqList { + task := ProxyTask{ + req: req, + pushProxy: pushAddress, + } + pushProxyWG.Add(1) + go func() { + err := pool.Submit(task.doRequest) + if err != nil { + logger.Logger.Error("add Push2Proxy task failed: ", err) + pushProxyWG.Done() + } + }() + } + pushProxyWG.Wait() +} + +/** +协程池请求的任务 +*/ +func (p *ProxyTask) doRequest() { + defer pushProxyWG.Done() + _, _ = requests.Request(p.req.Method, p.req.URL.String(), tools.ConvertHeaders(p.req.Headers), []byte(p.req.PostData), + &requests.ReqOptions{Timeout: 1, AllowRedirect: false, Proxy: p.pushProxy}) 
+} + +func handleExit(t *pkg.CrawlerTask) { + select { + case <-signalChan: + fmt.Println("exit ...") + t.Pool.Tune(1) + t.Pool.Release() + t.Browser.Close() + os.Exit(-1) + } +} + +func getJsonSerialize(result *pkg.Result) []byte { + var res Result + var reqList []Request + var allReqList []Request + for _, _req := range result.ReqList { + var req Request + req.Method = _req.Method + req.Url = _req.URL.String() + req.Source = _req.Source + req.Data = _req.PostData + req.Headers = _req.Headers + reqList = append(reqList, req) + } + for _, _req := range result.AllReqList { + var req Request + req.Method = _req.Method + req.Url = _req.URL.String() + req.Source = _req.Source + req.Data = _req.PostData + req.Headers = _req.Headers + allReqList = append(allReqList, req) + } + res.AllReqList = allReqList + res.ReqList = reqList + res.AllDomainList = result.AllDomainList + res.SubDomainList = result.SubDomainList + + resBytes, err := json.Marshal(res) + if err != nil { + log.Fatal("Marshal result error") + } + return resBytes +} diff --git a/go.mod b/go.mod new file mode 100755 index 0000000..f3b86e6 --- /dev/null +++ b/go.mod @@ -0,0 +1,17 @@ +module crawlergo + +go 1.12 + +replace git.apache.org/thrift.git => github.com/apache/thrift v0.13.0 + +require ( + github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 + github.com/chromedp/chromedp v0.5.2 + github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect + github.com/deckarep/golang-set v1.7.1 + github.com/gogf/gf v1.16.6 + github.com/panjf2000/ants/v2 v2.2.2 + github.com/pkg/errors v0.8.1 + github.com/sirupsen/logrus v1.4.2 + github.com/urfave/cli/v2 v2.0.0 +) diff --git a/pkg/config/config.go b/pkg/config/config.go new file mode 100755 index 0000000..93d9ad6 --- /dev/null +++ b/pkg/config/config.go @@ -0,0 +1,131 @@ +package config + +import "time" + +const ( + DefaultUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36" + MaxTabsCount = 10 + 
TabRunTimeout = 20 * time.Second + DefaultInputText = "Crawlergo" + FormInputKeyword = "Crawlergo" + SuspectURLRegex = `(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')` + URLRegex = `((https?|ftp|file):)?//[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]` + AttrURLRegex = `` + DomContentLoadedTimeout = 5 * time.Second + EventTriggerInterval = 100 * time.Millisecond // 单位毫秒 + BeforeExitDelay = 1 * time.Second + DefaultEventTriggerMode = EventTriggerAsync + MaxCrawlCount = 200 +) + +// 请求方法 +const ( + GET = "GET" + POST = "POST" + PUT = "PUT" + DELETE = "DELETE" + HEAD = "HEAD" + OPTIONS = "OPTIONS" +) + +// 过滤模式 +const ( + SimpleFilterMode = "simple" + SmartFilterMode = "smart" + StrictFilterMode = "strict" +) + +// 事件触发模式 +const ( + EventTriggerAsync = "async" + EventTriggerSync = "sync" +) + +// 请求的来源 +const ( + FromTarget = "Target" //初始输入的目标 + FromNavigation = "Navigation" //页面导航请求 + FromXHR = "XHR" //ajax异步请求 + FromDOM = "DOM" //dom解析出来的请求 + FromJSFile = "JavaScript" //JS脚本中解析 + FromFuzz = "PathFuzz" //初始path fuzz + FromRobots = "robots.txt" //robots.txt + FromComment = "Comment" //页面中的注释 + FromWebSocket = "WebSocket" + FromEventSource = "EventSource" + FromFetch = "Fetch" + FromHistoryAPI = "HistoryAPI" + FromOpenWindow = "OpenWindow" + FromHashChange = "HashChange" + FromStaticRes = "StaticResource" + FromStaticRegex = "StaticRegex" +) + +// content-type +const ( + JSON = "application/json" + URLENCODED = "application/x-www-form-urlencoded" + MULTIPART = "multipart/form-data" +) + +var StaticSuffix = []string{ + "png", "gif", "jpg", "mp4", "mp3", "mng", "pct", "bmp", "jpeg", "pst", "psp", "ttf", + "tif", "tiff", "ai", "drw", "wma", 
"ogg", "wav", "ra", "aac", "mid", "au", "aiff", + "dxf", "eps", "ps", "svg", "3gp", "asf", "asx", "avi", "mov", "mpg", "qt", "rm", + "wmv", "m4a", "bin", "xls", "xlsx", "ppt", "pptx", "doc", "docx", "odt", "ods", "odg", + "odp", "exe", "zip", "rar", "tar", "gz", "iso", "rss", "pdf", "txt", "dll", "ico", + "gz2", "apk", "crt", "woff", "map", "woff2", "webp", "less", "dmg", "bz2", "otf", "swf", + "flv", "mpeg", "dat", "xsl", "csv", "cab", "exif", "wps", "m4v", "rmvb", +} + +var ScriptSuffix = []string{ + "php", "asp", "jsp", "asa", +} + +var DefaultIgnoreKeywords = []string{"logout", "quit", "exit"} +var AllowedFormName = []string{"default", "mail", "code", "phone", "username", "password", "qq", "id_card", "url", "date", "number"} + +type ContinueResourceList []string + +var InputTextMap = map[string]map[string]interface{}{ + "mail": { + "keyword": []string{"mail"}, + "value": "crawlergo@gmail.com", + }, + "code": { + "keyword": []string{"yanzhengma", "code", "ver", "captcha"}, + "value": "123a", + }, + "phone": { + "keyword": []string{"phone", "number", "tel", "shouji"}, + "value": "18812345678", + }, + "username": { + "keyword": []string{"name", "user", "id", "login", "account"}, + "value": "crawlergo@gmail.com", + }, + "password": { + "keyword": []string{"pass", "pwd"}, + "value": "Crawlergo6.", + }, + "qq": { + "keyword": []string{"qq", "wechat", "tencent", "weixin"}, + "value": "123456789", + }, + "IDCard": { + "keyword": []string{"card", "shenfen"}, + "value": "511702197409284963", + }, + "url": { + "keyword": []string{"url", "site", "web", "blog", "link"}, + "value": "https://crawlergo.nice.cn/", + }, + "date": { + "keyword": []string{"date", "time", "year", "now"}, + "value": "2018-01-01", + }, + "number": { + "keyword": []string{"day", "age", "num", "count"}, + "value": "10", + }, +} diff --git a/pkg/domain_collect.go b/pkg/domain_collect.go new file mode 100755 index 0000000..1bebd34 --- /dev/null +++ b/pkg/domain_collect.go @@ -0,0 +1,37 @@ +package pkg + 
+import ( + "crawlergo/pkg/model" + mapset "github.com/deckarep/golang-set" + "strings" +) + +func SubDomainCollect(reqList []*model.Request, HostLimit string) []string { + var subDomainList []string + uniqueSet := mapset.NewSet() + for _, req := range reqList { + domain := req.URL.Hostname() + if uniqueSet.Contains(domain) { + continue + } + uniqueSet.Add(domain) + if strings.HasSuffix(domain, "."+HostLimit) { + subDomainList = append(subDomainList, domain) + } + } + return subDomainList +} + +func AllDomainCollect(reqList []*model.Request) []string { + uniqueSet := mapset.NewSet() + var allDomainList []string + for _, req := range reqList { + domain := req.URL.Hostname() + if uniqueSet.Contains(domain) { + continue + } + uniqueSet.Add(domain) + allDomainList = append(allDomainList, req.URL.Hostname()) + } + return allDomainList +} diff --git a/pkg/engine/after_dom_tasks.go b/pkg/engine/after_dom_tasks.go new file mode 100755 index 0000000..ec94281 --- /dev/null +++ b/pkg/engine/after_dom_tasks.go @@ -0,0 +1,215 @@ +package engine + +import ( + "context" + "crawlergo/pkg/config" + "crawlergo/pkg/js" + "crawlergo/pkg/logger" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/chromedp" + "os" + "strings" + "time" +) + +/** +在DOMContentLoaded完成后执行 +*/ +func (tab *Tab) AfterDOMRun() { + defer tab.WG.Done() + + logger.Logger.Debug("afterDOMRun start") + + // 获取当前body节点的nodeId 用于之后查找子节点 + if !tab.getBodyNodeId() { + logger.Logger.Debug("no body document NodeID, exit.") + return + } + + tab.domWG.Add(2) + go tab.fillForm() + go tab.setObserverJS() + tab.domWG.Wait() + logger.Logger.Debug("afterDOMRun end") + tab.WG.Add(1) + go tab.AfterLoadedRun() +} + +/** +获取的Body的NodeId 用于之后子节点无等待查询 +最多等待3秒 如果DOM依旧没有渲染完成,则退出 +*/ +func (tab *Tab) getBodyNodeId() bool { + var docNodeIDs []cdp.NodeID + ctx := tab.GetExecutor() + tCtx, cancel := context.WithTimeout(ctx, time.Second*3) + defer cancel() + // 获取 Frame document root + err := chromedp.NodeIDs(`body`, &docNodeIDs, 
chromedp.ByQuery).Do(tCtx) + if len(docNodeIDs) == 0 || err != nil { + // not root node yet? + logger.Logger.Debug("getBodyNodeId failed, maybe DOM not ready?") + if err != nil { + logger.Logger.Debug(err) + } + return false + } + tab.DocBodyNodeId = docNodeIDs[0] + return true +} + +/** +自动化填充表单 +*/ +func (tab *Tab) fillForm() { + defer tab.domWG.Done() + logger.Logger.Debug("fillForm start") + tab.fillFormWG.Add(3) + f := FillForm{ + tab: tab, + } + + go f.fillInput() + go f.fillMultiSelect() + go f.fillTextarea() + + tab.fillFormWG.Wait() + logger.Logger.Debug("fillForm end") +} + +/** +设置Dom节点变化的观察函数 +*/ +func (tab *Tab) setObserverJS() { + defer tab.domWG.Done() + logger.Logger.Debug("setObserverJS start") + // 设置Dom节点变化的观察函数 + go tab.Evaluate(js.ObserverJS) + logger.Logger.Debug("setObserverJS end") +} + +type FillForm struct { + tab *Tab +} + +/** +填充所有 input 标签 +*/ +func (f *FillForm) fillInput() { + defer f.tab.fillFormWG.Done() + var nodes []*cdp.Node + ctx := f.tab.GetExecutor() + + tCtx, cancel := context.WithTimeout(ctx, time.Second*2) + defer cancel() + // 首先判断input标签是否存在,减少等待时间 提前退出 + inputNodes, inputErr := f.tab.GetNodeIDs(`input`) + if inputErr != nil || len(inputNodes) == 0 { + logger.Logger.Debug("fillInput: get form input element err") + if inputErr != nil { + logger.Logger.Debug(inputErr) + } + return + } + // 获取所有的input标签 + err := chromedp.Nodes(`input`, &nodes, chromedp.ByQueryAll).Do(tCtx) + + if err != nil { + logger.Logger.Debug("get all input element err") + logger.Logger.Debug(err) + return + } + + // 找出 type 为空 或者 type=text + for _, node := range nodes { + // 兜底超时 + tCtxN, cancelN := context.WithTimeout(ctx, time.Second*5) + attrType := node.AttributeValue("type") + if attrType == "text" || attrType == "" { + inputName := node.AttributeValue("id") + node.AttributeValue("class") + node.AttributeValue("name") + value := f.GetMatchInputText(inputName) + var nodeIds = []cdp.NodeID{node.NodeID} + // 先使用模拟输入 + _ = chromedp.SendKeys(nodeIds, 
value, chromedp.ByNodeID).Do(tCtxN) + // 再直接赋值JS属性 + _ = chromedp.SetAttributeValue(nodeIds, "value", value, chromedp.ByNodeID).Do(tCtxN) + } else if attrType == "email" || attrType == "password" || attrType == "tel" { + value := f.GetMatchInputText(attrType) + var nodeIds = []cdp.NodeID{node.NodeID} + // 先使用模拟输入 + _ = chromedp.SendKeys(nodeIds, value, chromedp.ByNodeID).Do(tCtxN) + // 再直接赋值JS属性 + _ = chromedp.SetAttributeValue(nodeIds, "value", value, chromedp.ByNodeID).Do(tCtxN) + } else if attrType == "radio" || attrType == "checkbox" { + var nodeIds = []cdp.NodeID{node.NodeID} + _ = chromedp.SetAttributeValue(nodeIds, "checked", "true", chromedp.ByNodeID).Do(tCtxN) + } else if attrType == "file" || attrType == "image" { + var nodeIds = []cdp.NodeID{node.NodeID} + wd, _ := os.Getwd() + filePath := wd + "/upload/image.png" + _ = chromedp.RemoveAttribute(nodeIds, "accept", chromedp.ByNodeID).Do(tCtxN) + _ = chromedp.RemoveAttribute(nodeIds, "required", chromedp.ByNodeID).Do(tCtxN) + _ = chromedp.SendKeys(nodeIds, filePath, chromedp.ByNodeID).Do(tCtxN) + } + cancelN() + } +} + +func (f *FillForm) fillTextarea() { + defer f.tab.fillFormWG.Done() + ctx := f.tab.GetExecutor() + tCtx, cancel := context.WithTimeout(ctx, time.Second*2) + defer cancel() + value := f.GetMatchInputText("other") + + textareaNodes, textareaErr := f.tab.GetNodeIDs(`textarea`) + if textareaErr != nil || len(textareaNodes) == 0 { + logger.Logger.Debug("fillTextarea: get textarea element err") + if textareaErr != nil { + logger.Logger.Debug(textareaErr) + } + return + } + + _ = chromedp.SendKeys(textareaNodes, value, chromedp.ByNodeID).Do(tCtx) +} + +func (f *FillForm) fillMultiSelect() { + defer f.tab.fillFormWG.Done() + ctx := f.tab.GetExecutor() + tCtx, cancel := context.WithTimeout(ctx, time.Second*2) + defer cancel() + optionNodes, optionErr := f.tab.GetNodeIDs(`select option:first-child`) + if optionErr != nil || len(optionNodes) == 0 { + logger.Logger.Debug("fillMultiSelect: get select 
option element err") + if optionErr != nil { + logger.Logger.Debug(optionErr) + } + return + } + _ = chromedp.SetAttributeValue(optionNodes, "selected", "true", chromedp.ByNodeID).Do(tCtx) + _ = chromedp.SetJavascriptAttribute(optionNodes, "selected", "true", chromedp.ByNodeID).Do(tCtx) +} + +func (f *FillForm) GetMatchInputText(name string) string { + // 如果自定义了关键词,模糊匹配 + for key, value := range f.tab.config.CustomFormKeywordValues { + if strings.Contains(name, key) { + return value + } + } + + name = strings.ToLower(name) + for key, item := range config.InputTextMap { + for _, keyword := range item["keyword"].([]string) { + if strings.Contains(name, keyword) { + if customValue, ok := f.tab.config.CustomFormValues[key]; ok { + return customValue + } else { + return item["value"].(string) + } + } + } + } + return f.tab.config.CustomFormValues["default"] +} diff --git a/pkg/engine/after_loaded_tasks.go b/pkg/engine/after_loaded_tasks.go new file mode 100755 index 0000000..a6d6a46 --- /dev/null +++ b/pkg/engine/after_loaded_tasks.go @@ -0,0 +1,206 @@ +package engine + +import ( + "context" + "crawlergo/pkg/config" + "crawlergo/pkg/js" + "crawlergo/pkg/logger" + "crawlergo/pkg/tools" + "fmt" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/chromedp" + "time" +) + +/** +根据NODE节点执行JS的代码 +err := EvaluateAsDevTools(snippet(submitJS, cashX(true), sel, nodes[0]), &res).Do(ctx) + +具体环境实现在 chromedp.submit 函数中 参考即可写出 +*/ + +/** +在页面Loaded之后执行 +同时等待 afterDOMRun 之后执行 +*/ +func (tab *Tab) AfterLoadedRun() { + defer tab.WG.Done() + logger.Logger.Debug("afterLoadedRun start") + tab.formSubmitWG.Add(2) + tab.loadedWG.Add(3) + tab.removeLis.Add(1) + + go tab.formSubmit() + tab.formSubmitWG.Wait() + logger.Logger.Debug("formSubmit end") + + if tab.config.EventTriggerMode == config.EventTriggerAsync { + go tab.triggerJavascriptProtocol() + go tab.triggerInlineEvents() + go tab.triggerDom2Events() + tab.loadedWG.Wait() + } else if tab.config.EventTriggerMode == 
config.EventTriggerSync { + tab.triggerInlineEvents() + time.Sleep(tab.config.EventTriggerInterval) + tab.triggerDom2Events() + time.Sleep(tab.config.EventTriggerInterval) + tab.triggerJavascriptProtocol() + } + + // 事件触发之后 需要等待一点时间让浏览器成功发出ajax请求 更新DOM + time.Sleep(tab.config.BeforeExitDelay) + + go tab.RemoveDOMListener() + tab.removeLis.Wait() + logger.Logger.Debug("afterLoadedRun end") +} + +/** +自动化点击提交表单 +*/ +func (tab *Tab) formSubmit() { + + logger.Logger.Debug("formSubmit start") + + // 首先对form表单设置target + tab.setFormToFrame() + + // 接下来尝试三种方式提交表单 + go tab.clickSubmit() + go tab.clickAllButton() +} + +/** +设置form的target指向一个frame +*/ +func (tab *Tab) setFormToFrame() { + // 首先新建 frame + nameStr := tools.RandSeq(8) + tab.Evaluate(fmt.Sprintf(js.NewFrameTemplate, nameStr, nameStr)) + + // 接下来将所有的 form 节点target都指向它 + ctx := tab.GetExecutor() + formNodes, formErr := tab.GetNodeIDs(`form`) + if formErr != nil || len(formNodes) == 0 { + logger.Logger.Debug("setFormToFrame: get form element err") + if formErr != nil { + logger.Logger.Debug(formErr) + } + return + } + tCtx, cancel := context.WithTimeout(ctx, time.Second*2) + defer cancel() + _ = chromedp.SetAttributeValue(formNodes, "target", nameStr, chromedp.ByNodeID).Do(tCtx) +} + +/** +点击按钮 type=submit +*/ +func (tab *Tab) clickSubmit() { + defer tab.formSubmitWG.Done() + + // 首先点击按钮 type=submit + ctx := tab.GetExecutor() + + // 获取所有的form节点 直接执行submit + formNodes, formErr := tab.GetNodeIDs(`form`) + if formErr != nil || len(formNodes) == 0 { + logger.Logger.Debug("clickSubmit: get form element err") + if formErr != nil { + logger.Logger.Debug(formErr) + } + return + } + tCtx1, cancel1 := context.WithTimeout(ctx, time.Second*2) + defer cancel1() + _ = chromedp.Submit(formNodes, chromedp.ByNodeID).Do(tCtx1) + + // 获取所有的input标签 + inputNodes, inputErr := tab.GetNodeIDs(`form input[type=submit]`) + if inputErr != nil || len(inputNodes) == 0 { + logger.Logger.Debug("clickSubmit: get form input element err") + if 
inputErr != nil { + logger.Logger.Debug(inputErr) + } + return + } + tCtx2, cancel2 := context.WithTimeout(ctx, time.Second*2) + defer cancel2() + _ = chromedp.Click(inputNodes, chromedp.ByNodeID).Do(tCtx2) +} + +/** +click all button +*/ +func (tab *Tab) clickAllButton() { + defer tab.formSubmitWG.Done() + + // 获取所有的form中的button节点 + ctx := tab.GetExecutor() + // 获取所有的button标签 + btnNodeIDs, bErr := tab.GetNodeIDs(`form button`) + if bErr != nil || len(btnNodeIDs) == 0 { + logger.Logger.Debug("clickAllButton: get form button element err") + if bErr != nil { + logger.Logger.Debug(bErr) + } + return + } + tCtx, cancel1 := context.WithTimeout(ctx, time.Second*2) + defer cancel1() + _ = chromedp.Click(btnNodeIDs, chromedp.ByNodeID).Do(tCtx) + + // 使用JS的click方法进行点击 + var btnNodes []*cdp.Node + tCtx2, cancel2 := context.WithTimeout(ctx, time.Second*2) + defer cancel2() + err := chromedp.Nodes(btnNodeIDs, &btnNodes, chromedp.ByNodeID).Do(tCtx2) + if err != nil { + return + } + for _, node := range btnNodes { + _ = tab.EvaluateWithNode(js.FormNodeClickJS, node) + } +} + +/** +触发内联事件 +*/ +func (tab *Tab) triggerInlineEvents() { + defer tab.loadedWG.Done() + logger.Logger.Debug("triggerInlineEvents start") + tab.Evaluate(fmt.Sprintf(js.TriggerInlineEventJS, tab.config.EventTriggerInterval.Seconds()*1000)) + logger.Logger.Debug("triggerInlineEvents end") +} + +/** +触发DOM2级事件 +*/ +func (tab *Tab) triggerDom2Events() { + defer tab.loadedWG.Done() + logger.Logger.Debug("triggerDom2Events start") + tab.Evaluate(fmt.Sprintf(js.TriggerDom2EventJS, tab.config.EventTriggerInterval.Seconds()*1000)) + logger.Logger.Debug("triggerDom2Events end") +} + +/** +a标签的href值为伪协议, +*/ +func (tab *Tab) triggerJavascriptProtocol() { + defer tab.loadedWG.Done() + logger.Logger.Debug("clickATagJavascriptProtocol start") + tab.Evaluate(fmt.Sprintf(js.TriggerJavascriptProtocol, tab.config.EventTriggerInterval.Seconds()*1000, + tab.config.EventTriggerInterval.Seconds()*1000)) + 
logger.Logger.Debug("clickATagJavascriptProtocol end") +} + +/** +移除DOM节点变化监听 +*/ +func (tab *Tab) RemoveDOMListener() { + defer tab.removeLis.Done() + logger.Logger.Debug("RemoveDOMListener start") + // 移除DOM节点变化监听 + tab.Evaluate(js.RemoveDOMListenerJS) + logger.Logger.Debug("RemoveDOMListener end") +} diff --git a/pkg/engine/browser.go b/pkg/engine/browser.go new file mode 100755 index 0000000..849efb5 --- /dev/null +++ b/pkg/engine/browser.go @@ -0,0 +1,110 @@ +package engine + +import ( + "context" + "crawlergo/pkg/logger" + "log" + "sync" + "time" + + "github.com/chromedp/cdproto/browser" + "github.com/chromedp/chromedp" +) + +type Browser struct { + Ctx *context.Context + Cancel *context.CancelFunc + tabs []*context.Context + tabCancels []context.CancelFunc + ExtraHeaders map[string]interface{} + lock sync.Mutex +} + +func init() { + +} + +func InitBrowser(chromiumPath string, incognito bool, extraHeaders map[string]interface{}, proxy string, noHeadless bool) *Browser { + var bro Browser + opts := append(chromedp.DefaultExecAllocatorOptions[:], + + // 执行路径 + chromedp.ExecPath(chromiumPath), + // 无头模式 + chromedp.Flag("headless", !noHeadless), + // 禁用GPU,不显示GUI + chromedp.Flag("disable-gpu", true), + // 隐身模式启动 + chromedp.Flag("incognito", incognito), + // 取消沙盒模式 + chromedp.Flag("no-sandbox", true), + // 忽略证书错误 + chromedp.Flag("ignore-certificate-errors", true), + + chromedp.Flag("disable-images", true), + // + chromedp.Flag("disable-web-security", true), + // + chromedp.Flag("disable-xss-auditor", true), + // + chromedp.Flag("disable-setuid-sandbox", true), + + chromedp.Flag("allow-running-insecure-content", true), + + chromedp.Flag("disable-webgl", true), + + chromedp.Flag("disable-popup-blocking", true), + + chromedp.WindowSize(1920, 1080), + ) + // 设置浏览器代理 + if proxy != "" { + opts = append(opts, chromedp.ProxyServer(proxy)) + } + + allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
+ bctx, _ := chromedp.NewContext(allocCtx, + chromedp.WithLogf(log.Printf), + ) + // https://github.com/chromedp/chromedp/issues/824#issuecomment-845664441 + // 如果需要在一个浏览器上创建多个tab,则需要先创建浏览器的上下文,即运行下面的语句 + chromedp.Run(bctx) + bro.Cancel = &cancel + bro.Ctx = &bctx + bro.ExtraHeaders = extraHeaders + return &bro +} + +func (bro *Browser) NewTab(timeout time.Duration) (*context.Context, context.CancelFunc) { + bro.lock.Lock() + ctx, cancel := chromedp.NewContext(*bro.Ctx) + //defer cancel() + tCtx, _ := context.WithTimeout(ctx, timeout) + bro.tabs = append(bro.tabs, &tCtx) + bro.tabCancels = append(bro.tabCancels, cancel) + //defer cancel2() + bro.lock.Unlock() + + //return bro.Ctx, &cancel + return &tCtx, cancel +} + +func (bro *Browser) Close() { + logger.Logger.Info("closing browser.") + for _, cancel := range bro.tabCancels { + cancel() + } + + for _, ctx := range bro.tabs { + err := browser.Close().Do(*ctx) + if err != nil { + logger.Logger.Debug(err) + } + } + + err := browser.Close().Do(*bro.Ctx) + if err != nil { + logger.Logger.Debug(err) + } + (*bro.Cancel)() +} diff --git a/pkg/engine/collect_links.go b/pkg/engine/collect_links.go new file mode 100755 index 0000000..b371336 --- /dev/null +++ b/pkg/engine/collect_links.go @@ -0,0 +1,73 @@ +package engine + +import ( + "context" + "crawlergo/pkg/config" + "crawlergo/pkg/logger" + "fmt" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/chromedp" + "regexp" + "time" +) + +/** +最后收集所有的链接 +*/ +func (tab *Tab) collectLinks() { + go tab.collectHrefLinks() + go tab.collectObjectLinks() + go tab.collectCommentLinks() +} + +func (tab *Tab) collectHrefLinks() { + defer tab.collectLinkWG.Done() + ctx := tab.GetExecutor() + // 收集 src href data-url 属性值 + attrNameList := []string{"src", "href", "data-url", "data-href"} + for _, attrName := range attrNameList { + tCtx, cancel := context.WithTimeout(ctx, time.Second*1) + var attrs []map[string]string + _ = chromedp.AttributesAll(fmt.Sprintf(`[%s]`, attrName), 
&attrs, chromedp.ByQueryAll).Do(tCtx) + cancel() + for _, attrMap := range attrs { + tab.AddResultUrl(config.GET, attrMap[attrName], config.FromDOM) + } + } +} + +func (tab *Tab) collectObjectLinks() { + defer tab.collectLinkWG.Done() + ctx := tab.GetExecutor() + // 收集 object[data] links + tCtx, cancel := context.WithTimeout(ctx, time.Second*1) + defer cancel() + var attrs []map[string]string + _ = chromedp.AttributesAll(`object[data]`, &attrs, chromedp.ByQueryAll).Do(tCtx) + for _, attrMap := range attrs { + tab.AddResultUrl(config.GET, attrMap["data"], config.FromDOM) + } +} + +func (tab *Tab) collectCommentLinks() { + defer tab.collectLinkWG.Done() + ctx := tab.GetExecutor() + // 收集注释中的链接 + var nodes []*cdp.Node + tCtxComment, cancel := context.WithTimeout(ctx, time.Second*1) + defer cancel() + commentErr := chromedp.Nodes(`//comment()`, &nodes, chromedp.BySearch).Do(tCtxComment) + if commentErr != nil { + logger.Logger.Debug("get comment nodes err") + logger.Logger.Debug(commentErr) + return + } + urlRegex := regexp.MustCompile(config.URLRegex) + for _, node := range nodes { + content := node.NodeValue + urlList := urlRegex.FindAllString(content, -1) + for _, url := range urlList { + tab.AddResultUrl(config.GET, url, config.FromComment) + } + } +} diff --git a/pkg/engine/intercept_request.go b/pkg/engine/intercept_request.go new file mode 100755 index 0000000..38a4f7d --- /dev/null +++ b/pkg/engine/intercept_request.go @@ -0,0 +1,296 @@ +package engine + +import ( + "bufio" + "context" + "crawlergo/pkg/config" + "crawlergo/pkg/logger" + model2 "crawlergo/pkg/model" + "crawlergo/pkg/tools" + "crawlergo/pkg/tools/requests" + "encoding/base64" + "github.com/chromedp/cdproto/fetch" + "github.com/chromedp/cdproto/network" + "io" + "net/textproto" + "regexp" + "strconv" + "strings" + "time" +) + +/** +处理每一个HTTP请求 +*/ +func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) { + defer tab.WG.Done() + ctx := tab.GetExecutor() + _req := v.Request + // 拦截到的URL格式一定正常 
不处理错误
	url, err := model2.GetUrl(_req.URL, *tab.NavigateReq.URL)
	if err != nil {
		logger.Logger.Debug("InterceptRequest parse url failed: ", err)
		_ = fetch.ContinueRequest(v.RequestID).Do(ctx)
		return
	}
	_option := model2.Options{
		Headers:  _req.Headers,
		PostData: _req.PostData,
	}
	req := model2.GetRequest(_req.Method, url, _option)

	// requests matching an ignore keyword are blocked but still recorded
	if IsIgnoredByKeywordMatch(req, tab.config.IgnoreKeywords) {
		_ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
		req.Source = config.FromXHR
		tab.AddResultRequest(req)
		return
	}

	tab.HandleHostBinding(&req)

	// block all static resources outright
	for _, suffix := range config.StaticSuffix {
		if strings.HasSuffix(strings.ToLower(url.Path), suffix) {
			_ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
			req.Source = config.FromStaticRes
			tab.AddResultRequest(req)
			return
		}
	}

	// navigation requests get special treatment
	if tab.IsNavigatorRequest(v.NetworkID.String()) {
		tab.NavNetworkID = v.NetworkID.String()
		tab.HandleNavigationReq(&req, v)
		req.Source = config.FromNavigation
		tab.AddResultRequest(req)
		return
	}

	req.Source = config.FromXHR
	tab.AddResultRequest(req)
	_ = fetch.ContinueRequest(v.RequestID).Do(ctx)
}

// IsNavigatorRequest reports whether the network ID belongs to the top navigation.
func (tab *Tab) IsNavigatorRequest(networkID string) bool {
	return networkID == tab.LoaderID
}

// HandleAuthRequired auto-answers 401/407 auth dialogs so the page never blocks.
func (tab *Tab) HandleAuthRequired(req *fetch.EventAuthRequired) {
	defer tab.WG.Done()
	logger.Logger.Debug("auth required found, auto auth.")
	ctx := tab.GetExecutor()
	authRes := fetch.AuthChallengeResponse{
		Response: fetch.AuthChallengeResponseResponseProvideCredentials,
		Username: "Crawlergo",
		Password: "Crawlergo",
	}
	// answer (i.e. dismiss) the authentication challenge
	_ = fetch.ContinueWithAuth(req.RequestID, &authRes).Do(ctx)
}

// HandleNavigationReq rewrites or fulfills navigation requests.
func (tab *Tab) HandleNavigationReq(req *model2.Request, v *fetch.EventRequestPaused) {
	navReq := tab.NavigateReq
	ctx := tab.GetExecutor()
	tCtx, cancel := context.WithTimeout(ctx,
time.Second*5) + defer cancel() + overrideReq := fetch.ContinueRequest(v.RequestID).WithURL(req.URL.String()) + + // 处理后端重定向请求 + if tab.FoundRedirection && tab.IsTopFrame(v.FrameID.String()) { + logger.Logger.Debug("redirect navigation req: " + req.URL.String()) + //_ = fetch.FailRequest(v.RequestID, network.ErrorReasonConnectionAborted).Do(ctx) + body := base64.StdEncoding.EncodeToString([]byte(`
Crawlergo`)) + param := fetch.FulfillRequest(v.RequestID, 200).WithBody(body) + err := param.Do(ctx) + if err != nil { + logger.Logger.Debug(err) + } + navReq.RedirectionFlag = true + navReq.Source = config.FromNavigation + tab.AddResultRequest(navReq) + // 处理重定向标记 + } else if navReq.RedirectionFlag && tab.IsTopFrame(v.FrameID.String()) { + navReq.RedirectionFlag = false + logger.Logger.Debug("has redirection_flag: " + req.URL.String()) + headers := tools.ConvertHeaders(req.Headers) + headers["Range"] = "bytes=0-1048576" + res, err := requests.Request(req.Method, req.URL.String(), headers, []byte(req.PostData), &requests.ReqOptions{ + AllowRedirect: false, Proxy: tab.config.Proxy}) + if err != nil { + logger.Logger.Debug(err) + _ = fetch.FailRequest(v.RequestID, network.ErrorReasonConnectionAborted).Do(ctx) + return + } + body := base64.StdEncoding.EncodeToString([]byte(res.Text)) + param := fetch.FulfillRequest(v.RequestID, 200).WithResponseHeaders(ConvertHeadersNoLocation(res.Header)).WithBody(body) + errR := param.Do(ctx) + if errR != nil { + logger.Logger.Debug(errR) + } + // 主导航请求 + } else if tab.IsTopFrame(v.FrameID.String()) && req.URL.NavigationUrl() == navReq.URL.NavigationUrl() { + logger.Logger.Debug("main navigation req: " + navReq.URL.String()) + // 手动设置POST信息 + if navReq.Method == config.POST || navReq.Method == config.PUT { + overrideReq = overrideReq.WithPostData(navReq.PostData) + } + overrideReq = overrideReq.WithMethod(navReq.Method) + overrideReq = overrideReq.WithHeaders(MergeHeaders(navReq.Headers, req.Headers)) + _ = overrideReq.Do(tCtx) + // 子frame的导航 + } else if !tab.IsTopFrame(v.FrameID.String()) { + _ = overrideReq.Do(tCtx) + // 前端跳转 返回204 + } else { + _ = fetch.FulfillRequest(v.RequestID, 204).Do(ctx) + } +} + +/** +处理Host绑定 +*/ +func (tab *Tab) HandleHostBinding(req *model2.Request) { + url := req.URL + navUrl := tab.NavigateReq.URL + // 导航请求的域名和HOST绑定中的域名不同,且当前请求的domain和导航请求header中的Host相同,则替换当前请求的domain并绑定Host + if host, ok := 
tab.NavigateReq.Headers["Host"]; ok { + if navUrl.Hostname() != host && url.Host == host { + urlObj, _ := model2.GetUrl(strings.Replace(req.URL.String(), "://"+url.Hostname(), "://"+navUrl.Hostname(), -1), *navUrl) + req.URL = urlObj + req.Headers["Host"] = host + + } else if navUrl.Hostname() != host && url.Host == navUrl.Host { + req.Headers["Host"] = host + } + // 修正Origin + if _, ok := req.Headers["Origin"]; ok { + req.Headers["Origin"] = strings.Replace(req.Headers["Origin"].(string), navUrl.Host, host.(string), 1) + } + // 修正Referer + if _, ok := req.Headers["Referer"]; ok { + req.Headers["Referer"] = strings.Replace(req.Headers["Referer"].(string), navUrl.Host, host.(string), 1) + } else { + req.Headers["Referer"] = strings.Replace(navUrl.String(), navUrl.Host, host.(string), 1) + } + } +} + +func (tab *Tab) IsTopFrame(FrameID string) bool { + return FrameID == tab.TopFrameId +} + +/** +解析响应内容中的URL 使用正则匹配 +*/ +func (tab *Tab) ParseResponseURL(v *network.EventResponseReceived) { + defer tab.WG.Done() + ctx := tab.GetExecutor() + res, err := network.GetResponseBody(v.RequestID).Do(ctx) + if err != nil { + logger.Logger.Debug("ParseResponseURL ", err) + return + } + resStr := string(res) + + urlRegex := regexp.MustCompile(config.SuspectURLRegex) + urlList := urlRegex.FindAllString(resStr, -1) + for _, url := range urlList { + + url = url[1 : len(url)-1] + url_lower := strings.ToLower(url) + if strings.HasPrefix(url_lower, "image/x-icon") || strings.HasPrefix(url_lower, "text/css") || strings.HasPrefix(url_lower, "text/javascript") { + continue + } + + tab.AddResultUrl(config.GET, url, config.FromJSFile) + } +} + +func (tab *Tab) HandleRedirectionResp(v *network.EventResponseReceivedExtraInfo) { + defer tab.WG.Done() + statusCode := tab.GetStatusCode(v.HeadersText) + // 导航请求,且返回重定向 + if 300 <= statusCode && statusCode < 400 { + logger.Logger.Debug("set redirect flag.") + tab.FoundRedirection = true + } +} + +func (tab *Tab) GetContentCharset(v 
*network.EventResponseReceived) { + defer tab.WG.Done() + var getCharsetRegex = regexp.MustCompile("charset=(.+)$") + for key, value := range v.Response.Headers { + if key == "Content-Type" { + value := value.(string) + if strings.Contains(value, "charset") { + value = getCharsetRegex.FindString(value) + value = strings.ToUpper(strings.Replace(value, "charset=", "", -1)) + tab.PageCharset = value + tab.PageCharset = strings.TrimSpace(tab.PageCharset) + } + } + } +} + +func (tab *Tab) GetStatusCode(headerText string) int { + rspInput := strings.NewReader(headerText) + rspBuf := bufio.NewReader(rspInput) + tp := textproto.NewReader(rspBuf) + line, err := tp.ReadLine() + if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return 0 + } + parts := strings.Split(line, " ") + if len(parts) < 3 { + return 0 + } + code, _ := strconv.Atoi(parts[1]) + return code +} + +func MergeHeaders(navHeaders map[string]interface{}, headers map[string]interface{}) []*fetch.HeaderEntry { + var mergedHeaders []*fetch.HeaderEntry + for key, value := range navHeaders { + if _, ok := headers[key]; !ok { + var header fetch.HeaderEntry + header.Name = key + header.Value = value.(string) + mergedHeaders = append(mergedHeaders, &header) + } + } + + for key, value := range headers { + var header fetch.HeaderEntry + header.Name = key + header.Value = value.(string) + mergedHeaders = append(mergedHeaders, &header) + } + return mergedHeaders +} + +func ConvertHeadersNoLocation(h map[string][]string) []*fetch.HeaderEntry { + var headers []*fetch.HeaderEntry + for key, value := range h { + if key == "Location" { + continue + } + var header fetch.HeaderEntry + header.Name = key + header.Value = value[0] + headers = append(headers, &header) + } + return headers +} diff --git a/pkg/engine/tab.go b/pkg/engine/tab.go new file mode 100755 index 0000000..4a2890d --- /dev/null +++ b/pkg/engine/tab.go @@ -0,0 +1,443 @@ +package engine + +import ( + "context" + "crawlergo/pkg/config" + 
"crawlergo/pkg/js" + "crawlergo/pkg/logger" + model2 "crawlergo/pkg/model" + "encoding/json" + "fmt" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/cdproto/dom" + "github.com/chromedp/cdproto/fetch" + "github.com/chromedp/cdproto/network" + "github.com/chromedp/cdproto/page" + "github.com/chromedp/cdproto/runtime" + "github.com/chromedp/chromedp" + "github.com/gogf/gf/encoding/gcharset" + "regexp" + "strings" + "sync" + "time" +) + +type Tab struct { + Ctx *context.Context + Cancel context.CancelFunc + NavigateReq model2.Request + ExtraHeaders map[string]interface{} + ResultList []*model2.Request + TopFrameId string + LoaderID string + NavNetworkID string + PageCharset string + PageBindings map[string]interface{} + NavDone chan int + FoundRedirection bool + DocBodyNodeId cdp.NodeID + config TabConfig + + lock sync.Mutex + + WG sync.WaitGroup //当前Tab页的等待同步计数 + collectLinkWG sync.WaitGroup + loadedWG sync.WaitGroup //Loaded之后的等待计数 + formSubmitWG sync.WaitGroup //表单提交完毕的等待计数 + removeLis sync.WaitGroup //移除事件监听 + domWG sync.WaitGroup //DOMContentLoaded 的等待计数 + fillFormWG sync.WaitGroup //填充表单任务 +} + +type TabConfig struct { + TabRunTimeout time.Duration + DomContentLoadedTimeout time.Duration + EventTriggerMode string // 事件触发的调用方式: 异步 或 顺序 + EventTriggerInterval time.Duration // 事件触发的间隔 单位毫秒 + BeforeExitDelay time.Duration // 退出前的等待时间,等待DOM渲染,等待XHR发出捕获 + EncodeURLWithCharset bool + IgnoreKeywords []string // + Proxy string + CustomFormValues map[string]string + CustomFormKeywordValues map[string]string +} + +type bindingCallPayload struct { + Name string `json:"name"` + Seq int `json:"seq"` + Args []string `json:"args"` +} + +func NewTab(browser *Browser, navigateReq model2.Request, config TabConfig) *Tab { + var tab Tab + tab.ExtraHeaders = map[string]interface{}{} + var DOMContentLoadedRun = false + tab.Ctx, tab.Cancel = browser.NewTab(config.TabRunTimeout) + for key, value := range browser.ExtraHeaders { + navigateReq.Headers[key] = value + if key != 
"Host" { + tab.ExtraHeaders[key] = value + } + } + tab.NavigateReq = navigateReq + tab.config = config + tab.NavDone = make(chan int) + tab.DocBodyNodeId = 0 + + // 设置请求拦截监听 + chromedp.ListenTarget(*tab.Ctx, func(v interface{}) { + switch v := v.(type) { + // 根据不同的事件 选择执行对应的动作 + case *network.EventRequestWillBeSent: + if string(v.RequestID) == string(v.LoaderID) && v.Type == "Document" && tab.TopFrameId == "" { + tab.LoaderID = string(v.LoaderID) + tab.TopFrameId = string(v.FrameID) + } + + // 请求发出时暂停 即 请求拦截 + case *fetch.EventRequestPaused: + tab.WG.Add(1) + go tab.InterceptRequest(v) + + // 解析所有JS文件中的URL并添加到结果中 + // 解析HTML文档中的URL + // 查找当前页面的编码 + case *network.EventResponseReceived: + if v.Response.MimeType == "application/javascript" || v.Response.MimeType == "text/html" || v.Response.MimeType == "application/json" { + tab.WG.Add(1) + go tab.ParseResponseURL(v) + } + if v.RequestID.String() == tab.NavNetworkID { + tab.WG.Add(1) + go tab.GetContentCharset(v) + } + // 处理后端重定向 3XX + case *network.EventResponseReceivedExtraInfo: + if v.RequestID.String() == tab.NavNetworkID { + tab.WG.Add(1) + go tab.HandleRedirectionResp(v) + } + //case *network.EventLoadingFailed: + // logger.Logger.Error("EventLoadingFailed ", v.ErrorText) + // 401 407 要求认证 此时会阻塞当前页面 需要处理解决 + case *fetch.EventAuthRequired: + tab.WG.Add(1) + go tab.HandleAuthRequired(v) + + // DOMContentLoaded + // 开始执行表单填充 和 执行DOM节点观察函数 + // 只执行一次 + case *page.EventDomContentEventFired: + if DOMContentLoadedRun { + return + } + DOMContentLoadedRun = true + tab.WG.Add(1) + go tab.AfterDOMRun() + // Loaded + case *page.EventLoadEventFired: + if DOMContentLoadedRun { + return + } + DOMContentLoadedRun = true + tab.WG.Add(1) + go tab.AfterDOMRun() + + // close Dialog + case *page.EventJavascriptDialogOpening: + tab.WG.Add(1) + go tab.dismissDialog() + + // handle expose function + case *runtime.EventBindingCalled: + tab.WG.Add(1) + go tab.HandleBindingCalled(v) + } + }) + + return &tab +} + +/** + + */ +func 
waitNavigateDone(ctx context.Context) error { + ch := make(chan struct{}) + lCtx, lCancel := context.WithCancel(ctx) + tCtx, cancel := context.WithTimeout(ctx, config.DomContentLoadedTimeout) + defer cancel() + chromedp.ListenTarget(lCtx, func(ev interface{}) { + if _, ok := ev.(*page.EventDomContentEventFired); ok { + lCancel() + close(ch) + } else if _, ok := ev.(*page.EventLoadEventFired); ok { + lCancel() + close(ch) + } + }) + select { + case <-ch: + return nil + case <-ctx.Done(): + return ctx.Err() + case <-tCtx.Done(): + return tCtx.Err() + } +} + +func (tab *Tab) Start() { + logger.Logger.Info("Crawling " + tab.NavigateReq.Method + " " + tab.NavigateReq.URL.String()) + defer tab.Cancel() + if err := chromedp.Run(*tab.Ctx, + RunWithTimeOut(tab.Ctx, tab.config.DomContentLoadedTimeout, chromedp.Tasks{ + // + runtime.Enable(), + // 开启网络层API + network.Enable(), + // 开启请求拦截API + fetch.Enable().WithHandleAuthRequests(true), + // 添加回调函数绑定 + // XSS-Scan 使用的回调 + runtime.AddBinding("addLink"), + runtime.AddBinding("Test"), + // 初始化执行JS + chromedp.ActionFunc(func(ctx context.Context) error { + var err error + _, err = page.AddScriptToEvaluateOnNewDocument(js.TabInitJS).Do(ctx) + if err != nil { + return err + } + return nil + }), + network.SetExtraHTTPHeaders(tab.ExtraHeaders), + // 执行导航 + //chromedp.Navigate(tab.NavigateReq.URL.String()), + chromedp.ActionFunc(func(ctx context.Context) error { + _, _, _, err := page.Navigate(tab.NavigateReq.URL.String()).Do(ctx) + if err != nil { + return err + } + return waitNavigateDone(ctx) + }), + }), + ); err != nil { + if err.Error() == "context canceled" { + return + } + logger.Logger.Warn("navigate timeout ", tab.NavigateReq.URL.String()) + } + + go func() { + // 等待所有协程任务结束 + tab.WG.Wait() + tab.NavDone <- 1 + }() + + select { + case <-tab.NavDone: + logger.Logger.Debug("all navigation tasks done.") + case <-time.After(tab.config.DomContentLoadedTimeout + time.Second*10): + logger.Logger.Warn("navigation tasks TIMEOUT.") + } 
+ + // 等待收集所有链接 + logger.Logger.Debug("collectLinks start.") + tab.collectLinkWG.Add(3) + go tab.collectLinks() + tab.collectLinkWG.Wait() + logger.Logger.Debug("collectLinks end.") + + // 识别页面编码 并编码所有URL + if tab.config.EncodeURLWithCharset { + tab.DetectCharset() + tab.EncodeAllURLWithCharset() + } + + //fmt.Println(tab.NavigateReq.URL.String(), len(tab.ResultList)) + //for _, v := range tab.ResultList { + // v.SimplePrint() + //} + // fmt.Println("Finished " + tab.NavigateReq.Method + " " + tab.NavigateReq.URL.String()) +} + +func RunWithTimeOut(ctx *context.Context, timeout time.Duration, tasks chromedp.Tasks) chromedp.ActionFunc { + return func(ctx context.Context) error { + timeoutContext, _ := context.WithTimeout(ctx, timeout) + //defer cancel() + return tasks.Do(timeoutContext) + } +} + +/** +添加收集到的URL到结果列表,需要处理Host绑定 +*/ +func (tab *Tab) AddResultUrl(method string, _url string, source string) { + navUrl := tab.NavigateReq.URL + url, err := model2.GetUrl(_url, *navUrl) + if err != nil { + return + } + option := model2.Options{ + Headers: map[string]interface{}{}, + PostData: "", + } + referer := navUrl.String() + + // 处理Host绑定 + if host, ok := tab.NavigateReq.Headers["Host"]; ok { + if host != navUrl.Hostname() && url.Hostname() == host { + url, _ = model2.GetUrl(strings.Replace(url.String(), "://"+url.Hostname(), "://"+navUrl.Hostname(), -1), *navUrl) + option.Headers["Host"] = host + referer = strings.Replace(navUrl.String(), navUrl.Host, host.(string), -1) + } + } + // 添加Cookie + if cookie, ok := tab.NavigateReq.Headers["Cookie"]; ok { + option.Headers["Cookie"] = cookie + } + + // 修正Referer + option.Headers["Referer"] = referer + for key, value := range tab.ExtraHeaders { + option.Headers[key] = value + } + req := model2.GetRequest(method, url, option) + req.Source = source + + tab.lock.Lock() + tab.ResultList = append(tab.ResultList, &req) + tab.lock.Unlock() +} + +/** +添加请求到结果列表,拦截请求时处理了Host绑定,此处无需处理 +*/ +func (tab *Tab) AddResultRequest(req 
model2.Request) { + for key, value := range tab.ExtraHeaders { + req.Headers[key] = value + } + tab.lock.Lock() + tab.ResultList = append(tab.ResultList, &req) + tab.lock.Unlock() +} + +/** +获取当前标签页CDP的执行上下文 +*/ +func (tab *Tab) GetExecutor() context.Context { + c := chromedp.FromContext(*tab.Ctx) + ctx := cdp.WithExecutor(*tab.Ctx, c.Target) + return ctx +} + +/** +关闭弹窗 +*/ +func (tab *Tab) dismissDialog() { + defer tab.WG.Done() + ctx := tab.GetExecutor() + _ = page.HandleJavaScriptDialog(false).Do(ctx) +} + +/** +处理回调 +*/ +func (tab *Tab) HandleBindingCalled(event *runtime.EventBindingCalled) { + defer tab.WG.Done() + payload := []byte(event.Payload) + var bcPayload bindingCallPayload + _ = json.Unmarshal(payload, &bcPayload) + if bcPayload.Name == "addLink" && len(bcPayload.Args) > 1 { + tab.AddResultUrl(config.GET, bcPayload.Args[0], bcPayload.Args[1]) + } + if bcPayload.Name == "Test" { + fmt.Println(bcPayload.Args) + } + tab.Evaluate(fmt.Sprintf(js.DeliverResultJS, bcPayload.Name, bcPayload.Seq, "s")) +} + +/** +执行JS +*/ +func (tab *Tab) Evaluate(expression string) { + ctx := tab.GetExecutor() + tCtx, cancel := context.WithTimeout(ctx, time.Second*5) + defer cancel() + _, exception, err := runtime.Evaluate(expression).Do(tCtx) + if exception != nil { + logger.Logger.Debug("tab Evaluate: ", exception.Text) + } + if err != nil { + logger.Logger.Debug("tab Evaluate: ", err) + } +} + +/** +立即根据条件获取Nodes的ID,不等待 +*/ +func (tab *Tab) GetNodeIDs(sel string) ([]cdp.NodeID, error) { + ctx := tab.GetExecutor() + return dom.QuerySelectorAll(tab.DocBodyNodeId, sel).Do(ctx) +} + +/** +根据给的Node执行JS +*/ +func (tab *Tab) EvaluateWithNode(expression string, node *cdp.Node) error { + ctx := tab.GetExecutor() + var res bool + err := chromedp.EvaluateAsDevTools(js.Snippet(expression, js.CashX(true), "", node), &res).Do(ctx) + if err != nil { + return err + } + return nil +} + +/** +识别页面的编码 +*/ +func (tab *Tab) DetectCharset() { + ctx := tab.GetExecutor() + tCtx, cancel := 
context.WithTimeout(ctx, time.Millisecond*500) + defer cancel() + var content string + var ok bool + var getCharsetRegex = regexp.MustCompile("charset=(.+)$") + err := chromedp.AttributeValue(`meta[http-equiv=Content-Type]`, "content", &content, &ok, chromedp.ByQuery).Do(tCtx) + if err != nil || ok != true { + return + } + if strings.Contains(content, "charset=") { + charset := getCharsetRegex.FindString(content) + if charset != "" { + tab.PageCharset = strings.ToUpper(strings.Replace(charset, "charset=", "", -1)) + tab.PageCharset = strings.TrimSpace(tab.PageCharset) + } + } +} + +func (tab *Tab) EncodeAllURLWithCharset() { + if tab.PageCharset == "" || tab.PageCharset == "UTF-8" { + return + } + for _, req := range tab.ResultList { + newRawQuery, err := gcharset.UTF8To(tab.PageCharset, req.URL.RawQuery) + if err == nil { + req.URL.RawQuery = newRawQuery + } + newRawPath, err := gcharset.UTF8To(tab.PageCharset, req.URL.RawPath) + if err == nil { + req.URL.RawPath = newRawPath + } + } +} + +func IsIgnoredByKeywordMatch(req model2.Request, IgnoreKeywords []string) bool { + for _, _str := range IgnoreKeywords { + if strings.Contains(req.URL.String(), _str) { + logger.Logger.Info("ignore request: ", req.SimpleFormat()) + return true + } + } + return false +} diff --git a/pkg/filter/simple_filter.go b/pkg/filter/simple_filter.go new file mode 100755 index 0000000..71cde30 --- /dev/null +++ b/pkg/filter/simple_filter.go @@ -0,0 +1,96 @@ +package filter + +import ( + "crawlergo/pkg/config" + "crawlergo/pkg/model" + "github.com/deckarep/golang-set" + "strings" +) + +type SimpleFilter struct { + UniqueSet mapset.Set + HostLimit string +} + +/** +需要过滤则返回 true +*/ +func (s *SimpleFilter) DoFilter(req *model.Request) bool { + if s.UniqueSet == nil { + s.UniqueSet = mapset.NewSet() + } + // 首先判断是否需要过滤域名 + if s.HostLimit != "" && s.DomainFilter(req) { + return true + } + // 去重 + if s.UniqueFilter(req) { + return true + } + // 过滤静态资源 + if s.StaticFilter(req) { + return true + } 
+ return false +} + +/** +请求去重 +*/ +func (s *SimpleFilter) UniqueFilter(req *model.Request) bool { + if s.UniqueSet == nil { + s.UniqueSet = mapset.NewSet() + } + if s.UniqueSet.Contains(req.UniqueId()) { + return true + } else { + s.UniqueSet.Add(req.UniqueId()) + return false + } +} + +/** +静态资源过滤 +*/ +func (s *SimpleFilter) StaticFilter(req *model.Request) bool { + if s.UniqueSet == nil { + s.UniqueSet = mapset.NewSet() + } + // 首先将slice转换成map + extMap := map[string]int{} + staticSuffix := append(config.StaticSuffix, "js", "css", "json") + for _, suffix := range staticSuffix { + extMap[suffix] = 1 + } + + if req.URL.FileExt() == "" { + return false + } + if _, ok := extMap[req.URL.FileExt()]; ok { + return true + } + return false +} + +/** +只保留指定域名的链接 +*/ +func (s *SimpleFilter) DomainFilter(req *model.Request) bool { + if s.UniqueSet == nil { + s.UniqueSet = mapset.NewSet() + } + if req.URL.Host == s.HostLimit || req.URL.Hostname() == s.HostLimit { + return false + } + if strings.HasSuffix(s.HostLimit, ":80") && req.URL.Port() == "" && req.URL.Scheme == "http" { + if req.URL.Hostname()+":80" == s.HostLimit { + return false + } + } + if strings.HasSuffix(s.HostLimit, ":443") && req.URL.Port() == "" && req.URL.Scheme == "https" { + if req.URL.Hostname()+":443" == s.HostLimit { + return false + } + } + return true +} diff --git a/pkg/filter/smart_filter.go b/pkg/filter/smart_filter.go new file mode 100755 index 0000000..f1ae700 --- /dev/null +++ b/pkg/filter/smart_filter.go @@ -0,0 +1,646 @@ +package filter + +import ( + "crawlergo/pkg/config" + "crawlergo/pkg/logger" + "crawlergo/pkg/model" + "crawlergo/pkg/tools" + "go/types" + "regexp" + "sort" + "strings" + "sync" + + mapset "github.com/deckarep/golang-set" +) + +type SmartFilter struct { + StrictMode bool + SimpleFilter SimpleFilter + filterLocationSet mapset.Set // 非逻辑型参数的位置记录 全局统一标记过滤 + filterParamKeyRepeatCount sync.Map + filterParamKeySingleValues sync.Map // 所有参数名重复数量统计 + filterPathParamKeySymbol 
sync.Map // 某个path下的某个参数的值出现标记次数统计 + filterParamKeyAllValues sync.Map + filterPathParamEmptyValues sync.Map + filterParentPathValues sync.Map + uniqueMarkedIds mapset.Set // 标记后的唯一ID,用于去重 +} + +const ( + MaxParentPathCount = 32 // 相对于上一级目录,本级path目录的数量修正最大值 + MaxParamKeySingleCount = 8 // 某个URL参数名重复修正最大值 + MaxParamKeyAllCount = 10 // 本轮所有URL中某个参数名的重复修正最大值 + MaxPathParamEmptyCount = 10 // 某个path下的参数值为空,参数名个数修正最大值 + MaxPathParamKeySymbolCount = 5 // 某个Path下的某个参数的标记数量超过此值,则该参数被全局标记 +) + +const ( + CustomValueMark = "{{Crawlergo}}" + FixParamRepeatMark = "{{fix_param}}" + FixPathMark = "{{fix_path}}" + TooLongMark = "{{long}}" + NumberMark = "{{number}}" + ChineseMark = "{{chinese}}" + UpperMark = "{{upper}}" + LowerMark = "{{lower}}" + UrlEncodeMark = "{{urlencode}}" + UnicodeMark = "{{unicode}}" + BoolMark = "{{bool}}" + ListMark = "{{list}}" + TimeMark = "{{time}}" + MixAlphaNumMark = "{{mix_alpha_num}}" + MixSymbolMark = "{{mix_symbol}}" + MixNumMark = "{{mix_num}}" + NoLowerAlphaMark = "{{no_lower}}" + MixStringMark = "{{mix_str}}" +) + +var chineseRegex = regexp.MustCompile("[\u4e00-\u9fa5]+") +var urlencodeRegex = regexp.MustCompile("(?:%[A-Fa-f0-9]{2,6})+") +var unicodeRegex = regexp.MustCompile(`(?:\\u\w{4})+`) +var onlyAlphaRegex = regexp.MustCompile("^[a-zA-Z]+$") +var onlyAlphaUpperRegex = regexp.MustCompile("^[A-Z]+$") +var alphaUpperRegex = regexp.MustCompile("[A-Z]+") +var alphaLowerRegex = regexp.MustCompile("[a-z]+") +var replaceNumRegex = regexp.MustCompile(`[0-9]+\.[0-9]+|\d+`) +var onlyNumberRegex = regexp.MustCompile(`^[0-9]+$`) +var numberRegex = regexp.MustCompile(`[0-9]+`) +var OneNumberRegex = regexp.MustCompile(`[0-9]`) +var numSymbolRegex = regexp.MustCompile(`\.|_|-`) +var timeSymbolRegex = regexp.MustCompile(`-|:|\s`) +var onlyAlphaNumRegex = regexp.MustCompile(`^[0-9a-zA-Z]+$`) +var markedStringRegex = regexp.MustCompile(`^{{.+}}$`) +var htmlReplaceRegex = regexp.MustCompile(`\.shtml|\.html|\.htm`) + +func (s *SmartFilter) Init() { + 
// (tail of the filter-state initialization method; its header lies above this chunk)
	s.filterLocationSet = mapset.NewSet()
	s.filterParamKeyRepeatCount = sync.Map{}
	s.filterParamKeySingleValues = sync.Map{}
	s.filterPathParamKeySymbol = sync.Map{}
	s.filterParamKeyAllValues = sync.Map{}
	s.filterPathParamEmptyValues = sync.Map{}
	s.filterParentPathValues = sync.Map{}
	s.uniqueMarkedIds = mapset.NewSet()
}

// markedValueReplaceRegex collapses any previously inserted "{{...}}" mark to a
// single canonical token when hashing parameter maps. Compiled once at package
// level instead of on every getParamMapID call (hot path).
var markedValueReplaceRegex = regexp.MustCompile(`{{.+}}`)

// isGetLikeMethod reports whether the request method carries its parameters in
// the query string (GET/DELETE/HEAD/OPTIONS) rather than in a request body.
func isGetLikeMethod(method string) bool {
	return method == config.GET || method == config.DELETE || method == config.HEAD || method == config.OPTIONS
}

// DoFilter performs smart deduplication, with an optional strict mode.
// It returns true when the request should be filtered out (dropped).
func (s *SmartFilter) DoFilter(req *model.Request) bool {
	// First drop static resources, basic duplicates and foreign domains.
	if s.SimpleFilter.DoFilter(req) {
		logger.Logger.Debugf("filter req by simplefilter: " + req.URL.RequestURI())
		return true
	}

	// Mark parameters and path according to where the parameters live.
	if isGetLikeMethod(req.Method) {
		s.getMark(req)
		s.repeatCountStatistic(req)
	} else if req.Method == config.POST || req.Method == config.PUT {
		s.postMark(req)
	} else {
		// NOTE(review): unknown methods fall through unmarked and are
		// deduplicated on whatever Filter state they already carry.
		logger.Logger.Debug("dont support such method: " + req.Method)
	}

	// Drop the request if its marked unique ID has been seen already.
	if s.uniqueMarkedIds.Contains(req.Filter.UniqueId) {
		logger.Logger.Debugf("filter req by uniqueMarkedIds 1: " + req.URL.RequestURI())
		return true
	}

	// Re-mark parameter positions previously recognised as purely numeric
	// (non-logical), so they deduplicate globally.
	s.globalFilterLocationMark(req)

	if isGetLikeMethod(req.Method) {
		// Mark GET-like parameters whose repetition exceeded the thresholds.
		s.overCountMark(req)

		// The marks above may have changed the maps, so recompute the IDs.
		req.Filter.QueryMapId = s.getParamMapID(req.Filter.MarkedQueryMap)
		req.Filter.PathId = s.getPathID(req.Filter.MarkedPath)
	} else {
		req.Filter.PostDataId = s.getParamMapID(req.Filter.MarkedPostDataMap)
	}

	// Recompute the unique request ID and deduplicate once more.
	req.Filter.UniqueId = s.getMarkedUniqueID(req)
	if s.uniqueMarkedIds.Contains(req.Filter.UniqueId) {
		logger.Logger.Debugf("filter req by uniqueMarkedIds 2: " + req.URL.RequestURI())
		return true
	}

	// Remember this request as seen.
	s.uniqueMarkedIds.Add(req.Filter.UniqueId)
	return false
}

// preQueryMark pre-marks the raw query string before it is URL-decoded,
// because decoding the query into a map would destroy the encoded forms we
// want to recognise (Chinese text, percent-encoding, unicode escapes).
func (s *SmartFilter) preQueryMark(rawQuery string) string {
	if chineseRegex.MatchString(rawQuery) {
		return chineseRegex.ReplaceAllString(rawQuery, ChineseMark)
	} else if urlencodeRegex.MatchString(rawQuery) {
		return urlencodeRegex.ReplaceAllString(rawQuery, UrlEncodeMark)
	} else if unicodeRegex.MatchString(rawQuery) {
		return unicodeRegex.ReplaceAllString(rawQuery, UnicodeMark)
	}
	return rawQuery
}

// getMark marks the query parameters and path of a GET-like request and
// stores the resulting IDs on req.Filter.
func (s *SmartFilter) getMark(req *model.Request) {
	// Work on a copy of the URL so the original request stays untouched.
	todoURL := *(req.URL)
	todoURL.RawQuery = s.preQueryMark(todoURL.RawQuery)

	// Mark parameter names, parameter values and the path in turn.
	queryMap := todoURL.QueryMap()
	queryMap = s.markParamName(queryMap)
	queryMap = s.markParamValue(queryMap, *req)
	markedPath := s.MarkPath(todoURL.Path)

	// Compute the IDs of the marked representation; empty maps hash to "".
	var queryKeyID, queryMapID string
	if len(queryMap) != 0 {
		queryKeyID = s.getKeysID(queryMap)
		queryMapID = s.getParamMapID(queryMap)
	}
	pathID := s.getPathID(markedPath)

	req.Filter.MarkedQueryMap = queryMap
	req.Filter.QueryKeysId = queryKeyID
	req.Filter.QueryMapId = queryMapID
	req.Filter.MarkedPath = markedPath
	req.Filter.PathId = pathID

	// Finally compute the unique ID of the marked request.
	req.Filter.UniqueId = s.getMarkedUniqueID(req)
}

// postMark marks the body parameters and path of a POST/PUT request and
// stores the resulting IDs on req.Filter.
func (s *SmartFilter) postMark(req *model.Request) {
	postDataMap := req.PostDataMap()

	postDataMap = s.markParamName(postDataMap)
	postDataMap = s.markParamValue(postDataMap, *req)
	markedPath := s.MarkPath(req.URL.Path)

	// Compute the IDs of the marked representation; an empty map hashes to "".
	var postDataMapID string
	if len(postDataMap) != 0 {
		postDataMapID = s.getParamMapID(postDataMap)
	}
	pathID := s.getPathID(markedPath)

	req.Filter.MarkedPostDataMap = postDataMap
	req.Filter.PostDataId = postDataMapID
	req.Filter.MarkedPath = markedPath
	req.Filter.PathId = pathID

	// Finally compute the unique ID of the marked request.
	req.Filter.UniqueId = s.getMarkedUniqueID(req)
}

// markParamName normalises parameter names: purely alphabetic names are kept,
// over-long names (>= 32 bytes) collapse to TooLongMark, and digits inside the
// remaining names are replaced with NumberMark.
func (s *SmartFilter) markParamName(paramMap map[string]interface{}) map[string]interface{} {
	markedParamMap := map[string]interface{}{}
	for key, value := range paramMap {
		if onlyAlphaRegex.MatchString(key) {
			// Pure letters are left as-is.
			markedParamMap[key] = value
		} else if len(key) >= 32 {
			// Parameter name too long.
			markedParamMap[TooLongMark] = value
		} else {
			// Replace digits inside the name.
			markedParamMap[replaceNumRegex.ReplaceAllString(key, NumberMark)] = value
		}
	}
	return markedParamMap
}

// markParamValue normalises parameter values into coarse type marks so that
// requests differing only in value noise collapse to one representative.
// Branch order below is significant: the first matching rule wins.
func (s *SmartFilter) markParamValue(paramMap map[string]interface{}, req model.Request) map[string]interface{} {
	markedParamMap := map[string]interface{}{}
	for key, value := range paramMap {
		switch value.(type) {
		case bool:
			markedParamMap[key] = BoolMark
			continue
		case types.Slice:
			markedParamMap[key] = ListMark
			continue
		case float64:
			markedParamMap[key] = NumberMark
			continue
		}
		// Only string values are processed beyond this point; other types are
		// dropped from the marked map (matches original behavior).
		valueStr, ok := value.(string)
		if !ok {
			continue
		}
		if strings.Contains(valueStr, "Crawlergo") {
			// "Crawlergo" is the sentinel value filled in by the crawler itself:
			// this position is a numeric (non-logical) parameter — remember it so
			// the same position is filtered globally from now on.
			s.filterLocationSet.Add(req.URL.Hostname() + req.URL.Path + req.Method + key)
			markedParamMap[key] = CustomValueMark
		} else if onlyAlphaUpperRegex.MatchString(valueStr) {
			// All upper-case letters.
			markedParamMap[key] = UpperMark
		} else if len(valueStr) >= 16 {
			// Over-long value.
			markedParamMap[key] = TooLongMark
		} else if onlyNumberRegex.MatchString(valueStr) || onlyNumberRegex.MatchString(numSymbolRegex.ReplaceAllString(valueStr, "")) {
			// Digits, possibly joined by numeric separators.
			markedParamMap[key] = NumberMark
		} else if chineseRegex.MatchString(valueStr) {
			// Contains Chinese characters.
			markedParamMap[key] = ChineseMark
		} else if urlencodeRegex.MatchString(valueStr) {
			// Contains percent-encoding.
			markedParamMap[key] = UrlEncodeMark
		} else if unicodeRegex.MatchString(valueStr) {
			// Contains unicode escapes.
			markedParamMap[key] = UnicodeMark
		} else if onlyNumberRegex.MatchString(timeSymbolRegex.ReplaceAllString(valueStr, "")) {
			// Digits joined by time separators — a timestamp-like value.
			markedParamMap[key] = TimeMark
		} else if onlyAlphaNumRegex.MatchString(valueStr) && numberRegex.MatchString(valueStr) {
			// Mixed letters and digits.
			markedParamMap[key] = MixAlphaNumMark
		} else if s.hasSpecialSymbol(valueStr) {
			// Contains special symbols.
			markedParamMap[key] = MixSymbolMark
		} else if b := OneNumberRegex.ReplaceAllString(valueStr, "0"); strings.Count(b, "0") >= 3 {
			// Three or more digits scattered in the value — treat as numeric.
			markedParamMap[key] = MixNumMark
		} else if s.StrictMode {
			if !alphaLowerRegex.MatchString(valueStr) {
				// No lower-case letters at all.
				markedParamMap[key] = NoLowerAlphaMark
			} else {
				// Values normally combine upper case, lower case, digits and -/_;
				// three or more of those classes suggests a pseudo-static value.
				count := 0
				if alphaLowerRegex.MatchString(valueStr) {
					count += 1
				}
				if alphaUpperRegex.MatchString(valueStr) {
					count += 1
				}
				if numberRegex.MatchString(valueStr) {
					count += 1
				}
				if strings.Contains(valueStr, "_") || strings.Contains(valueStr, "-") {
					count += 1
				}
				if count >= 3 {
					markedParamMap[key] = MixStringMark
				}
				// NOTE(review): when fewer than three classes match, the key is
				// dropped from the marked map entirely — confirm this is intended.
			}
		} else {
			markedParamMap[key] = value
		}
	}
	return markedParamMap
}

// MarkPath normalises each path segment into a type mark, mirroring the value
// marking above, so pseudo-static paths collapse to one representative.
func (s *SmartFilter) MarkPath(path string) string {
	pathParts := strings.Split(path, "/")
	for index, part := range pathParts {
		if len(part) >= 32 {
			pathParts[index] = TooLongMark
		} else if onlyNumberRegex.MatchString(numSymbolRegex.ReplaceAllString(part, "")) {
			pathParts[index] = NumberMark
		} else if strings.HasSuffix(part, ".html") || strings.HasSuffix(part, ".htm") || strings.HasSuffix(part, ".shtml") {
			part = htmlReplaceRegex.ReplaceAllString(part, "")
			// Mixed upper case, lower case and digits.
			if numberRegex.MatchString(part) && alphaUpperRegex.MatchString(part) && alphaLowerRegex.MatchString(part) {
				pathParts[index] = MixAlphaNumMark
				// Purely numeric once separators are stripped.
			} else if b := numSymbolRegex.ReplaceAllString(part, ""); onlyNumberRegex.MatchString(b) {
				pathParts[index] = NumberMark
			}
			// Otherwise the segment is left untouched (suffix included).
		} else if s.hasSpecialSymbol(part) {
			pathParts[index] = MixSymbolMark
		} else if chineseRegex.MatchString(part) {
			pathParts[index] = ChineseMark
		} else if unicodeRegex.MatchString(part) {
			pathParts[index] = UnicodeMark
		} else if onlyAlphaUpperRegex.MatchString(part) {
			pathParts[index] = UpperMark
		} else if b := numSymbolRegex.ReplaceAllString(part, ""); onlyNumberRegex.MatchString(b) {
			// NOTE(review): unreachable — identical to the second branch above;
			// kept for fidelity with the original.
			pathParts[index] = NumberMark
		} else if b := OneNumberRegex.ReplaceAllString(part, "0"); strings.Count(b, "0") > 3 {
			// More than three digits scattered in the segment — pseudo-static.
			pathParts[index] = MixNumMark
		}
	}
	return strings.Join(pathParts, "/")
}

// globalFilterLocationMark re-marks parameter positions that were previously
// recognised (via the "Crawlergo" sentinel) as purely numeric, so they are
// deduplicated globally.
//
// Fix: the original accumulated every key into `name` across loop iterations
// (name += key), so the lookup string depended on random map iteration order
// and only the first-iterated key could ever match the "host+path+method+key"
// entries stored by markParamValue. Each key is now looked up independently.
func (s *SmartFilter) globalFilterLocationMark(req *model.Request) {
	name := req.URL.Hostname() + req.URL.Path + req.Method
	if isGetLikeMethod(req.Method) {
		for key := range req.Filter.MarkedQueryMap {
			if s.filterLocationSet.Contains(name + key) {
				req.Filter.MarkedQueryMap[key] = CustomValueMark
			}
		}
	} else if req.Method == config.POST || req.Method == config.PUT {
		for key := range req.Filter.MarkedPostDataMap {
			if s.filterLocationSet.Contains(name + key) {
				req.Filter.MarkedPostDataMap[key] = CustomValueMark
			}
		}
	}
}

// repeatCountStatistic records global repetition statistics for parameter
// names, parameter values and path segments; overCountMark later re-marks
// whatever exceeded the thresholds.
func (s *SmartFilter) repeatCountStatistic(req *model.Request) {
	queryKeyId := req.Filter.QueryKeysId
	pathId := req.Filter.PathId
	if queryKeyId != "" {
		// Count how often this exact set of parameter names has been seen.
		if v, ok := s.filterParamKeyRepeatCount.Load(queryKeyId); ok {
			s.filterParamKeyRepeatCount.Store(queryKeyId, v.(int)+1)
		} else {
			s.filterParamKeyRepeatCount.Store(queryKeyId, 1)
		}

		for key, value := range req.Filter.MarkedQueryMap {
			// Distinct values seen for this key under this exact key set.
			paramQueryKey := queryKeyId + key
			if v, ok := s.filterParamKeySingleValues.Load(paramQueryKey); ok {
				v.(mapset.Set).Add(value)
			} else {
				s.filterParamKeySingleValues.Store(paramQueryKey, mapset.NewSet(value))
			}

			// Distinct values seen for this key across all URLs in this run.
			if v, ok := s.filterParamKeyAllValues.Load(key); ok {
				v.(mapset.Set).Add(value)
			} else {
				s.filterParamKeyAllValues.Store(key, mapset.NewSet(value))
			}

			// Count empty-valued parameter names per path (pseudo-static pages).
			if value == "" {
				if v, ok := s.filterPathParamEmptyValues.Load(pathId); ok {
					v.(mapset.Set).Add(key)
				} else {
					s.filterPathParamEmptyValues.Store(pathId, mapset.NewSet(key))
				}
			}

			// Count value-mark occurrences for this key under this path.
			pathIdKey := pathId + key
			if v, ok := s.filterPathParamKeySymbol.Load(pathIdKey); ok {
				if markedStringRegex.MatchString(value.(string)) {
					s.filterPathParamKeySymbol.Store(pathIdKey, v.(int)+1)
				}
			} else {
				// NOTE(review): the first sighting counts even when the value is
				// unmarked, while later sightings only count when marked — looks
				// inconsistent; preserved as-is, confirm against upstream intent.
				s.filterPathParamKeySymbol.Store(pathIdKey, 1)
			}
		}
	}

	// Count sibling path segments under the parent directory; common script
	// suffixes are allowed through unconditionally.
	if req.URL.ParentPath() == "" || s.inCommonScriptSuffix(req.URL.FileExt()) {
		return
	}
	parentPathId := tools.StrMd5(req.URL.ParentPath())
	currentPath := strings.Replace(req.Filter.MarkedPath, req.URL.ParentPath(), "", -1)
	if v, ok := s.filterParentPathValues.Load(parentPathId); ok {
		v.(mapset.Set).Add(currentPath)
	} else {
		s.filterParentPathValues.Store(parentPathId, mapset.NewSet(currentPath))
	}
}

// overCountMark re-marks whatever repeatCountStatistic found to exceed the
// configured thresholds, so excessive repetitions collapse to fixed marks.
func (s *SmartFilter) overCountMark(req *model.Request) {
	queryKeyId := req.Filter.QueryKeysId
	pathId := req.Filter.PathId
	// Only requests that actually carry parameters.
	if queryKeyId != "" {
		// Key set repeated beyond threshold AND a key has more than three
		// distinct values: mark that key.
		if v, ok := s.filterParamKeyRepeatCount.Load(queryKeyId); ok && v.(int) > MaxParamKeySingleCount {
			for key := range req.Filter.MarkedQueryMap {
				if v, ok := s.filterParamKeySingleValues.Load(queryKeyId + key); ok {
					if v.(mapset.Set).Cardinality() > 3 {
						req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
					}
				}
			}
		}

		for key := range req.Filter.MarkedQueryMap {
			// Across all URLs, too many distinct values for this key.
			if v, ok := s.filterParamKeyAllValues.Load(key); ok && v.(mapset.Set).Cardinality() > MaxParamKeyAllCount {
				req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
			}
			// Under this path, this key's value marks occurred too often.
			if v, ok := s.filterPathParamKeySymbol.Load(pathId + key); ok && v.(int) > MaxPathParamKeySymbolCount {
				req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
			}
		}

		// Too many empty-valued parameter names under this path, e.g. the
		// pseudo-static http://bang.360.cn/?chu_xiu — collapse them into one.
		if v, ok := s.filterPathParamEmptyValues.Load(pathId); ok && v.(mapset.Set).Cardinality() > MaxPathParamEmptyCount {
			newMarkedQueryMap := map[string]interface{}{}
			for key, value := range req.Filter.MarkedQueryMap {
				if value == "" {
					newMarkedQueryMap[FixParamRepeatMark] = ""
				} else {
					newMarkedQueryMap[key] = value
				}
			}
			req.Filter.MarkedQueryMap = newMarkedQueryMap
		}
	}

	// Collapse pseudo-static segments under the parent directory; common
	// script suffixes are allowed through unconditionally.
	if req.URL.ParentPath() == "" || s.inCommonScriptSuffix(req.URL.FileExt()) {
		return
	}
	if v, ok := s.filterParentPathValues.Load(tools.StrMd5(req.URL.ParentPath())); ok {
		if v.(mapset.Set).Cardinality() > MaxParentPathCount {
			if strings.HasSuffix(req.URL.ParentPath(), "/") {
				req.Filter.MarkedPath = req.URL.ParentPath() + FixPathMark
			} else {
				req.Filter.MarkedPath = req.URL.ParentPath() + "/" + FixPathMark
			}
		}
	}
}

// getMarkedUniqueID computes the unique ID of a request from its marked
// parameter/path IDs plus method, host and a few disambiguating traits.
func (s *SmartFilter) getMarkedUniqueID(req *model.Request) string {
	var paramId string
	if isGetLikeMethod(req.Method) {
		paramId = req.Filter.QueryMapId
	} else {
		paramId = req.Filter.PostDataId
	}

	uniqueStr := req.Method + paramId + req.Filter.PathId + req.URL.Host
	// Redirected requests are kept distinct from direct ones.
	if req.RedirectionFlag {
		uniqueStr += "Redirection"
	}
	// Distinguish the bare https root from the http one.
	if req.URL.Path == "/" && req.URL.RawQuery == "" && req.URL.Scheme == "https" {
		uniqueStr += "https"
	}
	// Path-like fragments (SPA routes) participate in the identity.
	if req.URL.Fragment != "" && strings.HasPrefix(req.URL.Fragment, "/") {
		uniqueStr += req.URL.Fragment
	}
	return tools.StrMd5(uniqueStr)
}

// getKeysID hashes the sorted parameter names of a marked map.
func (s *SmartFilter) getKeysID(dataMap map[string]interface{}) string {
	keys := make([]string, 0, len(dataMap))
	for key := range dataMap {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	return tools.StrMd5(strings.Join(keys, ""))
}

// getParamMapID hashes the sorted parameter names and (mark-normalised) string
// values of a marked map. Non-string values contribute only their key.
func (s *SmartFilter) getParamMapID(dataMap map[string]interface{}) string {
	keys := make([]string, 0, len(dataMap))
	for key := range dataMap {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	var b strings.Builder
	for _, key := range keys {
		b.WriteString(key)
		if value, ok := dataMap[key].(string); ok {
			// Collapse any concrete mark to a canonical token so two maps with
			// different marks in the same position still hash identically.
			b.WriteString(markedValueReplaceRegex.ReplaceAllString(value, "{{mark}}"))
		}
	}
	return tools.StrMd5(b.String())
}

// getPathID hashes a (marked) path.
func (s *SmartFilter) getPathID(path string) string {
	return tools.StrMd5(path)
}

// hasSpecialSymbol reports whether str contains any of the special symbols
// that mark a value as mixed/symbolic.
func (s *SmartFilter) hasSpecialSymbol(str string) bool {
	return strings.ContainsAny(str, "{} |#@$*,<>/?\\+=")
}

// inCommonScriptSuffix reports whether suffix is one of the configured common
// script file extensions (which bypass pseudo-static collapsing).
func (s *SmartFilter) inCommonScriptSuffix(suffix string) bool {
	for _, value := range config.ScriptSuffix {
		if value == suffix {
			return true
		}
	}
	return false
}
diff --git a/pkg/js/javascript.go b/pkg/js/javascript.go
new file mode 100755
index
0000000..3cb1736 --- /dev/null +++ b/pkg/js/javascript.go @@ -0,0 +1,476 @@ +package js + +import ( + "fmt" + "github.com/chromedp/cdproto/cdp" +) + +const TabInitJS = ` +(function addTabInitScript () { + + // Pass the Webdriver Test. + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Pass the Plugins Length Test. + // Overwrite the plugins property to use a custom getter. + Object.defineProperty(navigator, 'plugins', { + // This just needs to have length > 0 for the current test, + // but we could mock the plugins too if necessary. + get: () => [1, 2, 3, 4, 5], + }); + + // Pass the Chrome Test. + // We can mock this in as much depth as we need for the test. + window.chrome = { + runtime: {}, + }; + + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + + //Pass the Permissions Test. 
navigator.userAgent + Object.defineProperty(navigator, 'userAgent', { + get: () => "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36", + }); + + // 修改浏览器对象的属性 + Object.defineProperty(navigator, 'platform', { + get: function () { return 'win32'; } + }); + + Object.defineProperty(navigator, 'language', { + get: function () { return 'zh-CN'; } + }); + + Object.defineProperty(navigator, 'languages', { + get: function () { return ["zh-CN", "zh"]; } + }); + + // history api hook + window.history.pushState = function(a, b, c) { + window.addLink(c, "HistoryAPI"); + } + window.history.replaceState = function(a, b, c) { + window.addLink(c, "HistoryAPI"); + } + Object.defineProperty(window.history,"pushState",{"writable": false, "configurable": false}); + Object.defineProperty(window.history,"replaceState",{"writable": false, "configurable": false}); + // 监听hash改变 + window.addEventListener("hashchange", function() { + window.addLink(document.location.href, "HashChange"); + }); + + var oldWebSocket = window.WebSocket; + window.WebSocket = function(url, arg) { + window.addLink(url, "WebSocket"); + return new oldWebSocket(url, arg); + } + + var oldEventSource = window.EventSource; + window.EventSource = function(url) { + window.addLink(url, "EventSource"); + return new oldEventSource(url); + } + + var oldFetch = window.fetch; + window.fetch = function(url) { + window.addLink(url, "Fetch"); + return oldFetch(url); + } + + // 锁定表单重置 + HTMLFormElement.prototype.reset = function() {console.log("cancel reset form")}; + Object.defineProperty(HTMLFormElement.prototype,"reset",{"writable": false, "configurable": false}); + + // hook dom2 级事件监听 + window.add_even_listener_count_sec_auto = {}; + // record event func , hook addEventListener + let old_event_handle = Element.prototype.addEventListener; + Element.prototype.addEventListener = function(event_name, event_func, useCapture) { + let name = "<" + this.tagName + "> " + 
this.id + this.name + this.getAttribute("class") + "|" + event_name; + // console.log(name) + // 对每个事件设定最大的添加次数,防止无限触发,最大次数为5 + if (!window.add_even_listener_count_sec_auto.hasOwnProperty(name)) { + window.add_even_listener_count_sec_auto[name] = 1; + } else if (window.add_even_listener_count_sec_auto[name] == 5) { + return ; + } else { + window.add_even_listener_count_sec_auto[name] += 1; + } + if (this.hasAttribute("sec_auto_dom2_event_flag")) { + let sec_auto_dom2_event_flag = this.getAttribute("sec_auto_dom2_event_flag"); + this.setAttribute("sec_auto_dom2_event_flag", sec_auto_dom2_event_flag + "|" + event_name); + } else { + this.setAttribute("sec_auto_dom2_event_flag", event_name); + } + old_event_handle.apply(this, arguments); + }; + + function dom0_listener_hook(that, event_name) { + let name = "<" + that.tagName + "> " + that.id + that.name + that.getAttribute("class") + "|" + event_name; + // console.log(name); + // 对每个事件设定最大的添加次数,防止无限触发,最大次数为5 + if (!window.add_even_listener_count_sec_auto.hasOwnProperty(name)) { + window.add_even_listener_count_sec_auto[name] = 1; + } else if (window.add_even_listener_count_sec_auto[name] == 5) { + return ; + } else { + window.add_even_listener_count_sec_auto[name] += 1; + } + if (that.hasAttribute("sec_auto_dom2_event_flag")) { + let sec_auto_dom2_event_flag = that.getAttribute("sec_auto_dom2_event_flag"); + that.setAttribute("sec_auto_dom2_event_flag", sec_auto_dom2_event_flag + "|" + event_name); + } else { + that.setAttribute("sec_auto_dom2_event_flag", event_name); + } + } + + // hook dom0 级事件监听 + Object.defineProperties(HTMLElement.prototype, { + onclick: {set: function(newValue){onclick = newValue;dom0_listener_hook(this, "click");}}, + onchange: {set: function(newValue){onchange = newValue;dom0_listener_hook(this, "change");}}, + onblur: {set: function(newValue){onblur = newValue;dom0_listener_hook(this, "blur");}}, + ondblclick: {set: function(newValue){ondblclick = newValue;dom0_listener_hook(this, 
"dbclick");}}, + onfocus: {set: function(newValue){onfocus = newValue;dom0_listener_hook(this, "focus");}}, + onkeydown: {set: function(newValue){onkeydown = newValue;dom0_listener_hook(this, "keydown");}}, + onkeypress: {set: function(newValue){onkeypress = newValue;dom0_listener_hook(this, "keypress");}}, + onkeyup: {set: function(newValue){onkeyup = newValue;dom0_listener_hook(this, "keyup");}}, + onload: {set: function(newValue){onload = newValue;dom0_listener_hook(this, "load");}}, + onmousedown: {set: function(newValue){onmousedown = newValue;dom0_listener_hook(this, "mousedown");}}, + onmousemove: {set: function(newValue){onmousemove = newValue;dom0_listener_hook(this, "mousemove");}}, + onmouseout: {set: function(newValue){onmouseout = newValue;dom0_listener_hook(this, "mouseout");}}, + onmouseover: {set: function(newValue){onmouseover = newValue;dom0_listener_hook(this, "mouseover");}}, + onmouseup: {set: function(newValue){onmouseup = newValue;dom0_listener_hook(this, "mouseup");}}, + onreset: {set: function(newValue){onreset = newValue;dom0_listener_hook(this, "reset");}}, + onresize: {set: function(newValue){onresize = newValue;dom0_listener_hook(this, "resize");}}, + onselect: {set: function(newValue){onselect = newValue;dom0_listener_hook(this, "select");}}, + onsubmit: {set: function(newValue){onsubmit = newValue;dom0_listener_hook(this, "submit");}}, + onunload: {set: function(newValue){onunload = newValue;dom0_listener_hook(this, "unload");}}, + onabort: {set: function(newValue){onabort = newValue;dom0_listener_hook(this, "abort");}}, + onerror: {set: function(newValue){onerror = newValue;dom0_listener_hook(this, "error");}}, + }) + + // hook window.open + window.open = function (url) { + console.log("trying to open window."); + window.addLink(url, "OpenWindow"); + } + Object.defineProperty(window,"open",{"writable": false, "configurable": false}); + + // hook window close + window.close = function() {console.log("trying to close page.");}; + 
Object.defineProperty(window,"close",{"writable": false, "configurable": false}); + + // hook setTimeout + //window.__originalSetTimeout = window.setTimeout; + //window.setTimeout = function() { + // arguments[1] = 0; + // return window.__originalSetTimeout.apply(this, arguments); + //}; + //Object.defineProperty(window,"setTimeout",{"writable": false, "configurable": false}); + + // hook setInterval 时间设置为60秒 目的是减轻chrome的压力 + window.__originalSetInterval = window.setInterval; + window.setInterval = function() { + arguments[1] = 60000; + return window.__originalSetInterval.apply(this, arguments); + }; + Object.defineProperty(window,"setInterval",{"writable": false, "configurable": false}); + + // 劫持原生ajax,并对每个请求设置最大请求次数 + window.ajax_req_count_sec_auto = {}; + XMLHttpRequest.prototype.__originalOpen = XMLHttpRequest.prototype.open; + XMLHttpRequest.prototype.open = function(method, url, async, user, password) { + // hook code + this.url = url; + this.method = method; + let name = method + url; + if (!window.ajax_req_count_sec_auto.hasOwnProperty(name)) { + window.ajax_req_count_sec_auto[name] = 1 + } else { + window.ajax_req_count_sec_auto[name] += 1 + } + + if (window.ajax_req_count_sec_auto[name] <= 10) { + return this.__originalOpen(method, url, true, user, password); + } + } + Object.defineProperty(XMLHttpRequest.prototype,"open",{"writable": false, "configurable": false}); + + XMLHttpRequest.prototype.__originalSend = XMLHttpRequest.prototype.send; + XMLHttpRequest.prototype.send = function(data) { + // hook code + let name = this.method + this.url; + if (window.ajax_req_count_sec_auto[name] <= 10) { + return this.__originalSend(data); + } + } + Object.defineProperty(XMLHttpRequest.prototype,"send",{"writable": false, "configurable": false}); + + XMLHttpRequest.prototype.__originalAbort = XMLHttpRequest.prototype.abort; + XMLHttpRequest.prototype.abort = function() { + // hook code + } + Object.defineProperty(XMLHttpRequest.prototype,"abort",{"writable": false, 
"configurable": false}); + + // 打乱数组的方法 + window.randArr = function (arr) { + for (var i = 0; i < arr.length; i++) { + var iRand = parseInt(arr.length * Math.random()); + var temp = arr[i]; + arr[i] = arr[iRand]; + arr[iRand] = temp; + } + return arr; + } + + window.sleep = function(time) { + return new Promise((resolve) => setTimeout(resolve, time)); + } + + Array.prototype.indexOf = function(val) { + for (var i = 0; i < this.length; i++) { + if (this[i] == val) return i; + } + return -1; + }; + + Array.prototype.remove = function(val) { + var index = this.indexOf(val); + if (index > -1) { + this.splice(index, 1); + } + }; + + const binding = window["addLink"]; + window["addLink"] = async(...args) => { + const me = window["addLink"]; + let callbacks = me['callbacks']; + if (!callbacks) { + callbacks = new Map(); + me['callbacks'] = callbacks; + } + const seq = (me['lastSeq'] || 0) + 1; + me['lastSeq'] = seq; + const promise = new Promise(fulfill => callbacks.set(seq, fulfill)); + binding(JSON.stringify({name: "addLink", seq, args})); + return promise; + }; + + const bindingTest = window["Test"]; + window["Test"] = async(...args) => { + const me = window["Test"]; + let callbacks = me['callbacks']; + if (!callbacks) { + callbacks = new Map(); + me['callbacks'] = callbacks; + } + const seq = (me['lastSeq'] || 0) + 1; + me['lastSeq'] = seq; + const promise = new Promise(fulfill => callbacks.set(seq, fulfill)); + binding(JSON.stringify({name: "Test", seq, args})); + return promise; + }; +})(); +` + +const DeliverResultJS = ` +(function deliverResult(name, seq, result) { + window[name]['callbacks'].get(seq)(result); + window[name]['callbacks'].delete(seq); +})("%s", %v, "%s") +` + +const ObserverJS = ` +(function init_observer_sec_auto_b() { + window.dom_listener_func_sec_auto = function (e) { + let node = e.target; + let nodeListSrc = node.querySelectorAll("[src]"); + for (let each of nodeListSrc) { + if (each.src) { + window.addLink(each.src, "DOM"); + let attrValue = 
each.getAttribute("src"); + if (attrValue.toLocaleLowerCase().startsWith("javascript:")) { + try { + eval(attrValue.substring(11)); + } + catch {} + } + } + } + + let nodeListHref = node.querySelectorAll("[href]"); + nodeListHref = window.randArr(nodeListHref); + for (let each of nodeListHref) { + if (each.href) { + window.addLink(each.href, "DOM"); + let attrValue = each.getAttribute("href"); + if (attrValue.toLocaleLowerCase().startsWith("javascript:")) { + try { + eval(attrValue.substring(11)); + } + catch {} + } + } + } + }; + document.addEventListener('DOMNodeInserted', window.dom_listener_func_sec_auto, true); + document.addEventListener('DOMSubtreeModified', window.dom_listener_func_sec_auto, true); + document.addEventListener('DOMNodeInsertedIntoDocument', window.dom_listener_func_sec_auto, true); + document.addEventListener('DOMAttrModified', window.dom_listener_func_sec_auto, true); +})() +` + +const RemoveDOMListenerJS = ` +(function remove_dom_listener() { + document.removeEventListener('DOMNodeInserted', window.dom_listener_func_sec_auto, true); + document.removeEventListener('DOMSubtreeModified', window.dom_listener_func_sec_auto, true); + document.removeEventListener('DOMNodeInsertedIntoDocument', window.dom_listener_func_sec_auto, true); + document.removeEventListener('DOMAttrModified', window.dom_listener_func_sec_auto, true); +})() +` + +const NewFrameTemplate = ` +(function sec_auto_new_iframe () { + let frame = document.createElement("iframe"); + frame.setAttribute("name", "%s"); + frame.setAttribute("id", "%s"); + frame.setAttribute("style", "display: none"); + document.body.appendChild(frame); +})() +` + +const TriggerInlineEventJS = ` +(async function trigger_all_inline_event(){ + let eventNames = ["onabort", "onblur", "onchange", "onclick", "ondblclick", "onerror", "onfocus", "onkeydown", "onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset", "onresize", "onselect", "onsubmit", 
"onunload"]; + for (let eventName of eventNames) { + let event = eventName.replace("on", ""); + let nodeList = document.querySelectorAll("[" + eventName + "]"); + if (nodeList.length > 100) { + nodeList = nodeList.slice(0, 100); + } + nodeList = window.randArr(nodeList); + for (let node of nodeList) { + await window.sleep(%f); + let evt = document.createEvent('CustomEvent'); + evt.initCustomEvent(event, false, true, null); + try { + node.dispatchEvent(evt); + } + catch {} + } + } +})() +` + +const TriggerDom2EventJS = ` +(async function trigger_all_dom2_custom_event() { + function transmit_child(node, event, loop) { + let _loop = loop + 1 + if (_loop > 4) { + return; + } + if (node.nodeType === 1) { + if (node.hasChildNodes) { + let index = parseInt(Math.random()*node.children.length,10); + try { + node.children[index].dispatchEvent(event); + } catch(e) {} + let max = node.children.length>5?5:node.children.length; + for (let count=0;count