Skip to content

Commit

Permalink
perf(static-cmp): Static resource matching optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
PIGfaces committed Jul 1, 2022
1 parent dbf7064 commit 4a61dac
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 37 deletions.
49 changes: 36 additions & 13 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package config

import "time"
import (
"time"

mapset "github.com/deckarep/golang-set"
)

const (
DefaultUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36"
Expand Down Expand Up @@ -68,19 +72,25 @@ const (
MULTIPART = "multipart/form-data"
)

var StaticSuffix = []string{
"png", "gif", "jpg", "mp4", "mp3", "mng", "pct", "bmp", "jpeg", "pst", "psp", "ttf",
"tif", "tiff", "ai", "drw", "wma", "ogg", "wav", "ra", "aac", "mid", "au", "aiff",
"dxf", "eps", "ps", "svg", "3gp", "asf", "asx", "avi", "mov", "mpg", "qt", "rm",
"wmv", "m4a", "bin", "xls", "xlsx", "ppt", "pptx", "doc", "docx", "odt", "ods", "odg",
"odp", "exe", "zip", "rar", "tar", "gz", "iso", "rss", "pdf", "txt", "dll", "ico",
"gz2", "apk", "crt", "woff", "map", "woff2", "webp", "less", "dmg", "bz2", "otf", "swf",
"flv", "mpeg", "dat", "xsl", "csv", "cab", "exif", "wps", "m4v", "rmvb",
}
var (
// StaticSuffix lists the file extensions (lower case, without the leading
// dot) that are treated as static resources.
StaticSuffix = []string{
"png", "gif", "jpg", "mp4", "mp3", "mng", "pct", "bmp", "jpeg", "pst", "psp", "ttf",
"tif", "tiff", "ai", "drw", "wma", "ogg", "wav", "ra", "aac", "mid", "au", "aiff",
"dxf", "eps", "ps", "svg", "3gp", "asf", "asx", "avi", "mov", "mpg", "qt", "rm",
"wmv", "m4a", "bin", "xls", "xlsx", "ppt", "pptx", "doc", "docx", "odt", "ods", "odg",
"odp", "exe", "zip", "rar", "tar", "gz", "iso", "rss", "pdf", "txt", "dll", "ico",
"gz2", "apk", "crt", "woff", "map", "woff2", "webp", "less", "dmg", "bz2", "otf", "swf",
"flv", "mpeg", "dat", "xsl", "csv", "cab", "exif", "wps", "m4v", "rmvb",
}
// StaticSuffixSet holds the entries of StaticSuffix as a set for membership
// checks via Contains. It is nil until this package's init function has run.
StaticSuffixSet mapset.Set
)

var ScriptSuffix = []string{
"php", "asp", "jsp", "asa",
}
var (
// ScriptSuffix lists the script file extensions (lower case, without the
// leading dot) recognised by the crawler.
ScriptSuffix = []string{
"php", "asp", "jsp", "asa",
}
// ScriptSuffixSet holds the entries of ScriptSuffix as a set for membership
// checks via Contains. It is nil until this package's init function has run.
ScriptSuffixSet mapset.Set
)

var DefaultIgnoreKeywords = []string{"logout", "quit", "exit"}
var AllowedFormName = []string{"default", "mail", "code", "phone", "username", "password", "qq", "id_card", "url", "date", "number"}
Expand Down Expand Up @@ -129,3 +139,16 @@ var InputTextMap = map[string]map[string]interface{}{
"value": "10",
},
}

func init() {
StaticSuffixSet = initSet(StaticSuffix)
ScriptSuffixSet = initSet(ScriptSuffix)
}

// initSet returns a new mapset.Set containing every element of suffixs.
// A nil or empty slice yields an empty set.
func initSet(suffixs []string) mapset.Set {
	lookup := mapset.NewSet()
	for i := range suffixs {
		lookup.Add(suffixs[i])
	}
	return lookup
}
16 changes: 16 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package config_test

import (
"testing"

"github.com/Qianlitp/crawlergo/pkg/config"
"github.com/stretchr/testify/assert"
)

func TestStaticSuffix(t *testing.T) {
assert.Equal(t, true, config.StaticSuffixSet.Contains("png"))
assert.Equal(t, false, config.StaticSuffixSet.Contains("demo"))

assert.Equal(t, true, config.ScriptSuffixSet.Contains("asp"))
assert.Equal(t, false, config.ScriptSuffixSet.Contains("demo"))
}
13 changes: 6 additions & 7 deletions pkg/engine/intercept_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,12 @@ func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) {
tab.HandleHostBinding(&req)

// 静态资源 全部阻断
for _, suffix := range config.StaticSuffix {
if strings.HasSuffix(strings.ToLower(url.Path), suffix) {
_ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
req.Source = config.FromStaticRes
tab.AddResultRequest(req)
return
}
// https://github.com/Qianlitp/crawlergo/issues/106
if config.StaticSuffixSet.Contains(url.FileExt()) {
_ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
req.Source = config.FromStaticRes
tab.AddResultRequest(req)
return
}

// 处理导航请求
Expand Down
17 changes: 11 additions & 6 deletions pkg/filter/simple_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ type SimpleFilter struct {
HostLimit string
}

// staticSuffixSet is this package's own copy of config.StaticSuffixSet. The
// package init function adds further extensions to the copy rather than to
// the shared config set.
var staticSuffixSet = config.StaticSuffixSet.Clone()

// init adds the js, css and json extensions to the package-local
// staticSuffixSet.
func init() {
	staticSuffixSet.Add("js")
	staticSuffixSet.Add("css")
	staticSuffixSet.Add("json")
}

/**
需要过滤则返回 true
*/
Expand Down Expand Up @@ -58,16 +68,11 @@ func (s *SimpleFilter) StaticFilter(req *model.Request) bool {
s.UniqueSet = mapset.NewSet()
}
// 首先将slice转换成map
extMap := map[string]int{}
staticSuffix := append(config.StaticSuffix, "js", "css", "json")
for _, suffix := range staticSuffix {
extMap[suffix] = 1
}

if req.URL.FileExt() == "" {
return false
}
if _, ok := extMap[req.URL.FileExt()]; ok {
if staticSuffixSet.Contains(req.URL.FileExt()) {
return true
}
return false
Expand Down
7 changes: 1 addition & 6 deletions pkg/filter/smart_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -651,10 +651,5 @@ func hasSpecialSymbol(str string) bool {
}

// inCommonScriptSuffix reports whether suffix exactly matches one of the
// script extensions held in config.ScriptSuffixSet.
//
// The former linear scan over config.ScriptSuffix is dropped: it was followed
// by the set lookup, which left an unreachable return statement behind.
func inCommonScriptSuffix(suffix string) bool {
	return config.ScriptSuffixSet.Contains(suffix)
}
11 changes: 6 additions & 5 deletions pkg/model/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"net/url"
"path"
"regexp"
"strings"

Expand Down Expand Up @@ -158,12 +159,12 @@ func (u *URL) FileName() string {
文件扩展名
*/
func (u *URL) FileExt() string {
fileName := u.FileName()
if fileName == "" {
return ""
parts := path.Ext(u.Path)
// 第一个字符会带有 "."
if len(parts) > 0 {
return strings.ToLower(parts[1:])
}
parts := strings.Split(fileName, ".")
return strings.ToLower(parts[len(parts)-1])
return parts
}

/**
Expand Down
16 changes: 16 additions & 0 deletions pkg/model/url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,22 @@ func TestRootDomain(t *testing.T) {
}
}

// TestFileExt covers FileExt for a path without an extension, a path with a
// plain extension, and a path whose extension carries trailing characters.
func TestFileExt(t *testing.T) {
	plain, err := GetUrl("/user/info")
	assert.Nil(t, err)
	assert.NotNil(t, plain)
	assert.Equal(t, "", plain.FileExt())

	withExt, err := GetUrl("/user/info.html")
	assert.Nil(t, err)
	assert.Equal(t, "html", withExt.FileExt())

	withExtra, err := GetUrl("/user/info.html%2")
	assert.Nil(t, err)
	assert.Equal(t, "html%2", withExtra.FileExt())
}

func TestGetUrl(t *testing.T) {
testPath := "/user/info"
testQueyPath := "/user/info?keyword=crawlergocrawlergo&end=1"
Expand Down

0 comments on commit 4a61dac

Please sign in to comment.