Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
HeisenbergV committed Jan 5, 2023
1 parent 7d4f486 commit ca0760d
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 71 deletions.
7 changes: 7 additions & 0 deletions pkg/filter/filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package filter

import "github.com/Qianlitp/crawlergo/pkg/model"

// FilterHandler is the interface for request filters in this package. It
// consists of a single method, DoFilter, which takes a *model.Request and
// returns a bool verdict.
type FilterHandler interface {
// DoFilter returns the filtering verdict for req.
// NOTE(review): the comment on SimpleFilter.DoFilter says that true means
// the request needs to be filtered out; presumably every implementation
// follows the same convention — confirm.
DoFilter(req *model.Request) bool
}
27 changes: 16 additions & 11 deletions pkg/filter/simple_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,23 @@ import (
)

type SimpleFilter struct {
UniqueSet mapset.Set
HostLimit string
UniqueSet mapset.Set
HostLimit string
staticSuffixSet mapset.Set
}

var (
staticSuffixSet = config.StaticSuffixSet.Clone()
)
func NewSimpleFilter(host string) *SimpleFilter {
staticSuffixSet := config.StaticSuffixSet.Clone()

func init() {
for _, suffix := range []string{"js", "css", "json"} {
staticSuffixSet.Add(suffix)
}
s := &SimpleFilter{UniqueSet: mapset.NewSet(), staticSuffixSet: staticSuffixSet, HostLimit: host}
return s
}

/**
/*
*
需要过滤则返回 true
*/
func (s *SimpleFilter) DoFilter(req *model.Request) bool {
Expand All @@ -45,7 +47,8 @@ func (s *SimpleFilter) DoFilter(req *model.Request) bool {
return false
}

/**
/*
*
请求去重
*/
func (s *SimpleFilter) UniqueFilter(req *model.Request) bool {
Expand All @@ -60,7 +63,8 @@ func (s *SimpleFilter) UniqueFilter(req *model.Request) bool {
}
}

/**
/*
*
静态资源过滤
*/
func (s *SimpleFilter) StaticFilter(req *model.Request) bool {
Expand All @@ -72,13 +76,14 @@ func (s *SimpleFilter) StaticFilter(req *model.Request) bool {
if req.URL.FileExt() == "" {
return false
}
if staticSuffixSet.Contains(req.URL.FileExt()) {
if s.staticSuffixSet.Contains(req.URL.FileExt()) {
return true
}
return false
}

/**
/*
*
只保留指定域名的链接
*/
func (s *SimpleFilter) DomainFilter(req *model.Request) bool {
Expand Down
55 changes: 37 additions & 18 deletions pkg/filter/smart_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ import (
)

type SmartFilter struct {
StrictMode bool
SimpleFilter SimpleFilter
StrictMode bool
*SimpleFilter
filterLocationSet mapset.Set // 非逻辑型参数的位置记录 全局统一标记过滤
filterParamKeyRepeatCount sync.Map
filterParamKeySingleValues sync.Map // 所有参数名重复数量统计
Expand Down Expand Up @@ -74,7 +74,8 @@ var onlyAlphaNumRegex = regexp.MustCompile(`^[0-9a-zA-Z]+$`)
var markedStringRegex = regexp.MustCompile(`^{{.+}}$`)
var htmlReplaceRegex = regexp.MustCompile(`\.shtml|\.html|\.htm`)

func (s *SmartFilter) Init() {
func NewSmartFilter(base *SimpleFilter, strictMode bool) *SmartFilter {
s := &SmartFilter{}
s.filterLocationSet = mapset.NewSet()
s.filterParamKeyRepeatCount = sync.Map{}
s.filterParamKeySingleValues = sync.Map{}
Expand All @@ -83,9 +84,13 @@ func (s *SmartFilter) Init() {
s.filterPathParamEmptyValues = sync.Map{}
s.filterParentPathValues = sync.Map{}
s.uniqueMarkedIds = mapset.NewSet()
s.SimpleFilter = base
s.StrictMode = strictMode
return s
}

/**
/*
*
智能去重
可选严格模式
Expand Down Expand Up @@ -149,7 +154,8 @@ func (s *SmartFilter) DoFilter(req *model.Request) bool {
return false
}

/**
/*
*
Query的Map对象会自动解码,所以对RawQuery进行预先的标记
*/
func (s *SmartFilter) preQueryMark(rawQuery string) string {
Expand All @@ -163,7 +169,8 @@ func (s *SmartFilter) preQueryMark(rawQuery string) string {
return rawQuery
}

/**
/*
*
对GET请求的参数和路径进行标记
*/
func (s *SmartFilter) getMark(req *model.Request) {
Expand Down Expand Up @@ -199,7 +206,8 @@ func (s *SmartFilter) getMark(req *model.Request) {
req.Filter.UniqueId = getMarkedUniqueID(req)
}

/**
/*
*
对POST请求的参数和路径进行标记
*/
func (s *SmartFilter) postMark(req *model.Request) {
Expand Down Expand Up @@ -227,7 +235,8 @@ func (s *SmartFilter) postMark(req *model.Request) {
req.Filter.UniqueId = getMarkedUniqueID(req)
}

/**
/*
*
标记参数名
*/
func markParamName(paramMap map[string]interface{}) map[string]interface{} {
Expand All @@ -248,7 +257,8 @@ func markParamName(paramMap map[string]interface{}) map[string]interface{} {
return markedParamMap
}

/**
/*
*
标记参数值
*/
func (s *SmartFilter) markParamValue(paramMap map[string]interface{}, req model.Request) map[string]interface{} {
Expand Down Expand Up @@ -336,7 +346,8 @@ func (s *SmartFilter) markParamValue(paramMap map[string]interface{}, req model.
return markedParamMap
}

/**
/*
*
标记路径
*/
func MarkPath(path string) string {
Expand Down Expand Up @@ -376,7 +387,8 @@ func MarkPath(path string) string {
return newPath
}

/**
/*
*
全局数值型参数过滤
*/
func (s *SmartFilter) globalFilterLocationMark(req *model.Request) {
Expand All @@ -398,7 +410,8 @@ func (s *SmartFilter) globalFilterLocationMark(req *model.Request) {
}
}

/**
/*
*
进行全局重复参数名、参数值、路径的统计标记
之后对超过阈值的部分再次打标记
*/
Expand Down Expand Up @@ -483,7 +496,8 @@ func (s *SmartFilter) repeatCountStatistic(req *model.Request) {
}
}

/**
/*
*
对重复统计之后,超过阈值的部分再次打标记
*/
func (s *SmartFilter) overCountMark(req *model.Request) {
Expand Down Expand Up @@ -571,7 +585,8 @@ func (s *SmartFilter) calcFragmentID(fragment string) string {
return fakeReq.Filter.UniqueId
}

/**
/*
*
计算标记后的唯一请求ID
*/
func getMarkedUniqueID(req *model.Request) string {
Expand All @@ -593,7 +608,8 @@ func getMarkedUniqueID(req *model.Request) string {
return tools.StrMd5(uniqueStr)
}

/**
/*
*
计算请求参数的key标记后的唯一ID
*/
func getKeysID(dataMap map[string]interface{}) string {
Expand All @@ -609,7 +625,8 @@ func getKeysID(dataMap map[string]interface{}) string {
return tools.StrMd5(idStr)
}

/**
/*
*
计算请求参数标记后的唯一ID
*/
func getParamMapID(dataMap map[string]interface{}) string {
Expand All @@ -630,14 +647,16 @@ func getParamMapID(dataMap map[string]interface{}) string {
return tools.StrMd5(idStr)
}

/**
/*
*
计算PATH标记后的唯一ID
*/
// getPathID returns the identifier for path, obtained by handing the string
// unchanged to tools.StrMd5. The existing comment describes the input as a
// path that has already been marked.
func getPathID(path string) string {
	pathID := tools.StrMd5(path)
	return pathID
}

/**
/*
*
判断字符串中是否存在以下特殊符号
*/
func hasSpecialSymbol(str string) bool {
Expand Down
3 changes: 1 addition & 2 deletions pkg/filter/smart_filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,10 @@ var (
// completeUrls = []string{
// "https://test.local.com:1234/adfatd/123456/sx14xi?user=crawlergo&pwd=fa1424&end=1#/user/info",
// }
smart = SmartFilter{}
smart = NewSmartFilter(NewSimpleFilter(""), true)
)

func TestDoFilter_countFragment(t *testing.T) {
smart.Init()
reqs := []model.Request{}
for _, fu := range fragmentUrls {
url, err := model.GetUrl(fu)
Expand Down
Loading

0 comments on commit ca0760d

Please sign in to comment.