Skip to content

Commit

Permalink
fix: fix the duplicate URL cases
Browse files: browse the repository at this point in the history
  • Loading branch information
chuang8511 committed Nov 18, 2024
1 parent 1d23911 commit cdfcada
Showing 1 changed file with 11 additions and 6 deletions.
17 changes: 11 additions & 6 deletions pkg/component/operator/web/v0/crawl_website.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,18 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
return
}

- if util.InSlice(pageLinks, link) {
+ parsedURL, err := url.Parse(link)
+ if err != nil {
+ return
+ }
+
+ requestURL := stripQueryAndTrailingSlash(parsedURL)
+
+ if util.InSlice(pageLinks, requestURL.String()) {
return
}

- pageLinks = append(pageLinks, link)
+ pageLinks = append(pageLinks, requestURL.String())

_ = e.Request.Visit(link)
})
Expand Down Expand Up @@ -194,18 +201,16 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
return
}

- strippedURL := stripQueryAndTrailingSlash(r.Request.URL)
-
page := PageInfo{}

- page.Link = strippedURL.String()
+ page.Link = r.Request.URL.String()

html := string(r.Body)
ioReader := strings.NewReader(html)
doc, err := goquery.NewDocumentFromReader(ioReader)

if err != nil {
- fmt.Printf("Error parsing %s: %v", strippedURL.String(), err)
+ fmt.Printf("Error parsing %s: %v", r.Request.URL.String(), err)
return
}

Expand Down

0 comments on commit cdfcada

Please sign in to comment.