scraper.go
package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
)

const (
	// TIME_START is the request-context key under which each request's
	// start time is stored.
	TIME_START = "start"
)
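
// PageResponse captures what the scraper records about a single fetched page.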
type PageResponse struct {
	Url        string
	StatusCode int
	Data       []byte
	Depth      int
	Duration   time.Duration
	Headers    *http.Header
}
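
// Scraper crawls a single website and aggregates per-page statistics. The
// exported fields configure the crawl; the unexported ones coordinate the
// page-processing goroutines (PageStats and processPage are defined
// elsewhere in the package).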
type Scraper struct {
	ID          int32
	MaxDepth    int
	Website     string
	Recursively bool
	PrintLogs   bool
	Async       bool

	waitGroup sync.WaitGroup
	stats     map[string]*PageStats
	mutex     *sync.Mutex
}
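
// prepareAllowedDomain derives the collector's domain whitelist from the
// starting URL, covering both the bare and the "www."-prefixed host.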
func prepareAllowedDomain(requestURL string) ([]string, error) {
	u, err := url.ParseRequestURI(requestURL)
	if err != nil {
		return nil, err
	}

	// TrimPrefix removes a leading "www." only. (strings.TrimLeft would
	// treat "www." as a character set and could eat the first letters of
	// hosts such as "web.example.com".)
	domain := strings.TrimPrefix(u.Hostname(), "www.")

	// colly matches AllowedDomains against the request's hostname, so only
	// host names (not scheme-prefixed URLs) belong in the whitelist.
	return []string{
		domain,
		"www." + domain,
	}, nil
}
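
// Log writes v to the standard logger, prefixed with the scraper's ID, when
// PrintLogs is enabled.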
func (s *Scraper) Log(v ...interface{}) {
	if s.PrintLogs {
		// Spread v so the arguments are printed normally rather than as a
		// bracketed slice.
		log.Println(append([]interface{}{fmt.Sprintf("Scraper #%d:", s.ID)}, v...)...)
	}
}
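
// Scrape crawls s.Website with colly, dispatching one processPage goroutine
// per response, and blocks until both the crawl and the page processing
// have finished.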
func (s *Scraper) Scrape() error {
	s.Log("About to scrape", s.Website)

	// c := colly.NewCollector(colly.Debugger(&debug.LogDebugger{}))
	c := colly.NewCollector()
	extensions.RandomUserAgent(c)
	c.MaxDepth = s.MaxDepth
	c.Async = s.Async

	allowedDomains, err := prepareAllowedDomain(s.Website)
	if err != nil {
		s.Log("Failed to prepare allowed domains.", err)
		return err
	}
	c.AllowedDomains = allowedDomains

	// Record the start time of every request so the response handler can
	// compute the fetch duration.
	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put(TIME_START, time.Now())
	})

	c.OnResponse(func(r *colly.Response) {
		s.Log("Received response from", r.Request.URL.String())
		p := &PageResponse{
			Url:        r.Request.URL.String(),
			Data:       r.Body,
			StatusCode: r.StatusCode,
			Depth:      r.Request.Depth,
			Headers:    r.Headers,
		}
		if start := r.Ctx.GetAny(TIME_START); start != nil {
			p.Duration = time.Since(start.(time.Time))
		}

		// processPage is expected to call s.waitGroup.Done() when it
		// finishes with the page.
		s.waitGroup.Add(1)
		go s.processPage(p)
	})

	// Find and visit every link on the page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		s.Log("visiting:", link)
		if err := e.Request.Visit(link); err != nil && err != colly.ErrAlreadyVisited {
			s.Log("error while visiting link:", err.Error())
		}
	})

	// Start the scrape.
	if err := c.Visit(s.Website); err != nil {
		s.Log("error while visiting:", err.Error())
	}

	s.Log("Waiting for the scrape to finish...")
	c.Wait()
	s.Log("Waiting for the page processing to finish...")
	s.waitGroup.Wait()

	return nil
}
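
// Report returns the collected per-page stats. It reads s.stats without
// locking, so it should only be called after Scrape has returned.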
func (s *Scraper) Report() []*PageStats {
	s.Log("Reporting stats count", len(s.stats))

	result := make([]*PageStats, 0, len(s.stats))
	for _, v := range s.stats {
		result = append(result, v)
	}
	return result
}
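
// A minimal usage sketch (an assumption, not code from this repository):
// PageStats and (*Scraper).processPage are not shown in this file and are
// assumed to be defined elsewhere in the package, with processPage recording
// into s.stats under s.mutex and calling s.waitGroup.Done(). From within
// package main:
//
//	s := &Scraper{
//		ID:          1,
//		MaxDepth:    2,
//		Website:     "https://example.com",
//		Recursively: true,
//		PrintLogs:   true,
//		Async:       true,
//		stats:       make(map[string]*PageStats),
//		mutex:       &sync.Mutex{},
//	}
//	if err := s.Scrape(); err != nil {
//		log.Fatal(err)
//	}
//	for _, stats := range s.Report() {
//		_ = stats // inspect per-page stats here
//	}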