diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..66f8fb50 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea/ +.vscode/ diff --git a/VERSION b/VERSION index 7ec1d6db..3e3c2f1e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.0 +2.1.1 diff --git a/cache.go b/cache.go new file mode 100644 index 00000000..a6260a47 --- /dev/null +++ b/cache.go @@ -0,0 +1,142 @@ +// Copyright 2023 Adam Tauber, Andrzej Lichnerowicz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package colly + +import ( + "crypto/sha1" + "encoding/gob" + "encoding/hex" + "errors" + "net/http" + "os" + "path" +) + +// Cache is an interface which handles caching Collector's responses +// The default Cache of the Collector is the NullCache. +// FileSystemCache keeps compatibility with non-pluggable caching in legacy +// Collector. For this reason, one can set cache folder via `CACHE_DIR` +// environment variable, or by passing CacheDir to NewCollector. +// Collector's caching backend can be changed too by calling new method +// Collector.SetCache +type Cache interface { + // Init initializes the caching backend + Init() error + // Get retrieves a previously cached response for the given request + Get(request *http.Request) (*Response, error) + // Put stores a response given request in cache + Put(request *http.Request, response *Response) error + // Close finalizes caching backend + Close() error +} + +const ( + // DefaultCacheFolderPermissions is set to rwx(user), rx(group), nothing for others + DefaultCacheFolderPermissions = 0750 +) + +var ( + ErrCacheFolderNotConfigured = errors.New("Cache's base folder cannot be empty") + ErrCacheNotConfigured = errors.New("Caching backend is not configured") + ErrRequestNoCache = errors.New("Request cannot be cached") + ErrCachedNotFound = errors.New("Cached response not found") +) + +type NullCache struct { +} + +func (c *NullCache) Init() error { + return nil +} + +// Get always retrieves an error to force re-download +func (c *NullCache) Get(request *http.Request) (*Response, error) { + return nil, ErrCachedNotFound +} + +func (c *NullCache) Put(request *http.Request, response *Response) error { + return nil +} + +func (c *NullCache) Close() error { + return nil +} + +// FileSystemCache is the default cache backend of colly. +// FileSystemCache keeps responses persisted on the disk. +type FileSystemCache struct { + BaseDir string +} + +// Init ensures that specified base folder exists +func (c *FileSystemCache) Init() error { + if c.BaseDir == "" { + return ErrCacheFolderNotConfigured + } + + return os.MkdirAll(c.BaseDir, DefaultCacheFolderPermissions) +} + +func (c *FileSystemCache) getFilenameFromRequest(request *http.Request) (string, string) { + sum := sha1.Sum([]byte(request.URL.String())) + hash := hex.EncodeToString(sum[:]) + dir := path.Join(c.BaseDir, hash[:2]) + return dir, path.Join(dir, hash) +} + +// Get returns an error for HTTP verbs other than GET and if request headers +// specify `Cache-Control: no-cache`. +func (c *FileSystemCache) Get(request *http.Request) (*Response, error) { + if request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { + return nil, ErrRequestNoCache + } + + _, filename := c.getFilenameFromRequest(request) + + if file, err := os.Open(filename); err == nil { + resp := new(Response) + err = gob.NewDecoder(file).Decode(resp) + file.Close() + return resp, err + } else { + return nil, err + } +} + +// Put persists response on disk. For compatibility with legacy non-pluggable version, +// it keeps only one level of folder hierarchy. +func (c *FileSystemCache) Put(request *http.Request, response *Response) error { + dir, filename := c.getFilenameFromRequest(request) + + if _, err := os.Stat(dir); err != nil { + if err := os.MkdirAll(dir, DefaultCacheFolderPermissions); err != nil { + return err + } + } + file, err := os.Create(filename + "~") + if err != nil { + return err + } + if err := gob.NewEncoder(file).Encode(response); err != nil { + file.Close() + return err + } + file.Close() + return os.Rename(filename+"~", filename) +} + +func (c *FileSystemCache) Close() error { + return nil +} diff --git a/colly.go b/colly.go index fdca9451..9df359f7 100644 --- a/colly.go +++ b/colly.go @@ -85,9 +85,6 @@ type Collector struct { // 0 means unlimited. // The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes). MaxBodySize int - // CacheDir specifies a location where GET requests are cached as files. - // When it's not defined, caching is disabled. - CacheDir string // IgnoreRobotsTxt allows the Collector to ignore any restrictions set by // the target host's robots.txt file. See http://www.robotstxt.org/ for more // information. @@ -119,6 +116,7 @@ type Collector struct { // Set it to 0 for infinite requests (default). MaxRequests uint32 + cache Cache store storage.Storage debugger debug.Debugger robotsMap map[string]*robotstxt.RobotsData @@ -240,7 +238,14 @@ var envMap = map[string]func(*Collector, string){ c.AllowedDomains = strings.Split(val, ",") }, "CACHE_DIR": func(c *Collector, val string) { - c.CacheDir = val + if c.cache != nil { + c.cache.Close() + } + c.cache = &FileSystemCache{ + BaseDir: val, + } + c.cache.Init() + c.backend.CacheBackend = c.cache }, "DETECT_CHARSET": func(c *Collector, val string) { c.DetectCharset = isYesString(val) @@ -393,7 +398,14 @@ func MaxBodySize(sizeInBytes int) CollectorOption { // CacheDir specifies the location where GET requests are cached as files. func CacheDir(path string) CollectorOption { return func(c *Collector) { - c.CacheDir = path + if c.cache != nil { + c.cache.Close() + } + c.cache = &FileSystemCache{ + BaseDir: path, + } + c.cache.Init() + c.backend.CacheBackend = c.cache } } @@ -471,10 +483,12 @@ func (c *Collector) Init() { c.MaxRequests = 0 c.store = &storage.InMemoryStorage{} c.store.Init() + c.cache = &NullCache{} + c.cache.Init() c.MaxBodySize = 10 * 1024 * 1024 c.backend = &httpBackend{} jar, _ := cookiejar.New(nil) - c.backend.Init(jar) + c.backend.Init(jar, c.cache) c.backend.Client.CheckRedirect = c.checkRedirectFunc() c.wg = &sync.WaitGroup{} c.lock = &sync.RWMutex{} @@ -699,7 +713,7 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers}) return !request.abort } - response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir) + response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc) if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok { request.ProxyURL = proxyURL } @@ -1027,6 +1041,16 @@ func (c *Collector) SetStorage(s storage.Storage) error { return nil } +// SetCache overrides the default in-memory storage. +// Storage stores scraping related data like cookies and visited urls +func (c *Collector) SetCache(cache Cache) error { + if err := cache.Init(); err != nil { + return err + } + c.cache = cache + return nil +} + // SetProxy sets a proxy for the collector. This method overrides the previously // used http.Transport if the type of the transport is not http.RoundTripper. // The proxy type is determined by the URL scheme. "http" @@ -1295,7 +1319,6 @@ func (c *Collector) Clone() *Collector { return &Collector{ AllowedDomains: c.AllowedDomains, AllowURLRevisit: c.AllowURLRevisit, - CacheDir: c.CacheDir, DetectCharset: c.DetectCharset, DisallowedDomains: c.DisallowedDomains, ID: atomic.AddUint32(&collectorCounter, 1), diff --git a/colly_test.go b/colly_test.go index 91f1441e..a632edd5 100644 --- a/colly_test.go +++ b/colly_test.go @@ -368,8 +368,9 @@ var newCollectorTests = map[string]func(*testing.T){ } { c := NewCollector(CacheDir(path)) - if got, want := c.CacheDir, path; got != want { - t.Fatalf("c.CacheDir = %q, want %q", got, want) + fileSystemCache := c.backend.CacheBackend.(*FileSystemCache) + if got, want := fileSystemCache.BaseDir, path; got != want { + t.Fatalf("c.backend.CacheBackend.BaseDir = %q, want %q", got, want) } } }, diff --git a/http_backend.go b/http_backend.go index 0b201d23..26adcd52 100644 --- a/http_backend.go +++ b/http_backend.go @@ -15,15 +15,10 @@ package colly import ( - "crypto/sha1" - "encoding/gob" - "encoding/hex" "io" "io/ioutil" "math/rand" "net/http" - "os" - "path" "regexp" "strings" "sync" @@ -35,9 +30,10 @@ import ( ) type httpBackend struct { - LimitRules []*LimitRule - Client *http.Client - lock *sync.RWMutex + LimitRules []*LimitRule + Client *http.Client + lock *sync.RWMutex + CacheBackend Cache } type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool @@ -94,13 +90,14 @@ func (r *LimitRule) Init() error { return nil } -func (h *httpBackend) Init(jar http.CookieJar) { +func (h *httpBackend) Init(jar http.CookieJar, cache Cache) { rand.Seed(time.Now().UnixNano()) h.Client = &http.Client{ Jar: jar, Timeout: 10 * time.Second, } h.lock = &sync.RWMutex{} + h.CacheBackend = cache } // Match checks that the domain parameter triggers the rule @@ -129,42 +126,21 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule { return nil } -func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) { - if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { - return h.Do(request, bodySize, checkHeadersFunc) - } - sum := sha1.Sum([]byte(request.URL.String())) - hash := hex.EncodeToString(sum[:]) - dir := path.Join(cacheDir, hash[:2]) - filename := path.Join(dir, hash) - if file, err := os.Open(filename); err == nil { - resp := new(Response) - err := gob.NewDecoder(file).Decode(resp) - file.Close() +func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) { + if resp, err := h.CacheBackend.Get(request); err == nil { checkHeadersFunc(request, resp.StatusCode, *resp.Headers) if resp.StatusCode < 500 { return resp, err } } + resp, err := h.Do(request, bodySize, checkHeadersFunc) if err != nil || resp.StatusCode >= 500 { return resp, err } - if _, err := os.Stat(dir); err != nil { - if err := os.MkdirAll(dir, 0750); err != nil { - return resp, err - } - } - file, err := os.Create(filename + "~") - if err != nil { - return resp, err - } - if err := gob.NewEncoder(file).Encode(resp); err != nil { - file.Close() - return resp, err - } - file.Close() - return resp, os.Rename(filename+"~", filename) + + err = h.CacheBackend.Put(request, resp) + return resp, err } func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) {