diff --git a/colly_test.go b/colly_test.go index 14812e65..c804120a 100644 --- a/colly_test.go +++ b/colly_test.go @@ -44,13 +44,15 @@ Disallow: /disallowed Disallow: /allowed*q= ` -const testXml = ` +const testXML = ` Test Page This is a test page This is a test paragraph ` +const custom404 = `404 not found` + func newTestServer() *httptest.Server { mux := http.NewServeMux() @@ -77,13 +79,17 @@ func newTestServer() *httptest.Server { mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/xml") - w.Write([]byte(testXml)) + w.Write([]byte(testXML)) }) mux.HandleFunc("/test.xml.gz", func(w http.ResponseWriter, r *http.Request) { ww := gzip.NewWriter(w) defer ww.Close() - ww.Write([]byte(testXml)) + ww.Write([]byte(testXML)) + }) + + mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) { + http.Error(w, custom404, http.StatusNotFound) }) mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { @@ -1431,6 +1437,35 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) { testCollectorOnXMLWithXML(t, "/test.xml.gz") } +func TestCollectorNonexistentXMLGZ(t *testing.T) { + // This is a regression test for colly + // attempting to decompress all .xml.gz URLs + // even if they're not compressed. + ts := newTestServer() + defer ts.Close() + + c := NewCollector(ParseHTTPErrorResponse()) + + onResponseCalled := false + + c.OnResponse(func(resp *Response) { + onResponseCalled = true + if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want { + t.Errorf("wrong response body got=%q want=%q", got, want) + } + }) + + c.OnError(func(resp *Response, err error) { + t.Errorf("called on OnError: err=%v", err) + }) + + c.Visit(ts.URL + "/nonexistent.xml.gz") + + if !onResponseCalled { + t.Error("OnResponse was not called") + } +} + func TestCollectorVisitWithTrace(t *testing.T) { ts := newTestServer() defer ts.Close() diff --git a/http_backend.go b/http_backend.go index 0b201d23..e35a8905 100644 --- a/http_backend.go +++ b/http_backend.go @@ -15,6 +15,7 @@ package colly import ( + "bufio" "crypto/sha1" "encoding/gob" "encoding/hex" @@ -201,11 +202,23 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c } contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding")) if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(request.URL.Path), ".xml.gz")) { - bodyReader, err = gzip.NewReader(bodyReader) + // Even if URL contains .xml.gz, it doesn't mean that we get gzip + // compressed data back. We might get 404 error page instead, + // for example. So check gzip magic bytes. + bufReader := bufio.NewReader(bodyReader) + bodyReader = bufReader + magic, err := bufReader.Peek(2) if err != nil { return nil, err } - defer bodyReader.(*gzip.Reader).Close() + // gzip magic, as specified in RFC 1952 + if magic[0] == 0x1f && magic[1] == 0x8b { + bodyReader, err = gzip.NewReader(bufReader) + if err != nil { + return nil, err + } + defer bodyReader.(*gzip.Reader).Close() + } } body, err := ioutil.ReadAll(bodyReader) if err != nil {