From b1469fbef56e939746e09f3900b3b1aedb70b8aa Mon Sep 17 00:00:00 2001 From: Seth Davis Date: Fri, 3 Nov 2023 12:07:02 -0400 Subject: [PATCH] Fix attempt to read gzip content from responses that cannot contain content --- colly_test.go | 21 +++++++++++++++++++++ http_backend.go | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/colly_test.go b/colly_test.go index e330fc2e..251296de 100644 --- a/colly_test.go +++ b/colly_test.go @@ -131,6 +131,11 @@ func newUnstartedTestServer() *httptest.Server { w.Write([]byte("ok")) }) + mux.HandleFunc("/204_enc_gzip", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Encoding", "gzip") + w.WriteHeader(204) + }) + mux.HandleFunc("/500", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.WriteHeader(500) @@ -1372,6 +1377,22 @@ func TestParseHTTPErrorResponse(t *testing.T) { } +func TestGzipEncodingNoContent(t *testing.T) { + // This is a regression test to ensure successful visits on + // servers that send a "Content-Encoding: gzip" header with + // responses that cannot contain content + ts := newTestServer() + defer ts.Close() + + c := NewCollector( + // Allow parsing 204 responses + ParseHTTPErrorResponse(), + ) + if err := c.Visit(ts.URL + "/204_enc_gzip"); err != nil { + t.Errorf("visit failed: %v", err) + } +} + func TestHTMLElement(t *testing.T) { ctx := &Context{} resp := &Response{ diff --git a/http_backend.go b/http_backend.go index e580f7a2..d7986d97 100644 --- a/http_backend.go +++ b/http_backend.go @@ -201,6 +201,14 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c bodyReader = io.LimitReader(bodyReader, int64(bodySize)) } contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding")) + + if strings.Contains(contentEncoding, "gzip") && !res.Uncompressed && (res.StatusCode < 200 || res.StatusCode == 204 || res.StatusCode == 304) { + // RFC 9110, section 15: 1xx, 204, and 304 responses cannot contain 
content. + // However, some servers may still send "Content-Encoding: gzip" in these scenarios, + // so mark the response as uncompressed to avoid trying to read gzip data below. + res.Uncompressed = true + } + if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) { bodyReader, err = gzip.NewReader(bodyReader) if err != nil {