diff --git a/colly.go b/colly.go index 451b6766..ae74b7c3 100644 --- a/colly.go +++ b/colly.go @@ -1117,9 +1117,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) { } func (c *Collector) handleOnHTML(resp *Response) error { - if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") { + if len(c.htmlCallbacks) == 0 { return nil } + + contentType := resp.Headers.Get("Content-Type") + if contentType == "" { + contentType = http.DetectContentType(resp.Body) + } + // simplified mime.ParseMediaType: extracts the media type without parsing + // the params part + mediatype, _, _ := strings.Cut(contentType, ";") + mediatype = strings.TrimSpace(strings.ToLower(mediatype)) + + // TODO: we also want to parse application/xml as XHTML if it has + // an appropriate doctype + switch mediatype { + case "text/html", "application/xhtml+xml": + default: + return nil + } + doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body)) if err != nil { return err diff --git a/colly_test.go b/colly_test.go index 2382ecb1..e70d2774 100644 --- a/colly_test.go +++ b/colly_test.go @@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server { }) mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/html") + if r.URL.Query().Get("no-content-type") != "" { + w.Header()["Content-Type"] = nil + } else { + w.Header().Set("Content-Type", "text/html") + } w.Write([]byte(` @@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) { } } +func TestCollectorContentSniffing(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + + htmlCallbackCalled := false + + c.OnResponse(func(r *Response) { + if (*r.Headers)["Content-Type"] != nil { + t.Error("Content-Type unexpectedly not nil") + } + }) + + c.OnHTML("html", func(e *HTMLElement) { + htmlCallbackCalled = true + }) + + err := c.Visit(ts.URL + "/html?no-content-type=yes") + if err != nil { + t.Fatal(err) + } + + if 
!htmlCallbackCalled { + t.Error("OnHTML was not called") + } +} + func TestCollectorURLRevisit(t *testing.T) { ts := newTestServer() defer ts.Close()