Skip to content

Commit

Permalink
Unzip responses (#138)
Browse files Browse the repository at this point in the history
* Unzip responses

* Set default headers

* Add comment

* Do not set agent

* Bump version

* Update comment
  • Loading branch information
raviqqe authored Dec 17, 2020
1 parent 5dd0697 commit 1de8ae0
Show file tree
Hide file tree
Showing 9 changed files with 44 additions and 13 deletions.
2 changes: 1 addition & 1 deletion configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package main
import "time"

const (
version = "2.3.0"
version = "2.3.1"
agentName = "muffet"
concurrency = 1024
tcpTimeout = 5 * time.Second
Expand Down
4 changes: 2 additions & 2 deletions fake_http_response_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ func (r *fakeHTTPResponse) Header(name string) string {
return ""
}

func (r *fakeHTTPResponse) Body() []byte {
return r.body
// Body returns the body bytes stored on the fake response. The error result is
// always nil.
func (r *fakeHTTPResponse) Body() ([]byte, error) {
return r.body, nil
}
3 changes: 3 additions & 0 deletions fasthttp_http_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ func (c *fasthttpHTTPClient) Get(u *url.URL, headers map[string]string) (httpRes
req.SetRequestURI(u.String())
req.SetConnectionClose()

// Some HTTP servers require "Accept" headers to be set explicitly.
req.Header.Add("Accept", "*/*")

for k, v := range headers {
req.Header.Add(k, v)
}
Expand Down
9 changes: 7 additions & 2 deletions fasthttp_http_response.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ func (r fasthttpHTTPResponse) Header(key string) string {
return string(r.response.Header.Peek(key))
}

func (r fasthttpHTTPResponse) Body() []byte {
return r.response.Body()
// Body returns the response payload, decoding it first when the
// Content-Encoding header names a supported compression scheme.
//
// Supported encodings are "gzip", "deflate" and "br" (Brotli). For any other
// value, or when the header is absent, the raw body bytes are returned as-is
// with a nil error. A non-nil error is whatever the matching fasthttp decoder
// reported while decompressing.
func (r fasthttpHTTPResponse) Body() ([]byte, error) {
	// The header value is matched exactly; each case delegates to the
	// fasthttp helper that decodes that particular content coding.
	switch string(r.response.Header.Peek("Content-Encoding")) {
	case "gzip":
		return r.response.BodyGunzip()
	case "deflate":
		return r.response.BodyInflate()
	case "br":
		return r.response.BodyUnbrotli()
	}

	return r.response.Body(), nil
}
3 changes: 1 addition & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@ github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++Ja
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.17.0 h1:P8/koH4aSnJ4xbd0cUUFEGQs3jQqIxoDDyRQrUiAkqg=
github.com/valyala/fasthttp v1.17.0/go.mod h1:jjraHZVbKOXftJfsOYoAjaeygpj5hr8ermTRJNroD7A=
github.com/valyala/fasthttp v1.18.0 h1:IV0DdMlatq9QO1Cr6wGJPVW1sV1Q8HvZXAIcjorylyM=
github.com/valyala/fasthttp v1.18.0/go.mod h1:jjraHZVbKOXftJfsOYoAjaeygpj5hr8ermTRJNroD7A=
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
github.com/yhat/scrape v0.0.0-20161128144610-24b7890b0945 h1:6Ju8pZBYFTN9FaV/JvNBiIHcsgEmP4z4laciqjfjY8E=
Expand Down
2 changes: 1 addition & 1 deletion http_response.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ type httpResponse interface {
URL() string
StatusCode() int
Header(string) string
Body() []byte
Body() ([]byte, error)
}
7 changes: 6 additions & 1 deletion link_fetcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,12 @@ func (f *linkFetcher) sendRequest(s string) (int, *page, error) {
}
}

p, err := f.pageParser.Parse(r.URL(), r.Body())
bs, err := r.Body()
if err != nil {
return 0, nil, err
}

p, err := f.pageParser.Parse(r.URL(), bs)
if err != nil {
return 0, nil, err
}
Expand Down
14 changes: 12 additions & 2 deletions robots_txt_fetcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,20 @@ func newRobotsTxtFetcher(c httpClient) *robotsTxtFetcher {
// Fetch retrieves and parses robots.txt for the site that uu points at.
//
// It works on a copy of uu (so the caller's URL is not modified), sets the
// copy's path to "robots.txt", issues a GET through the fetcher's client and
// feeds the response body to robotstxt.FromBytes. A failed GET is reported as
// a "failed to fetch robots.txt" error.
func (f *robotsTxtFetcher) Fetch(uu *url.URL) (*robotstxt.RobotsData, error) {
u := *uu
u.Path = "robots.txt"

r, err := f.client.Get(&u, nil)
if err != nil {
return nil, fmt.Errorf("failed to fetch robots.txt: %v", err)
return nil, f.formatError(err)
}

bs, err := r.Body()
if err != nil {
return nil, f.formatError(err)
}

return robotstxt.FromBytes(r.Body())
return robotstxt.FromBytes(bs)
}

// formatError adds "failed to fetch robots.txt" context to err.
//
// The cause is wrapped with %w rather than flattened with %v, so the message
// text is unchanged while errors.Is and errors.As can still reach the
// underlying error.
func (*robotsTxtFetcher) formatError(err error) error {
	return fmt.Errorf("failed to fetch robots.txt: %w", err)
}
13 changes: 11 additions & 2 deletions sitemap_fetcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,17 @@ func (f *sitemapFetcher) Fetch(uu *url.URL) (map[string]struct{}, error) {

r, err := f.client.Get(&u, nil)
if err != nil {
return nil, fmt.Errorf("failed to GET sitemap.xml: %v", err)
return nil, f.formatGetError(err)
}

us := map[string]struct{}{}

err = sitemap.Parse(bytes.NewReader(r.Body()), func(e sitemap.Entry) error {
bs, err := r.Body()
if err != nil {
return nil, f.formatGetError(err)
}

err = sitemap.Parse(bytes.NewReader(bs), func(e sitemap.Entry) error {
us[e.GetLocation()] = struct{}{}

return nil
Expand All @@ -39,3 +44,7 @@ func (f *sitemapFetcher) Fetch(uu *url.URL) (map[string]struct{}, error) {

return us, nil
}

// formatGetError adds "failed to GET sitemap.xml" context to err.
//
// The cause is wrapped with %w rather than flattened with %v, so the message
// text is unchanged while errors.Is and errors.As can still reach the
// underlying error.
func (*sitemapFetcher) formatGetError(err error) error {
	return fmt.Errorf("failed to GET sitemap.xml: %w", err)
}

0 comments on commit 1de8ae0

Please sign in to comment.