From b2f13264cd5374db29ea3709a9ec8f24937cf60d Mon Sep 17 00:00:00 2001 From: Yota Toyama Date: Sat, 25 Nov 2023 19:41:51 +0900 Subject: [PATCH] Fix data URLs with spaces (#349) Close #345. # References - https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs --- link_finder.go | 25 +++++++++++++++++++++---- link_finder_test.go | 14 ++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/link_finder.go b/link_finder.go index ca2f804..43cf934 100644 --- a/link_finder.go +++ b/link_finder.go @@ -4,6 +4,7 @@ import ( "net/url" "regexp" "strings" + "unicode" "github.com/yhat/scrape" "golang.org/x/net/html" @@ -22,7 +23,7 @@ var atomToAttributes = map[atom.Atom][]string{ atom.Meta: {"content"}, } -var imageDescriptorPattern = regexp.MustCompile(" [^ ]*$") +var imageDescriptorPattern = regexp.MustCompile(`(\S)\s+\S+\s*$`) type linkFinder struct { linkFilterer linkFilterer @@ -43,7 +44,7 @@ func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error { ss := f.parseLinks(n, a) for _, s := range ss { - s := strings.TrimSpace(s) + s := f.trimUrl(s) if s == "" { continue @@ -67,14 +68,14 @@ func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error { return ls } -func (linkFinder) parseLinks(n *html.Node, a string) []string { +func (f linkFinder) parseLinks(n *html.Node, a string) []string { s := scrape.Attr(n, a) ss := []string{} switch a { case "srcset": for _, s := range strings.Split(s, ",") { - ss = append(ss, imageDescriptorPattern.ReplaceAllString(strings.TrimSpace(s), "")) + ss = append(ss, f.trimUrl(imageDescriptorPattern.ReplaceAllString(s, "$1"))) } case "content": switch scrape.Attr(n, "property") { @@ -87,3 +88,19 @@ func (linkFinder) parseLinks(n *html.Node, a string) []string { return ss } + +func (linkFinder) trimUrl(s string) string { + s = strings.TrimSpace(s) + + if !strings.HasPrefix(s, "data:") { + return s + } + + return strings.Map(func(r rune) rune { + if unicode.IsSpace(r) { + return -1 + } + + return r + }, s) +} diff --git a/link_finder_test.go b/link_finder_test.go index cfd241c..20a3072 100644 --- a/link_finder_test.go +++ b/link_finder_test.go @@ -189,6 +189,20 @@ func TestLinkFinderFindMetaTags(t *testing.T) { assert.Nil(t, err) } +func TestLinkFinderFindDataSchemeLinkWithSpaces(t *testing.T) { + b, err := url.Parse("http://foo.com") + assert.Nil(t, err) + + n, err := html.Parse(strings.NewReader( + htmlWithBody(``)), + ) + assert.Nil(t, err) + + ls := newTestLinkFinder().Find(n, b) + + assert.Len(t, ls, 0) +} + func TestLinkFinderIgnoreMetaTags(t *testing.T) { b, err := url.Parse("http://foo.com") assert.Nil(t, err)