Goldstein: 1 Better URL cleaning for display 6 files changed, 90 insertions(+), 18 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~bouncepaw/betula/patches/47013/mbox | git am -3Learn more about email & git
This uses net/url standard package (+ x/net/idna) to clean up URLs instead of doing it via string manipulation. In particular, this solves two problems: 1. In URLs like `gopher://example.org/foo` path was previously incorrectly detected by splitting on the first `/`, which is in this case a scheme separator. 2. Punycode is now decoded on display. Fixes: https://todo.sr.ht/~bouncepaw/betula/90 --- Makefile | 1 + go.mod | 5 +++- go.sum | 2 ++ types/types.go | 70 ++++++++++++++++++++++++++++++++++++++------- types/types_test.go | 21 ++++++++++++++ web/templates.go | 9 ++---- 6 files changed, 90 insertions(+), 18 deletions(-) create mode 100644 types/types_test.go diff --git a/Makefile b/Makefile index ec9328b..b5f5913 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,7 @@ clean: test: clean betula go test ./db + go test ./types go test ./feeds go test ./readpage go test ./activities diff --git a/go.mod b/go.mod index e136f3d..2ffc067 100644 --- a/go.mod +++ b/go.mod @@ -10,4 +10,7 @@ require ( golang.org/x/net v0.5.0 ) -require github.com/kr/pretty v0.3.1 // indirect +require ( + github.com/kr/pretty v0.3.1 // indirect + golang.org/x/text v0.6.0 // indirect +) diff --git a/go.sum b/go.sum index ba171c2..1ba4365 100644 --- a/go.sum +++ b/go.sum @@ -16,3 +16,5 @@ golang.org/x/crypto v0.5.0 h1:U/0M97KRkSFvyD/3FSmdP5W5swImpNgle/EHFhOsQPE= golang.org/x/crypto v0.5.0/go.mod h1:NK/OQwhpMQP3MwtdjgLlYHnH9ebylxKWv3e0fK+mkQU= golang.org/x/net v0.5.0 h1:GyT4nK/YDHSqa1c4753ouYCDajOYKTja9Xb/OHtgvSw= golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= +golang.org/x/text v0.6.0 h1:3XmdazWV+ubf7QgHSTWeykHOci5oeekaGJBLkrkaw4k= +golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= diff --git a/types/types.go b/types/types.go index 838a420..208382e 100644 --- a/types/types.go +++ b/types/types.go @@ -6,6 +6,7 @@ import ( "html/template" "math" "net/url" + "golang.org/x/net/idna" "strings" "time" @@ -152,18 +153,67 @@ type RepostInfo struct { // TimeLayout is the time layout used across Betula. const TimeLayout = "2006-01-02 15:04:05" -// CleanerLink returns the link a with https:// or http:// prefix and the / suffix and percent-encoding reversed. -func CleanerLink(a string) string { - b := strings.TrimPrefix(a, "https://") - c := strings.TrimPrefix(b, "http://") - // Gemini, Gopher, FTP, Mail are not stripped, to emphasize them, when they are. - d := strings.TrimSuffix(c, "/") - e, err := url.QueryUnescape(d) +// CleanerLinkParts returns the link a with https:// or http:// prefix and the / suffix, +// percent-encoding reversed and Punycode decoded. +// +// Link is returned in two parts: scheme + authority and the rest (path, query, fragment). +func CleanerLinkParts(a string) (string, string) { + u, err := url.Parse(a) if err != nil { - // Better luck next time. - return d + // Welp, we tried our best. + return a, "" + } + + var hostPart string + if u.Scheme != "http" && u.Scheme != "https" { + // Gemini, Gopher, FTP, Mail etc are not stripped to emphasize them. + hostPart += fmt.Sprintf("%s://", u.Scheme) + } + + if u.User != nil { + hostPart += u.User.String() + } + + if u.Host != "" { + host, err := idna.ToUnicode(u.Host) + if err != nil { + // Was worth a shot. + host = u.Host + } + hostPart += host + } + + pathPart := "" + + path := strings.TrimSuffix(u.Path, "/") + if path != "" { + if !strings.HasPrefix(path, "/") { + pathPart += "/" + } + pathPart += path + } + + if u.RawQuery != "" { + query, err := url.QueryUnescape(u.RawQuery) + if err != nil { + // Better luck next time. + query = u.RawQuery + } + + pathPart += "?" + query } - return e + + if u.Fragment != "" { + pathPart += "#" + u.Fragment + } + + return hostPart, pathPart +} + +// Same as CleanerLinkParts, but merges the parts back into one url. +func CleanerLink(a string) string { + left, right := CleanerLinkParts(a) + return left + right } const ( diff --git a/types/types_test.go b/types/types_test.go new file mode 100644 index 0000000..495009f --- /dev/null +++ b/types/types_test.go @@ -0,0 +1,21 @@ +package types + +import "testing" + +func TestCleanerLinkParts(t *testing.T) { + check := func(url string, expectedLeft string, expectedRight string) { + left, right := CleanerLinkParts(url) + if left != expectedLeft { + t.Errorf("Wrong left part for `%s`: expected `%s`, got `%s`", url, expectedLeft, left) + } + if right != expectedRight { + t.Errorf("Wrong right part for `%s`: expected `%s`, got `%s`", url, expectedRight, right) + } + } + + check("gopher://foo.bar/baz", "gopher://foo.bar", "/baz") + check("https://example.com/", "example.com", "") + check("http://xn--d1ahgkh6g.xn--90aczn5ei/%F0%9F%96%A4", "юникод.любовь", "/🖤") + check("http://юникод.любовь/🖤", "юникод.любовь", "/🖤") + check("http://example.com/?query=param#a/b", "example.com", "?query=param#a/b") +} diff --git a/web/templates.go b/web/templates.go index db1c3b1..8751414 100644 --- a/web/templates.go +++ b/web/templates.go @@ -10,7 +10,6 @@ import ( "log" "math/rand" "net/http" - "strings" "time" ) @@ -88,12 +87,8 @@ var funcMapForPosts = template.FuncMap{ return t.Format("2006-01-02 15:04") }, "shortenLink": func(a string) template.HTML { - b := types.CleanerLink(a) - before, after, _ := strings.Cut(b, "/") - result := before - if after != "" { - result += fmt.Sprintf(`<span class="url-path">/%s</span>`, after) - } + result, pathPart := types.CleanerLinkParts(a) + result += fmt.Sprintf(`<span class="url-path">%s</span>`, pathPart) return template.HTML(result) }, "mycomarkup": myco.MarkupToHTML, -- 2.42.0
Thanks! Kudos for writing the test. I have found a bug though, I think you should fix it and write an additional test for it. If you save a mailto: link such as mailto:betula@example.org, it gets displayed as mailto://. I expect a similar problem to happen with schemes like tel: and others that do not have a slash after the colon.