This uses net/url standard package (+ x/net/idna) to clean up URLs
instead of doing it via string manipulation. In particular, this solves
two problems:
1. In URLs like `gopher://example.org/foo` path was previously
incorrectly detected by splitting on the first `/`, which is in this
case a scheme separator.
2. Punycode is now decoded on display.
Fixes: https://todo.sr.ht/~bouncepaw/betula/90
---
Makefile | 1 +
go.mod | 5 ++-
go.sum | 2 ++
types/types.go | 78 +++++++++++++++++++++++++++++++++++++++------
types/types_test.go | 23 +++++++++++++
web/templates.go | 9 ++----
6 files changed, 100 insertions(+), 18 deletions(-)
create mode 100644 types/types_test.go
diff --git a/Makefile b/Makefile
index ec9328b..b5f5913 100644
--- a/Makefile
+++ b/Makefile
@@ -15,6 +15,7 @@ clean:
test: clean betula
go test ./db
+ go test ./types
go test ./feeds
go test ./readpage
go test ./activities
diff --git a/go.mod b/go.mod
index e136f3d..2ffc067 100644
--- a/go.mod
+++ b/go.mod
@@ -10,4 +10,7 @@ require (
golang.org/x/net v0.5.0
)
-require github.com/kr/pretty v0.3.1 // indirect
+require (
+ github.com/kr/pretty v0.3.1 // indirect
+ golang.org/x/text v0.6.0 // indirect
+)
diff --git a/go.sum b/go.sum
index ba171c2..1ba4365 100644
--- a/go.sum
+++ b/go.sum
@@ -16,3 +16,5 @@ golang.org/x/crypto v0.5.0 h1:U/0M97KRkSFvyD/3FSmdP5W5swImpNgle/EHFhOsQPE=
golang.org/x/crypto v0.5.0/go.mod h1:NK/OQwhpMQP3MwtdjgLlYHnH9ebylxKWv3e0fK+mkQU=
golang.org/x/net v0.5.0 h1:GyT4nK/YDHSqa1c4753ouYCDajOYKTja9Xb/OHtgvSw=
golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws=
+golang.org/x/text v0.6.0 h1:3XmdazWV+ubf7QgHSTWeykHOci5oeekaGJBLkrkaw4k=
+golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
diff --git a/types/types.go b/types/types.go
index 838a420..f21cb1d 100644
--- a/types/types.go
+++ b/types/types.go
@@ -6,6 +6,7 @@ import (
"html/template"
"math"
"net/url"
+ "golang.org/x/net/idna"
"strings"
"time"
@@ -152,18 +153,75 @@ type RepostInfo struct {
// TimeLayout is the time layout used across Betula.
const TimeLayout = "2006-01-02 15:04:05"
-// CleanerLink returns the link a with https:// or http:// prefix and the / suffix and percent-encoding reversed.
-func CleanerLink(a string) string {
- b := strings.TrimPrefix(a, "https://")
- c := strings.TrimPrefix(b, "http://")
- // Gemini, Gopher, FTP, Mail are not stripped, to emphasize them, when they are.
- d := strings.TrimSuffix(c, "/")
- e, err := url.QueryUnescape(d)
+// CleanerLinkParts returns the link a with https:// or http:// prefix and the / suffix,
+// percent-encoding reversed and Punycode decoded.
+//
+// Link is returned in two parts: scheme + authority and the rest (path, query, fragment).
+func CleanerLinkParts(a string) (string, string) {
+ u, err := url.Parse(a)
if err != nil {
- // Better luck next time.
- return d
+ // Welp, we tried our best.
+ return a, ""
+ }
+
+ var hostPart string
+ if u.Scheme != "http" && u.Scheme != "https" {
+ // Gemini, Gopher, FTP, Mail etc are not stripped to emphasize them.
+ hostPart += fmt.Sprintf("%s:", u.Scheme)
+ // "Opaque" is defined for schemes like `mailto:` or tel:`, where there is no `//`.
+ if u.Opaque == "" {
+ hostPart += "//"
+ }
}
- return e
+
+ if u.User != nil {
+ hostPart += u.User.String()
+ }
+
+ if u.Host != "" {
+ host, err := idna.ToUnicode(u.Host)
+ if err != nil {
+ // Was worth a shot.
+ host = u.Host
+ }
+ hostPart += host
+ }
+
+ if u.Opaque != "" {
+ hostPart += u.Opaque
+ }
+
+ pathPart := ""
+
+ path := strings.TrimSuffix(u.Path, "/")
+ if path != "" {
+ if !strings.HasPrefix(path, "/") {
+ pathPart += "/"
+ }
+ pathPart += path
+ }
+
+ if u.RawQuery != "" {
+ query, err := url.QueryUnescape(u.RawQuery)
+ if err != nil {
+ // Better luck next time.
+ query = u.RawQuery
+ }
+
+ pathPart += "?" + query
+ }
+
+ if u.Fragment != "" {
+ pathPart += "#" + u.Fragment
+ }
+
+ return hostPart, pathPart
+}
+
+// Same as CleanerLinkParts, but merges the parts back into one url.
+func CleanerLink(a string) string {
+ left, right := CleanerLinkParts(a)
+ return left + right
}
const (
diff --git a/types/types_test.go b/types/types_test.go
new file mode 100644
index 0000000..6eb27e0
--- /dev/null
+++ b/types/types_test.go
@@ -0,0 +1,23 @@
+package types
+
+import "testing"
+
+func TestCleanerLinkParts(t *testing.T) {
+ check := func(url string, expectedLeft string, expectedRight string) {
+ left, right := CleanerLinkParts(url)
+ if left != expectedLeft {
+ t.Errorf("Wrong left part for `%s`: expected `%s`, got `%s`", url, expectedLeft, left)
+ }
+ if right != expectedRight {
+ t.Errorf("Wrong right part for `%s`: expected `%s`, got `%s`", url, expectedRight, right)
+ }
+ }
+
+ check("gopher://foo.bar/baz", "gopher://foo.bar", "/baz")
+ check("https://example.com/", "example.com", "")
+ check("http://xn--d1ahgkh6g.xn--90aczn5ei/%F0%9F%96%A4", "юникод.любовь", "/🖤")
+ check("http://юникод.любовь/🖤", "юникод.любовь", "/🖤")
+ check("http://example.com/?query=param#a/b", "example.com", "?query=param#a/b")
+ check("mailto:user@example.com", "mailto:user@example.com", "")
+ check("tel:+55551234567", "tel:+55551234567", "")
+}
diff --git a/web/templates.go b/web/templates.go
index db1c3b1..8751414 100644
--- a/web/templates.go
+++ b/web/templates.go
@@ -10,7 +10,6 @@ import (
"log"
"math/rand"
"net/http"
- "strings"
"time"
)
@@ -88,12 +87,8 @@ var funcMapForPosts = template.FuncMap{
return t.Format("2006-01-02 15:04")
},
"shortenLink": func(a string) template.HTML {
- b := types.CleanerLink(a)
- before, after, _ := strings.Cut(b, "/")
- result := before
- if after != "" {
- result += fmt.Sprintf(`<span class="url-path">/%s</span>`, after)
- }
+ result, pathPart := types.CleanerLinkParts(a)
+ result += fmt.Sprintf(`<span class="url-path">%s</span>`, pathPart)
return template.HTML(result)
},
"mycomarkup": myco.MarkupToHTML,
--
2.42.0