~bouncepaw/betula

Better URL cleaning for display v2 APPLIED

Goldstein: 1
 Better URL cleaning for display

 6 files changed, 100 insertions(+), 18 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~bouncepaw/betula/patches/47022/mbox | git am -3
Learn more about email & git

[PATCH v2] Better URL cleaning for display Export this patch

This uses the standard net/url package (+ x/net/idna) to clean up URLs
instead of doing it via string manipulation. In particular, this solves
two problems:

1. In URLs like `gopher://example.org/foo`, the path was previously
   detected incorrectly by splitting on the first `/`, which in this
   case is part of the scheme separator.

2. Punycode is now decoded on display.

Fixes: https://todo.sr.ht/~bouncepaw/betula/90
---
 Makefile            |  1 +
 go.mod              |  5 ++-
 go.sum              |  2 ++
 types/types.go      | 78 +++++++++++++++++++++++++++++++++++++++------
 types/types_test.go | 23 +++++++++++++
 web/templates.go    |  9 ++----
 6 files changed, 100 insertions(+), 18 deletions(-)
 create mode 100644 types/types_test.go

diff --git a/Makefile b/Makefile
index ec9328b..b5f5913 100644
--- a/Makefile
+++ b/Makefile
@@ -15,6 +15,7 @@ clean:

test: clean betula
	go test ./db
	go test ./types
	go test ./feeds
	go test ./readpage
	go test ./activities
diff --git a/go.mod b/go.mod
index e136f3d..2ffc067 100644
--- a/go.mod
+++ b/go.mod
@@ -10,4 +10,7 @@ require (
	golang.org/x/net v0.5.0
)

require github.com/kr/pretty v0.3.1 // indirect
require (
	github.com/kr/pretty v0.3.1 // indirect
	golang.org/x/text v0.6.0 // indirect
)
diff --git a/go.sum b/go.sum
index ba171c2..1ba4365 100644
--- a/go.sum
+++ b/go.sum
@@ -16,3 +16,5 @@ golang.org/x/crypto v0.5.0 h1:U/0M97KRkSFvyD/3FSmdP5W5swImpNgle/EHFhOsQPE=
golang.org/x/crypto v0.5.0/go.mod h1:NK/OQwhpMQP3MwtdjgLlYHnH9ebylxKWv3e0fK+mkQU=
golang.org/x/net v0.5.0 h1:GyT4nK/YDHSqa1c4753ouYCDajOYKTja9Xb/OHtgvSw=
golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws=
golang.org/x/text v0.6.0 h1:3XmdazWV+ubf7QgHSTWeykHOci5oeekaGJBLkrkaw4k=
golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
diff --git a/types/types.go b/types/types.go
index 838a420..f21cb1d 100644
--- a/types/types.go
+++ b/types/types.go
@@ -6,6 +6,7 @@ import (
	"html/template"
	"math"
	"net/url"
	"golang.org/x/net/idna"
	"strings"
	"time"

@@ -152,18 +153,75 @@ type RepostInfo struct {
// TimeLayout is the time layout used across Betula.
const TimeLayout = "2006-01-02 15:04:05"

// CleanerLink returns the link a with https:// or http:// prefix and the / suffix and percent-encoding reversed.
func CleanerLink(a string) string {
	b := strings.TrimPrefix(a, "https://")
	c := strings.TrimPrefix(b, "http://")
	// Gemini, Gopher, FTP, Mail are not stripped, to emphasize them, when they are.
	d := strings.TrimSuffix(c, "/")
	e, err := url.QueryUnescape(d)
// CleanerLinkParts returns the link a with https:// or http:// prefix and the / suffix,
// percent-encoding reversed and Punycode decoded.
//
// Link is returned in two parts: scheme + authority and the rest (path, query, fragment).
func CleanerLinkParts(a string) (string, string) {
	u, err := url.Parse(a)
	if err != nil {
		// Better luck next time.
		return d
		// Welp, we tried our best.
		return a, ""
	}

	var hostPart string
	if u.Scheme != "http" && u.Scheme != "https" {
		// Gemini, Gopher, FTP, Mail etc are not stripped to emphasize them.
		hostPart += fmt.Sprintf("%s:", u.Scheme)
		// "Opaque" is defined for schemes like `mailto:` or tel:`, where there is no `//`.
		if u.Opaque == "" {
			hostPart += "//"
		}
	}
	return e

	if u.User != nil {
		hostPart += u.User.String()
	}

	if u.Host != "" {
		host, err := idna.ToUnicode(u.Host)
		if err != nil {
			// Was worth a shot.
			host = u.Host
		}
		hostPart += host
	}

	if u.Opaque != "" {
		hostPart += u.Opaque
	}

	pathPart := ""

	path := strings.TrimSuffix(u.Path, "/")
	if path != "" {
		if !strings.HasPrefix(path, "/") {
			pathPart += "/"
		}
		pathPart += path
	}

	if u.RawQuery != "" {
		query, err := url.QueryUnescape(u.RawQuery)
		if err != nil {
			// Better luck next time.
			query = u.RawQuery
		}

		pathPart += "?" + query
	}

	if u.Fragment != "" {
		pathPart += "#" + u.Fragment
	}

	return hostPart, pathPart
}

// CleanerLink is like CleanerLinkParts, but returns both parts joined back
// into a single string.
func CleanerLink(a string) string {
	hostPart, pathPart := CleanerLinkParts(a)
	return hostPart + pathPart
}

const (
diff --git a/types/types_test.go b/types/types_test.go
new file mode 100644
index 0000000..6eb27e0
--- /dev/null
+++ b/types/types_test.go
@@ -0,0 +1,23 @@
package types

import "testing"

// TestCleanerLinkParts feeds sample links to CleanerLinkParts and reports any
// mismatch in either of the two returned parts.
func TestCleanerLinkParts(t *testing.T) {
	cases := []struct {
		link      string
		wantLeft  string
		wantRight string
	}{
		{"gopher://foo.bar/baz", "gopher://foo.bar", "/baz"},
		{"https://example.com/", "example.com", ""},
		{"http://xn--d1ahgkh6g.xn--90aczn5ei/%F0%9F%96%A4", "юникод.любовь", "/🖤"},
		{"http://юникод.любовь/🖤", "юникод.любовь", "/🖤"},
		{"http://example.com/?query=param#a/b", "example.com", "?query=param#a/b"},
		{"mailto:user@example.com", "mailto:user@example.com", ""},
		{"tel:+55551234567", "tel:+55551234567", ""},
	}

	for _, tc := range cases {
		gotLeft, gotRight := CleanerLinkParts(tc.link)
		if gotLeft != tc.wantLeft {
			t.Errorf("Wrong left part for `%s`: expected `%s`, got `%s`", tc.link, tc.wantLeft, gotLeft)
		}
		if gotRight != tc.wantRight {
			t.Errorf("Wrong right part for `%s`: expected `%s`, got `%s`", tc.link, tc.wantRight, gotRight)
		}
	}
}
diff --git a/web/templates.go b/web/templates.go
index db1c3b1..8751414 100644
--- a/web/templates.go
+++ b/web/templates.go
@@ -10,7 +10,6 @@ import (
	"log"
	"math/rand"
	"net/http"
	"strings"
	"time"
)

@@ -88,12 +87,8 @@ var funcMapForPosts = template.FuncMap{
		return t.Format("2006-01-02 15:04")
	},
	// shortenLink renders the link a for display: the scheme/host part as
	// plain text, followed by the path, query and fragment wrapped in a
	// <span class="url-path"> element.
	"shortenLink": func(a string) template.HTML {
		hostPart, pathPart := types.CleanerLinkParts(a)
		// Both parts contain percent-decoded text taken from the URL, so they
		// are escaped before the result is marked as trusted HTML.
		result := template.HTMLEscapeString(hostPart)
		result += fmt.Sprintf(`<span class="url-path">%s</span>`, template.HTMLEscapeString(pathPart))
		return template.HTML(result)
	},
	"mycomarkup": myco.MarkupToHTML,
-- 
2.42.0
LGTM. Thanks!