~sircmpwn/sr.ht-dev

pages.sr.ht: contrib: add tool to purge old sites from storage v1 APPLIED

Conrad Hoffmann: 1
 contrib: add tool to purge old sites from storage

 2 files changed, 166 insertions(+), 0 deletions(-)
#1145974 alpine.yml success
#1145975 archlinux.yml failed
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/sr.ht-dev/patches/49283/mbox | git am -3
Learn more about email & git

[PATCH pages.sr.ht] contrib: add tool to purge old sites from storage Export this patch

---
Potentially dangerous, for obvious reasons. Please scrutinize.

 contrib/cleanup-old-versions/README.md |   7 ++
 contrib/cleanup-old-versions/main.go   | 159 +++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 contrib/cleanup-old-versions/README.md
 create mode 100644 contrib/cleanup-old-versions/main.go

diff --git a/contrib/cleanup-old-versions/README.md b/contrib/cleanup-old-versions/README.md
new file mode 100644
index 0000000..beeef05
--- /dev/null
+++ b/contrib/cleanup-old-versions/README.md
@@ -0,0 +1,7 @@
# Delete obsolete site versions

Currently, old versions of published sites are left to accumulate in storage.
This tool finds these old, unreachable site versions and deletes them. It
should be safe to re-run this at any time (or re-start it in case of errors).

Build with `go build` and run the resulting executable on pages.sr.ht.
diff --git a/contrib/cleanup-old-versions/main.go b/contrib/cleanup-old-versions/main.go
new file mode 100644
index 0000000..2fe4972
--- /dev/null
+++ b/contrib/cleanup-old-versions/main.go
@@ -0,0 +1,159 @@
package main

import (
	"context"
	"database/sql"
	"log"
	"path"
	"strings"
	"sync"
	"time"

	"git.sr.ht/~sircmpwn/core-go/config"
	"git.sr.ht/~sircmpwn/core-go/s3"
	_ "github.com/lib/pq"
	"github.com/minio/minio-go/v7"
)

// Site identifies one site found while listing object storage.
type Site struct {
	// domain is the name used to look the site up in the database.
	domain string
	// key is the storage key the site was listed under; it is used as the
	// prefix when listing the site's versions.
	key string
}

// Version identifies one stored version of a site found while listing
// object storage.
type Version struct {
	// id is the identifier compared against the versions recorded in the
	// database.
	id string
	// key is the storage key the version was listed under; it is used as
	// the prefix when deleting the version's objects.
	key string
}

// NewSite builds a Site from the storage key it was listed under. The key is
// kept verbatim; the domain is the last element of the cleaned key, so a
// trailing slash on the key does not end up in the domain.
func NewSite(key string) Site {
	var site Site
	site.key = key
	site.domain = path.Base(path.Clean(key))
	return site
}

// NewVersion builds a Version from the storage key it was listed under. The
// key is kept verbatim; the ID is the last element of the cleaned key, so a
// trailing slash on the key does not end up in the ID.
func NewVersion(key string) Version {
	var version Version
	version.key = key
	version.id = path.Base(path.Clean(key))
	return version
}

// contains reports whether item is equal to at least one element of list.
// A nil or empty list never contains anything.
func contains(list []string, item string) bool {
	for idx := 0; idx < len(list); idx++ {
		if list[idx] != item {
			continue
		}
		return true
	}
	return false
}

// getCurrentVersions returns the version column of every row in the "sites"
// table whose domain column equals domain.
//
// A nil slice together with a nil error means that no row matched. If the
// query, a row scan, or the iteration over the result set fails, the error is
// returned and the slice is nil; a partial result is never returned. Callers
// treat every version missing from the result as obsolete, so a silently
// truncated list would be dangerous.
func getCurrentVersions(db *sql.DB, domain string) ([]string, error) {
	q := `SELECT version FROM "sites" WHERE domain = $1`
	rows, err := db.Query(q, domain)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var currentVersions []string

	for rows.Next() {
		var current string
		if err := rows.Scan(&current); err != nil {
			return nil, err
		}
		currentVersions = append(currentVersions, current)
	}
	// rows.Next returns false both at the end of the result set and when
	// iteration fails part-way through; only rows.Err tells them apart.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return currentVersions, nil
}

// main deletes obsolete site versions from object storage.
//
// It lists the entries directly below "<s3-prefix>sites/" (one per site),
// lists the entries directly below each site (one per version), and then
// recursively removes every version whose ID is not returned by
// getCurrentVersions for the site's domain. Sites are handled concurrently,
// one goroutine per site, started one second apart.
//
// Because keys are used as prefixes for recursive deletion, the code is
// deliberately conservative: listing errors and entries that are not
// "directory" prefixes (keys ending in "/") are never turned into a site or
// version. A listing error entry carries an empty key, and an empty or
// slash-less key used as a prefix would match unrelated objects.
func main() {
	log.Println("Starting...")

	conf := config.LoadConfig(":5112")

	pgcs, ok := conf.Get("pages.sr.ht", "connection-string")
	if !ok {
		log.Fatalf("No connection string provided in config.ini")
	}

	db, err := sql.Open("postgres", pgcs)
	if err != nil {
		log.Fatalf("Failed to open a database connection: %v", err)
	}

	mc, err := s3.NewClient(conf)
	if err != nil {
		log.Fatal(err)
	}

	bucket, ok := conf.Get("pages.sr.ht", "s3-bucket")
	if !ok || bucket == "" {
		log.Fatalf("No S3 bucket provided in config.ini")
	}
	// The prefix is optional: a missing value is treated like an empty one,
	// so the "found" result is intentionally ignored.
	prefix, _ := conf.Get("pages.sr.ht", "s3-prefix")

	if prefix != "" && !strings.HasSuffix(prefix, "/") {
		prefix = prefix + "/"
	}

	ctx := context.Background()
	listOpts := minio.ListObjectsOptions{
		Prefix: prefix + "sites/",
	}

	var wg sync.WaitGroup

	for entry := range mc.ListObjects(ctx, bucket, listOpts) {
		if entry.Err != nil {
			// An error entry has no usable key; treating it as a site
			// would list (and later delete) from an empty prefix.
			log.Printf("error listing sites: %s", entry.Err.Error())
			continue
		}
		if !strings.HasSuffix(entry.Key, "/") {
			// A plain object next to the site prefixes. Used as a listing
			// prefix it would also match sibling sites sharing its name
			// as a prefix.
			log.Printf("skipping unexpected object %s", entry.Key)
			continue
		}

		// Space goroutines a bit apart...
		time.Sleep(1 * time.Second)

		wg.Add(1)
		go func(site Site) {
			defer wg.Done()
			log.Printf("%s: processing...\n", site.domain)

			var versions []Version
			lo := minio.ListObjectsOptions{
				Prefix: site.key,
			}

			var listErr error
			for v := range mc.ListObjects(ctx, bucket, lo) {
				if v.Err != nil {
					// Remember the failure but keep draining the channel.
					listErr = v.Err
					continue
				}
				if !strings.HasSuffix(v.Key, "/") {
					// Not a version prefix; deleting by this key would
					// also hit versions that merely start with it.
					log.Printf("%s: skipping unexpected object %s", site.domain, v.Key)
					continue
				}
				versions = append(versions, NewVersion(v.Key))
			}
			if listErr != nil {
				log.Printf("%s: failed to list versions, skipping site: %s", site.domain, listErr.Error())
				return
			}

			// The listed versions can potentially contain one
			// which is being currently uploaded and is not yet in
			// the database. Make sure to wait for the timeout
			// period of upload requests (plus a little) so that
			// the version shows up in the database.
			// If yet another upload starts during that time, no
			// problem: the version it created is not in our
			// listing and will not be touched.
			log.Printf("%s: waiting for potential uploads to settle...\n", site.domain)
			time.Sleep(35 * time.Second)

			current, err := getCurrentVersions(db, site.domain)
			if err != nil {
				log.Fatalf("%s: failed to get current versions: %s", site.domain, err.Error())
			}

			log.Printf("%s: current versions: %s", site.domain, strings.Join(current, ", "))
			for _, v := range versions {
				if contains(current, v.id) {
					continue
				}
				log.Printf("%s: deleting %s", site.domain, v.key)
				key := v.key
				lo := minio.ListObjectsOptions{
					Prefix:    key,
					Recursive: true,
				}

				// Forward only successfully listed objects to the
				// removal call, and report listing errors instead of
				// handing error entries (which have no key) to it.
				objects := make(chan minio.ObjectInfo)
				go func() {
					defer close(objects)
					for obj := range mc.ListObjects(ctx, bucket, lo) {
						if obj.Err != nil {
							log.Printf("%s: error listing objects under %s: %s", site.domain, key, obj.Err.Error())
							continue
						}
						objects <- obj
					}
				}()

				result := mc.RemoveObjects(ctx, bucket, objects, minio.RemoveObjectsOptions{})
				for roe := range result {
					log.Printf("%s: error deleting object %s: %s", site.domain, roe.ObjectName, roe.Err.Error())
				}
			}
		}(NewSite(entry.Key))
	}
	wg.Wait()
	log.Println("Done.")
}
-- 
2.43.0
pages.sr.ht/patches: FAILED in 59s

[contrib: add tool to purge old sites from storage][0] from [Conrad Hoffmann][1]

[0]: https://lists.sr.ht/~sircmpwn/sr.ht-dev/patches/49283
[1]: mailto:ch@bitfehler.net

✗ #1145975 FAILED  pages.sr.ht/patches/archlinux.yml https://builds.sr.ht/~sircmpwn/job/1145975
✓ #1145974 SUCCESS pages.sr.ht/patches/alpine.yml    https://builds.sr.ht/~sircmpwn/job/1145974
There's a stray "to" in the README.
Thanks!

To git@git.sr.ht:~sircmpwn/pages.sr.ht
   6660d3d..ce6df7d  master -> master