~williewillus/public-inbox

Add safe downloading v1 PROPOSED

: 1
 Add safe downloading

 2 files changed, 85 insertions(+), 1 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~williewillus/public-inbox/patches/23993/mbox | git am -3
Learn more about email & git
View this thread in the archives

[PATCH] Add safe downloading Export this patch

From: gamma-delta <29877714+gamma-delta@users.noreply.github.com>

---
 main.rkt              | 68 ++++++++++++++++++++++++++++++++++++++++++-
 scribblings/r16.scrbl | 18 ++++++++++++
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/main.rkt b/main.rkt
index ac6be6f..112048c 100755
--- a/main.rkt
+++ b/main.rkt
@@ -8,7 +8,7 @@

 (prefix-in ev: "evaluator.rkt")
 "log.rkt"
 (only-in net/url get-pure-port string->url)
 (only-in net/url get-pure-port string->url url url-host url-scheme)
 json
 threading)

@@ -368,6 +368,70 @@
         ; If empty byte string returned, return #f
         (and data (positive? (bytes-length data)) data)))))))

; Tree of dot-separated domain names, right to left.
; Each domain part is mapped either to a further tree of names,
; or #t. If you traverse the tree and reach a #t, the domain is OK;
; if you traverse the tree and either have a part not in the tree or
; run out of path parts, the domain is denied.
(define ok-domains (hash
  "com" (hash
    "discord" (hash
      "cdn" #t)
    "imgur" (hash
      "i" #t)
    "githubusercontent" (hash
      "gist" #t)
    "scryfall" (hash
      "c1" #t))
  "it" (hash
    "redd" (hash
      "i" #t))))
; pass listof strings right-to-left like (list "com" "discord" "cdn")
(define (vet-domain domain allow-tree) 
  (if (empty? domain) 
    #f ; Insta-deny it if we're out of segments (for example "discord.com/something")
    (match (hash-ref allow-tree (first domain) #f)
      [#f #f] ; Didn't know about this segment
      [#t #t] ; this segment is OK! (Overly specific is ok so no need to check for (empty? rest))
      [subtree (vet-domain (rest domain) subtree)] ; Keep recursing down
)))
(define max-download-bytes 500000) ; 500 kb
(define download-thread
  (thread
   (thunk
    (let loop ()
      (let* (
        [message (thread-receive)]
        [result (with-handlers ([exn:fail? identity]) (let* (
          [dl-url (car message)]
          [port (get-pure-port dl-url)]
          [payload (port->bytes port)]
          [amt (bytes-length payload)]
          [_ (if (< amt max-download-bytes) 
            #t 
            (error (~a "got too many bytes (" amt ") from download (max is " max-download-bytes ")")))])
          payload))])
        (channel-put (cdr message) result))
    (loop))
)))
(define/contract (checked-download dl-url-string) (-> string? bytes?) (let* (
  [dl-url (string->url dl-url-string)]
  ; If no scheme is provided add https as convenience
  [dl-url (if (false? (url-scheme dl-url)) (struct-copy url dl-url [scheme "https"]) dl-url)]
  ; If you're going to yell at me for bad code, yell at me for the indentation ok?
  [_ (if (member (url-scheme dl-url) (list "http" "https"))
    #t
    (error (~a "`" (url-scheme dl-url) "` is not an allowed scheme in the url `" dl-url-string "`")))]
  [domain (url-host dl-url)]
  [domain-segments (string-split domain "." #:trim? #f)]
  [vetted-ok (vet-domain (reverse domain-segments) ok-domains)]

  [_ (if vetted-ok #t (error (~a "`" domain "` is not an allowed domain in the url `" dl-url-string "`")))]
  [ch (make-channel)]
  [_ (thread-send download-thread (cons dl-url ch))])
  (channel-get ch)
))

(define (evaluation-ctx trick client message db context-id args parent-ctx)
  (let* ([placeholder (make-placeholder #f)]
         [ctx
@@ -390,6 +454,8 @@
            (message-author   . ,(message-author-id message))
            (read-storage     . ,(curry read-storage trick message))
            (write-storage    . ,(curry write-storage trick message))
            (download         . ,checked-download)
            (allowed-domains  . ,ok-domains)
            (parent-context   . ,parent-ctx))])
    (placeholder-set! placeholder (make-hash ctx))
    (cons (make-reader-graph ctx) '(threading))))
diff --git a/scribblings/r16.scrbl b/scribblings/r16.scrbl
index 940f851..516b5c4 100644
--- a/scribblings/r16.scrbl
+++ b/scribblings/r16.scrbl
@@ -74,6 +74,24 @@ This will always be a no-op when invoked from the eval command.
Thunk that deletes the message that invoked this sandbox.
}

@defproc[(download [url string?]) bytes?]{
Downloads data from the given URL and returns it as bytes.

Only certain domains are allowed in the URL; look at @racket[allowed-domains] for more info.
}

@defthing[allowed-domains (hash/c string? any/c)]{
Each domain part is mapped either to a further tree of names,
or @racket[#t]. If you traverse the tree and reach a @racket[#t], the domain is OK;
if you traverse the tree and either have a part not in the tree or
run out of path parts, the domain is denied.

This is used to vet domains passed to @racket[download].

The tree is from right-to-left; to vet @racket[cdn.discord.com], you would check @racket[com],
then check @racket[discord] in the found sub-tree, and then check @racket[cdn].
}

@defthing[parent-context (or/c (hash/c symbol? any/c) #f)]{
Mapping of all the above symbols for the trick calling this one, or @racket[#f] if this trick is the top level invocation.
}
-- 
2.32.0.windows.2