[PATCH] crawler: check content length before downloading
The limit is 20 MiB. Because Content-Length may be missing (resp.ContentLength
is -1 when unknown) or inaccurate, the limit is also enforced during the read
itself via io.LimitReader.
Implements: https://todo.sr.ht/~sircmpwn/searchhut/22
---
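For reviewers, a self-contained sketch of the reader pipeline this patch sets
up, runnable outside the crawler. counterWriter is paraphrased from
crawler/index.go; the sample body and main function are illustrative only:

package main

import (
	"crypto/sha512"
	"fmt"
	"io"
	"strings"
)

// counterWriter counts the bytes written through it (a paraphrase of the
// crawler's counterWriter type).
type counterWriter struct{ n int64 }

func (c *counterWriter) Write(p []byte) (int, error) {
	c.n += int64(len(p))
	return len(p), nil
}

func main() {
	body := strings.NewReader("<html><body>hello</body></html>")

	hash := sha512.New()
	counter := &counterWriter{}

	// io.LimitReader returns an *io.LimitedReader that reports EOF after
	// 20 MiB, so an oversized or unbounded body cannot be read past the
	// cap even when Content-Length was missing or wrong.
	var r io.Reader = io.LimitReader(body, 20*1024*1024)
	// Everything actually read is teed into the byte counter and the
	// SHA-512 hash, mirroring what the patched Index does before handing
	// the reader to html.Parse.
	r = io.TeeReader(r, io.MultiWriter(counter, hash))

	if _, err := io.Copy(io.Discard, r); err != nil {
		panic(err)
	}
	fmt.Printf("read %d bytes, sha512=%x\n", counter.n, hash.Sum(nil))
}
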
crawler/index.go | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/crawler/index.go b/crawler/index.go
index 9ffb091..3ea7949 100644
--- a/crawler/index.go
+++ b/crawler/index.go
@@ -63,14 +63,15 @@ func (c *Crawler) Index(ctx context.Context, page scheduledPage) error {
 	url = resp.Request.URL
 	defer resp.Body.Close()
 
-	hash := sha512.New()
-	counter := counterWriter{}
-	reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
-
 	if !c.checkResponse(resp, page) {
 		return nil
 	}
 
+	hash := sha512.New()
+	counter := counterWriter{}
+	reader := io.LimitReader(resp.Body, 20*1024*1024)
+	reader = io.TeeReader(reader, io.MultiWriter(&counter, hash))
+
 	node, err := html.Parse(reader)
 	if err != nil {
 		return err
@@ -225,6 +226,9 @@ func (c *Crawler) checkResponse(resp *http.Response, page scheduledPage) bool {
 	} else if mt != "text/html" {
 		return false
 	}
+	if resp.ContentLength > 20*1024*1024 {
+		return false
+	}
 	return true
 }
--
2.32.1 (Apple Git-133)