~sircmpwn/searchhut-devel

crawler: check content length before downloading v1 PROPOSED

Umar Getagazov (1):
 crawler: check content length before downloading

 1 file changed, 8 insertions(+), 4 deletions(-)

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/searchhut-devel/patches/33886/mbox | git am -3

[PATCH] crawler: check content length before downloading

The limit is 20 MiB. Since the Content-Length header can be absent or inaccurate, the limit is also enforced at read time via io.LimitReader.

Implements: https://todo.sr.ht/~sircmpwn/searchhut/22
---
 crawler/index.go | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
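
For illustration, here is a minimal standalone sketch of the two-layer guard this patch implements: trust Content-Length when it is present, then cap the bytes actually read in case it is not. This is not searchhut's actual code; fetchLimited and maxBodySize are made-up names.

package main

import (
	"crypto/sha512"
	"errors"
	"fmt"
	"io"
	"net/http"
)

// maxBodySize mirrors the 20 MiB cap from the patch.
const maxBodySize = 20 * 1024 * 1024

// fetchLimited rejects responses that declare an oversized Content-Length
// up front, then limits the bytes actually read in case the header is
// absent or wrong, hashing the body as it streams through.
func fetchLimited(url string) ([]byte, []byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, nil, err
	}
	defer resp.Body.Close()

	// Cheap pre-check: trust the header when the server sends one.
	if resp.ContentLength > maxBodySize {
		return nil, nil, errors.New("response too large")
	}

	// Hard limit: never read past the cap, whatever the header said.
	hash := sha512.New()
	reader := io.TeeReader(io.LimitReader(resp.Body, maxBodySize), hash)

	body, err := io.ReadAll(reader)
	if err != nil {
		return nil, nil, err
	}
	return body, hash.Sum(nil), nil
}

func main() {
	body, sum, err := fetchLimited("https://example.org/")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Printf("read %d bytes, sha512 prefix %x\n", len(body), sum[:8])
}

Note that io.LimitReader truncates silently at the cap rather than failing, so in the patched crawler an over-long page is parsed as a truncated document; a stricter variant would read maxBodySize+1 bytes and reject the page outright if the count exceeds the cap.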

diff --git a/crawler/index.go b/crawler/index.go
index 9ffb091..3ea7949 100644
--- a/crawler/index.go
+++ b/crawler/index.go
@@ -63,14 +63,15 @@ func (c *Crawler) Index(ctx context.Context, page scheduledPage) error {
 	url = resp.Request.URL
 	defer resp.Body.Close()
 
-	hash := sha512.New()
-	counter := counterWriter{}
-	reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
-
 	if !c.checkResponse(resp, page) {
 		return nil
 	}
 
+	hash := sha512.New()
+	counter := counterWriter{}
+	reader := io.LimitReader(resp.Body, 20*1024*1024)
+	reader = io.TeeReader(reader, io.MultiWriter(&counter, hash))
+
 	node, err := html.Parse(reader)
 	if err != nil {
 		return err
@@ -225,6 +226,9 @@ func (c *Crawler) checkResponse(resp *http.Response, page scheduledPage) bool {
 	} else if mt != "text/html" {
 		return false
 	}
+	if resp.ContentLength > 20*1024*1024 {
+		return false
+	}
 
 	return true
 }
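
One net/http detail worth noting here (an observation about the standard library, not part of the patch): resp.ContentLength is -1 when the server does not declare a length, e.g. with chunked transfer encoding, so the comparison in the hunk above lets such responses through and the io.LimitReader added in the first hunk becomes the only enforcement. A self-contained sketch:

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
)

func main() {
	// Flushing before the handler returns prevents net/http from
	// computing a Content-Length, forcing chunked transfer encoding.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.(http.Flusher).Flush()
		fmt.Fprint(w, "<html><body>hello</body></html>")
	}))
	defer srv.Close()

	resp, err := http.Get(srv.URL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Prints -1: not greater than the 20 MiB cap, so a
	// checkResponse-style pre-check passes and the read-time
	// limit does the real work.
	fmt.Println("ContentLength:", resp.ContentLength)
}
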
-- 
2.32.1 (Apple Git-133)