~sircmpwn/searchhut-devel

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch

[PATCH] crawler: check content length before downloading

Details
Message ID
<20220714183413.5274-1-umar@handlerug.me>
DKIM signature
missing
Download raw message
Patch: +8 -4
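Skip responses whose Content-Length exceeds 20 MiB. Because that header may be
missing or inaccurate, the same 20 MiB limit is also enforced while reading the
body, by wrapping it with io.LimitReader (which returns an io.LimitedReader).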

Implements: https://todo.sr.ht/~sircmpwn/searchhut/22
---
 crawler/index.go | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/crawler/index.go b/crawler/index.go
index 9ffb091..3ea7949 100644
--- a/crawler/index.go
+++ b/crawler/index.go
@@ -63,14 +63,15 @@ func (c *Crawler) Index(ctx context.Context, page scheduledPage) error {
 	url = resp.Request.URL
 	defer resp.Body.Close()
 
-	hash := sha512.New()
-	counter := counterWriter{}
-	reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
-
 	if !c.checkResponse(resp, page) {
 		return nil
 	}
 
+	hash := sha512.New()
+	counter := counterWriter{}
+	reader := io.LimitReader(resp.Body, 20*1024*1024)
+	reader = io.TeeReader(reader, io.MultiWriter(&counter, hash))
+
 	node, err := html.Parse(reader)
 	if err != nil {
 		return err
@@ -225,6 +226,9 @@ func (c *Crawler) checkResponse(resp *http.Response, page scheduledPage) bool {
 	} else if mt != "text/html" {
 		return false
 	}
+	if resp.ContentLength > 20*1024*1024 {
+		return false
+	}
 
 	return true
 }
-- 
2.32.1 (Apple Git-133)
Reply to thread Export thread (mbox)