~natpen/gus

[PATCH gus] stop fetching when 5 consecutive requests have failed

Message ID: <20210307190051.3422-1-rwagner@rw-net.de>
Sometimes capsules stop working during a crawl, which leaves their
requests waiting for a timeout. This is especially annoying when we
try to fetch a large number of pages from these capsules.

The limit only applies to the current crawl; the skipped capsules
will be fetched again on the next crawl.
---
 gus/constants.py |  1 +
 gus/crawl.py     | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)
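
As a quick orientation before the diff: here is a minimal standalone
sketch of the per-host bookkeeping the patch adds. The dict and constant
names follow the diff below; the three helper functions are only
illustrative and are not part of the patch, whose logic lives inline in
crawl_page() and run_crawl().

# Sketch only; names other than failure_count and
# MAXIMUM_FAILED_REQUEST_COUNT are hypothetical.
MAXIMUM_FAILED_REQUEST_COUNT = 5  # mirrors gus/constants.py

failure_count = {}  # reset at the start of every crawl (see run_crawl below)

def should_skip(host):
    # Skip a host once more than MAXIMUM_FAILED_REQUEST_COUNT consecutive
    # requests to it have failed without any response.
    return failure_count.get(host, 0) > MAXIMUM_FAILED_REQUEST_COUNT

def record_failure(host):
    # A request that gets no response (e.g. a timeout) bumps the counter.
    failure_count[host] = failure_count.get(host, 0) + 1

def record_response(host):
    # Any response at all, even an error status, clears the counter,
    # so only consecutive failures lead to skipping.
    failure_count[host] = 0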

diff --git a/gus/constants.py b/gus/constants.py
index 90ad2a6..d5b8a44 100644
--- a/gus/constants.py
+++ b/gus/constants.py
@@ -5,6 +5,7 @@ STATISTICS_FILE = "statistics.csv"
 DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
+MAXIMUM_FAILED_REQUEST_COUNT = 5
 MAXIMUM_TEXT_PAGE_SIZE = 100000 # 100KB, in bytes
 
 # default change frequencies (in hours)
diff --git a/gus/crawl.py b/gus/crawl.py
index bdc0c75..ecf5750 100644
--- a/gus/crawl.py
+++ b/gus/crawl.py
@@ -406,8 +406,12 @@ def crawl_page(
 ):
     gr = gemini_resource
     url = gr.fetchable_url
-    if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
-        logging.warn(
+    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
+        logging.warn(
+            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url))
+        return
+    if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
+        logging.warn(
             "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
         )
         return
@@ -501,8 +505,12 @@ def crawl_page(
             page=page, status=0, is_different=False, timestamp=datetime.utcnow()
         )
         page_crawl.save()
+        failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
+        logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
+        return
 
-    elif response.status.startswith("4"):
+    failure_count[gr.normalized_host] = 0
+    if response.status.startswith("4"):
         # temporary error status
         logging.debug(
             "Got temporary error: %s: %s %s",
@@ -790,6 +798,9 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global max_crawl_depth
     max_crawl_depth = 100
 
+    global failure_count
+    failure_count = {}
+
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]
     for resource in expired_resources:
         crawl_page(resource, 0, should_check_if_expired=False)
-- 
2.30.1