~sircmpwn/searchhut-devel

crawler: parse sitemaps for URLs to crawl v2 PROPOSED

Umar Getagazov: 1
 crawler: parse sitemaps for URLs to crawl

 1 file changed, 191 insertions(+), 0 deletions(-)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~sircmpwn/searchhut-devel/patches/33885/mbox | git am -3

[PATCH v2] crawler: parse sitemaps for URLs to crawl

The limits are 50 MiB of uncompressed data and a maximum of 50,000
URLs per sitemap. Additional sitemaps can be declared in robots.txt
with Sitemap directives. <lastmod> data is used to skip URLs that
haven't changed since they were last indexed.

Implements: https://todo.sr.ht/~sircmpwn/searchhut/15
---
Note: this may exacerbate the "shitload of URLs" problem

v1 -> v2: fix logging and bugs in logic
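
For reviewers, a rough sketch of how the new parser could be exercised
from a test in the crawler package (illustrative only, not part of the
patch; the file and test names are hypothetical):

// sitemap_sketch_test.go (hypothetical)
package crawler

import (
	"strings"
	"testing"
)

func TestParseSitemapSketch(t *testing.T) {
	c := &Crawler{Domain: "example.org"}
	sitemap := strings.NewReader(`<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.org/docs/</loc>
    <lastmod>2022-07-20</lastmod>
  </url>
  <url>
    <loc>https://elsewhere.example/skipped</loc>
  </url>
</urlset>`)

	urls := c.parseSitemap(sitemap)
	// Only the example.org entry should survive the domain check, and its
	// <lastmod> should be parsed by parseW3CTime into modTime.
	if len(urls) != 1 || urls[0].modTime.IsZero() {
		t.Fatalf("unexpected result: %v", urls)
	}
}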

 crawler/crawler.go | 191 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)

diff --git a/crawler/crawler.go b/crawler/crawler.go
index ffa3678..e6ef36b 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -1,9 +1,12 @@
package crawler

import (
	"bytes"
	"context"
	"database/sql"
	"encoding/xml"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
@@ -24,6 +27,11 @@ type scheduledPage struct {
	index bool
}

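// sitemapURL is one <url> entry from a sitemap: its location plus the
// <lastmod> time, if the sitemap provided one.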
type sitemapURL struct {
	loc     *url.URL
	modTime time.Time
}

type Crawler struct {
	Client          *http.Client
	Domain          string
@@ -118,6 +126,99 @@ func (c *Crawler) Crawl() {
		log.Println("Not indexing from root")
	}

	sitemaps := []string{fmt.Sprintf("https://%s/sitemap.xml", c.Domain)}
	if c.Robots != nil && len(robots.Sitemaps) > 0 {
		sitemaps = robots.Sitemaps
	}

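	// Fetch each candidate sitemap and decide, per URL, whether it needs to
	// be (re)crawled.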
	for _, sitemap := range sitemaps {
		url, err = url.Parse(sitemap)
		if err != nil {
			continue
		}
		resp, err = c.Get(ctx, url)
		if err != nil {
			log.Printf("Error fetching sitemap %s: %v", sitemap, err)
			continue
		}
		log.Printf("Found a sitemap at %s", sitemap)

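		// Read at most 50 MiB of the sitemap body, matching the protocol's
		// limit on uncompressed sitemap size.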
		reader := io.LimitedReader{R: resp.Body, N: 50 * 1024 * 1024}
		sitemapUrls := c.parseSitemap(&reader)
		resp.Body.Close()

		// URLs that carry <lastmod> data; these are checked against the
		// database in batches below before being scheduled
		var (
			locs      []string
			urlMap    = make(map[string]*sitemapURL)
			scheduled int
		)
		for i, url := range sitemapUrls {
			if url.modTime.IsZero() {
				// Don't queue it up for the staleness check and send it
				// straight to the schedule
				c.Schedule(url.loc, false)
				scheduled += 1
				continue
			}

			locs = append(locs, url.loc.String())
			urlMap[url.loc.String()] = &sitemapUrls[i]
			if url.loc.Path == "/" && url.loc.RawQuery == "" {
				// This crawler considers root URLs to have no trailing slash,
				// so also check the slash-less variant for staleness, since
				// that is the form we may already have indexed
				sitemapURL := url
				rootURL := *url.loc
				rootURL.Path = ""
				locs = append(locs, rootURL.String())
				sitemapURL.loc = &rootURL
				urlMap[rootURL.String()] = &sitemapURL
			}
		}

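		// Check the collected URLs against the page table in batches of 500:
		// pages we have never indexed, or whose lastmod is newer than their
		// last index date, are scheduled; everything else is marked as seen.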
		if err := database.WithTx(ctx, &sql.TxOptions{ReadOnly: true}, func(tx *sql.Tx) error {
			for len(locs) > 0 {
				batch := locs[:]
				if len(batch) > 500 {
					batch = batch[:500]
				}

				rows, err := tx.QueryContext(ctx, `
					SELECT
						x.url,
						page.last_index_date
					FROM (SELECT unnest($1::text[])) AS x(url)
					LEFT JOIN page ON page.url = x.url
				`, pq.Array(batch))
				if err != nil {
					return err
				}

				for rows.Next() {
					var pageURL string
					var lastIndexDate sql.NullTime
					if err := rows.Scan(&pageURL, &lastIndexDate); err != nil {
						return err
					}

					sitemapURL := urlMap[pageURL]
					if !lastIndexDate.Valid || sitemapURL.modTime.After(lastIndexDate.Time) {
						c.Schedule(sitemapURL.loc, false)
						scheduled += 1
					} else {
						c.Ignore(sitemapURL.loc)
					}
				}
				// Close this batch's result set before issuing the next query
				// on the same transaction, and surface any iteration error.
				rows.Close()
				if err := rows.Err(); err != nil {
					return err
				}
				locs = locs[len(batch):]
			}
			return nil
		}); err != nil {
			log.Printf("Error checking sitemap URLs for staleness: %v", err)
		}
		log.Printf("Scheduled %d URLs from the sitemap", scheduled)
	}

	for len(c.schedule) != 0 {
		next := c.schedule[0]
		c.schedule = c.schedule[1:]
@@ -186,6 +287,13 @@ func (c *Crawler) Schedule(url *url.URL, index bool) {
	c.schedule = append(c.schedule, scheduledPage{&trimmed, index})
}

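// Ignore records a URL as seen without scheduling it, so a sitemap entry that
// has not changed since its last index is not crawled again during this run.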
func (c *Crawler) Ignore(url *url.URL) {
	trimmed := *url
	trimmed.RawQuery = ""
	trimmed.Fragment = ""
	c.seen[trimmed.String()] = struct{}{}
}

func (c *Crawler) ScheduleLinks(from scheduledPage, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
@@ -203,6 +311,68 @@ func (c *Crawler) ScheduleLinks(from scheduledPage, node *html.Node) {
	}
}

func (c *Crawler) parseSitemap(r io.Reader) (urls []sitemapURL) {
	// TODO: Parse <priority> and <changefreq> once crawling schedule is implemented
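	// Decode the document token by token so it never has to be held in memory
	// in full; the caller bounds the input size with an io.LimitedReader.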
	var (
		decoder = xml.NewDecoder(r)
		inside  = ""

		loc     bytes.Buffer
		lastMod bytes.Buffer
	)
	for {
		tok, err := decoder.Token()
		if err != nil {
			// Either EOF or a parsing error
			break
		}
		switch tok := tok.(type) {
		case xml.StartElement:
			inside = strings.ToLower(tok.Name.Local)
		case xml.CharData:
			switch inside {
			case "loc":
				if loc.Len()+len(tok) > 2048 {
					break
				}
				loc.Write(tok)
			case "lastmod":
				if lastMod.Len()+len(tok) > 2048 {
					break
				}
				lastMod.Write(tok)
			}
		case xml.EndElement:
			if tok.Name.Local == "url" {
				locStr := strings.TrimSpace(loc.String())
				lastModStr := strings.TrimSpace(lastMod.String())
				loc.Reset()
				lastMod.Reset()

				var sitemapURL sitemapURL
				sitemapURL.loc, err = url.Parse(locStr)
				if err != nil {
					break
				}
				if sitemapURL.loc.Host != c.Domain {
					break
				}
				if lastModStr != "" {
					sitemapURL.modTime, err = parseW3CTime(lastModStr)
					if err != nil {
						sitemapURL.modTime = time.Time{}
					}
				}
				urls = append(urls, sitemapURL)
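				// Stop at the protocol's limit of 50,000 URLs per sitemap.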
				if len(urls) >= 50000 {
					return
				}
			}
		}
	}
	return
}

func getDomainLabels(domain string) []string {
	etld, _ := psl.PublicSuffix(domain)
	if len(domain) > len(etld)+1 {
@@ -227,3 +397,24 @@ func parseRegexes(patsStr []string) (pats []*regexp.Regexp) {
	}
	return
}

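// Layouts accepted for <lastmod> values, covering the W3C datetime profiles
// from full timestamps down to bare years.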
var timeFormats = []string{
	"2006-01-02T15:04:05.999999999Z07:00",
	"2006-01-02T15:04:05Z07:00",
	"2006-01-02T15:04Z07:00",
	"2006-01-02",
	"2006-01",
	"2006",
}

// parseW3CTime parses datetime strings according to the specification at
// https://www.w3.org/TR/NOTE-datetime.
func parseW3CTime(text string) (t time.Time, err error) {
	for _, layout := range timeFormats {
		t, err = time.Parse(layout, text)
		if err == nil {
			return
		}
	}
	return
}
-- 
2.32.1 (Apple Git-133)