
This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch

[PATCH v3] crawler: parse sitemaps for URLs to crawl

Message ID
DKIM signature
Download raw message
Patch: +206 -0
Each sitemap is limited to 50 MiB of uncompressed data and a maximum of
50,000 URLs. Additional sitemaps can be declared in robots.txt. lastMod
data is used to skip URLs that have not changed since the last crawl.

Implements: https://todo.sr.ht/~sircmpwn/searchhut/15
Note: this may exacerbate the "shitload of URLs" problem

v2 -> v3: check whether the sitemap request succeeded (HTTP 200)

 crawler/crawler.go | 206 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)

diff --git a/crawler/crawler.go b/crawler/crawler.go
index ffa3678..7c466cb 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -1,10 +1,14 @@
package crawler

import (
@@ -24,6 +28,11 @@ type scheduledPage struct {
	index bool

type sitemapURL struct {
	loc     *url.URL
	modTime time.Time

type Crawler struct {
	Client          *http.Client
	Domain          string
@@ -118,6 +127,113 @@ func (c *Crawler) Crawl() {
		log.Println("Not indexing from root")

	sitemaps := []string{fmt.Sprintf("https://%s/sitemap.xml", c.Domain)}
	if c.Robots != nil && len(robots.Sitemaps) > 0 {
		sitemaps = robots.Sitemaps

	for _, sitemap := range sitemaps {
		url, err = url.Parse(sitemap)
		if err != nil {
		resp, err = c.Get(ctx, url)
		if err != nil {
			log.Printf("Error fetching sitemap %s: %v", sitemap, err)

		if resp.StatusCode != http.StatusOK {
		contentType := resp.Header.Get("Content-Type")
		if contentType == "" {
		if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		} else if mt != "text/xml" {

		log.Printf("Found a sitemap at %s", sitemap)

		reader := io.LimitedReader{R: resp.Body, N: 50 * 1024 * 1024}
		sitemapUrls := c.parseSitemap(&reader)

		// Variables holding URLs with lastMod data that are going to be checked
		var (
			locs      []string
			urlMap    = make(map[string]*sitemapURL)
			scheduled int
		for i, url := range sitemapUrls {
			if url.modTime.IsZero() {
				// Don't queue it up for the staleness check and send it
				// straight to the schedule
				c.Schedule(url.loc, false)
				scheduled += 1

			locs = append(locs, url.loc.String())
			urlMap[url.loc.String()] = &sitemapUrls[i]
			if url.loc.Path == "/" && url.loc.RawQuery == "" {
				// This crawler considers root URLs to have no trailing slash,
				// so queue up a URL without a trailing slash because we may
				// have already indexed it
				sitemapURL := url
				rootURL := *url.loc
				rootURL.Path = ""
				locs = append(locs, rootURL.String())
				sitemapURL.loc = &rootURL
				urlMap[rootURL.String()] = &sitemapURL

		if err := database.WithTx(ctx, &sql.TxOptions{ReadOnly: true}, func(tx *sql.Tx) error {
			for len(locs) > 0 {
				batch := locs[:]
				if len(batch) > 500 {
					batch = batch[:500]

				rows, err := tx.QueryContext(ctx, `
					FROM (SELECT unnest($1::text[])) AS x(url)
					LEFT JOIN page ON page.url = x.url
				`, pq.Array(batch))
				if err != nil {
					return err

				for rows.Next() {
					var pageURL string
					var lastIndexDate sql.NullTime
					if err := rows.Scan(&pageURL, &lastIndexDate); err != nil {
						return err

					sitemapURL := urlMap[pageURL]
					if !lastIndexDate.Valid || sitemapURL.modTime.After(lastIndexDate.Time) {
						c.Schedule(sitemapURL.loc, false)
						scheduled += 1
					} else {
				locs = locs[len(batch):]
			return nil
		}); err != nil {
			log.Printf("Error checking sitemap URLs for staleness: %v", err)
		log.Printf("Scheduled %d URLs from the sitemap", scheduled)

	for len(c.schedule) != 0 {
		next := c.schedule[0]
		c.schedule = c.schedule[1:]
@@ -186,6 +302,13 @@ func (c *Crawler) Schedule(url *url.URL, index bool) {
	c.schedule = append(c.schedule, scheduledPage{&trimmed, index})

func (c *Crawler) Ignore(url *url.URL) {
	trimmed := *url
	trimmed.RawQuery = ""
	trimmed.Fragment = ""
	c.seen[trimmed.String()] = struct{}{}

func (c *Crawler) ScheduleLinks(from scheduledPage, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
@@ -203,6 +326,68 @@ func (c *Crawler) ScheduleLinks(from scheduledPage, node *html.Node) {

func (c *Crawler) parseSitemap(r io.Reader) (urls []sitemapURL) {
	// TODO: Parse <priority> and <changefreq> once crawling schedule is implemented
	var (
		decoder = xml.NewDecoder(r)
		inside  = ""

		loc     bytes.Buffer
		lastMod bytes.Buffer
	for {
		tok, err := decoder.Token()
		if err != nil {
			// Either EOF or a parsing error
		switch tok := tok.(type) {
		case xml.StartElement:
			inside = strings.ToLower(tok.Name.Local)
		case xml.CharData:
			switch inside {
			case "loc":
				if loc.Len()+len(tok) > 2048 {
			case "lastmod":
				if lastMod.Len()+len(tok) > 2048 {
		case xml.EndElement:
			if tok.Name.Local == "url" {
				locStr := strings.TrimSpace(loc.String())
				lastModStr := strings.TrimSpace(lastMod.String())

				var sitemapURL sitemapURL
				sitemapURL.loc, err = url.Parse(locStr)
				if err != nil {
				if sitemapURL.loc.Host != c.Domain {
				if lastModStr != "" {
					sitemapURL.modTime, err = parseW3CTime(lastModStr)
					if err != nil {
						sitemapURL.modTime = time.Time{}
				urls = append(urls, sitemapURL)
				if len(urls) > 50000 {

func getDomainLabels(domain string) []string {
	etld, _ := psl.PublicSuffix(domain)
	if len(domain) > len(etld)+1 {
@@ -227,3 +412,24 @@ func parseRegexes(patsStr []string) (pats []*regexp.Regexp) {

var timeFormats = []string{

// parseW3CTime parses datetime strings according to the specification at
// https://www.w3.org/TR/NOTE-datetime.
func parseW3CTime(text string) (t time.Time, err error) {
	for _, layout := range timeFormats {
		t, err = time.Parse(layout, text)
		if err == nil {
2.32.1 (Apple Git-133)
Reply to thread Export thread (mbox)