Umar Getagazov: crawler: parse sitemaps for URLs to crawl
1 file changed, 206 insertions(+), 0 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~sircmpwn/searchhut-devel/patches/33899/mbox | git am -3
The limits are 50 MiB of uncompressed data and max 50,000 URLs in each
sitemap. Additional sitemaps can be added using robots.txt. lastMod data
is used to exclude URLs from crawling if they haven't changed since the
last crawl.

Implements: https://todo.sr.ht/~sircmpwn/searchhut/15
---
Note: this may exacerbate the "shitload of URLs" problem

v2 -> v3: check if the request succeeded

 crawler/crawler.go | 206 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)

diff --git a/crawler/crawler.go b/crawler/crawler.go
index ffa3678..7c466cb 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -1,10 +1,14 @@
 package crawler
 
 import (
+	"bytes"
 	"context"
 	"database/sql"
+	"encoding/xml"
 	"fmt"
+	"io"
 	"log"
+	"mime"
 	"net/http"
 	"net/url"
 	"regexp"
@@ -24,6 +28,11 @@ type scheduledPage struct {
 	index bool
 }
 
+type sitemapURL struct {
+	loc     *url.URL
+	modTime time.Time
+}
+
 type Crawler struct {
 	Client *http.Client
 	Domain string
@@ -118,6 +127,113 @@ func (c *Crawler) Crawl() {
 		log.Println("Not indexing from root")
 	}
 
+	sitemaps := []string{fmt.Sprintf("https://%s/sitemap.xml", c.Domain)}
+	if c.Robots != nil && len(robots.Sitemaps) > 0 {
+		sitemaps = robots.Sitemaps
+	}
+
+	for _, sitemap := range sitemaps {
+		url, err = url.Parse(sitemap)
+		if err != nil {
+			continue
+		}
+		resp, err = c.Get(ctx, url)
+		if err != nil {
+			log.Printf("Error fetching sitemap %s: %v", sitemap, err)
+			continue
+		}
+
+		if resp.StatusCode != http.StatusOK {
+			continue
+		}
+		contentType := resp.Header.Get("Content-Type")
+		if contentType == "" {
+			continue
+		}
+		if mt, _, err := mime.ParseMediaType(contentType); err != nil {
+			continue
+		} else if mt != "text/xml" {
+			continue
+		}
+
+		log.Printf("Found a sitemap at %s", sitemap)
+
+		reader := io.LimitedReader{R: resp.Body, N: 50 * 1024 * 1024}
+		sitemapUrls := c.parseSitemap(&reader)
+		resp.Body.Close()
+
+		// Variables holding URLs with lastMod data that are going to be checked
+		var (
+			locs      []string
+			urlMap    = make(map[string]*sitemapURL)
+			scheduled int
+		)
+		for i, url := range sitemapUrls {
+			if url.modTime.IsZero() {
+				// Don't queue it up for the staleness check and send it
+				// straight to the schedule
+				c.Schedule(url.loc, false)
+				scheduled += 1
+				continue
+			}
+
+			locs = append(locs, url.loc.String())
+			urlMap[url.loc.String()] = &sitemapUrls[i]
+			if url.loc.Path == "/" && url.loc.RawQuery == "" {
+				// This crawler considers root URLs to have no trailing slash,
+				// so queue up a URL without a trailing slash because we may
+				// have already indexed it
+				sitemapURL := url
+				rootURL := *url.loc
+				rootURL.Path = ""
+				locs = append(locs, rootURL.String())
+				sitemapURL.loc = &rootURL
+				urlMap[rootURL.String()] = &sitemapURL
+			}
+		}
+
+		if err := database.WithTx(ctx, &sql.TxOptions{ReadOnly: true}, func(tx *sql.Tx) error {
+			for len(locs) > 0 {
+				batch := locs[:]
+				if len(batch) > 500 {
+					batch = batch[:500]
+				}
+
+				rows, err := tx.QueryContext(ctx, `
+					SELECT
+						x.url,
+						page.last_index_date
+					FROM (SELECT unnest($1::text[])) AS x(url)
+					LEFT JOIN page ON page.url = x.url
+				`, pq.Array(batch))
+				if err != nil {
+					return err
+				}
+
+				for rows.Next() {
+					var pageURL string
+					var lastIndexDate sql.NullTime
+					if err := rows.Scan(&pageURL, &lastIndexDate); err != nil {
+						return err
+					}
+
+					sitemapURL := urlMap[pageURL]
+					if !lastIndexDate.Valid || sitemapURL.modTime.After(lastIndexDate.Time) {
+						c.Schedule(sitemapURL.loc, false)
+						scheduled += 1
+					} else {
+						c.Ignore(sitemapURL.loc)
+					}
+				}
+				locs = locs[len(batch):]
+			}
+			return nil
+		}); err != nil {
+			log.Printf("Error checking sitemap URLs for staleness: %v", err)
+		}
+		log.Printf("Scheduled %d URLs from the sitemap", scheduled)
+	}
+
 	for len(c.schedule) != 0 {
 		next := c.schedule[0]
 		c.schedule = c.schedule[1:]
@@ -186,6 +302,13 @@ func (c *Crawler) Schedule(url *url.URL, index bool) {
 	c.schedule = append(c.schedule, scheduledPage{&trimmed, index})
 }
 
+func (c *Crawler) Ignore(url *url.URL) {
+	trimmed := *url
+	trimmed.RawQuery = ""
+	trimmed.Fragment = ""
+	c.seen[trimmed.String()] = struct{}{}
+}
+
 func (c *Crawler) ScheduleLinks(from scheduledPage, node *html.Node) {
 	if node.Type == html.ElementNode && node.Data == "a" {
 		for _, attr := range node.Attr {
@@ -203,6 +326,68 @@ func (c *Crawler) ScheduleLinks(from scheduledPage, node *html.Node) {
 	}
 }
 
+func (c *Crawler) parseSitemap(r io.Reader) (urls []sitemapURL) {
+	// TODO: Parse <priority> and <changefreq> once crawling schedule is implemented
+	var (
+		decoder = xml.NewDecoder(r)
+		inside  = ""
+
+		loc     bytes.Buffer
+		lastMod bytes.Buffer
+	)
+	for {
+		tok, err := decoder.Token()
+		if err != nil {
+			// Either EOF or a parsing error
+			break
+		}
+		switch tok := tok.(type) {
+		case xml.StartElement:
+			inside = strings.ToLower(tok.Name.Local)
+		case xml.CharData:
+			switch inside {
+			case "loc":
+				if loc.Len()+len(tok) > 2048 {
+					break
+				}
+				loc.Write(tok)
+			case "lastmod":
+				if lastMod.Len()+len(tok) > 2048 {
+					break
+				}
+				lastMod.Write(tok)
+			}
+		case xml.EndElement:
+			if tok.Name.Local == "url" {
+				locStr := strings.TrimSpace(loc.String())
+				lastModStr := strings.TrimSpace(lastMod.String())
+				loc.Reset()
+				lastMod.Reset()
+
+				var sitemapURL sitemapURL
+				sitemapURL.loc, err = url.Parse(locStr)
+				if err != nil {
+					break
+				}
+				if sitemapURL.loc.Host != c.Domain {
+					break
+				}
+				if lastModStr != "" {
+					sitemapURL.modTime, err = parseW3CTime(lastModStr)
+					if err != nil {
+						sitemapURL.modTime = time.Time{}
+					}
+				}
+				urls = append(urls, sitemapURL)
+				if len(urls) > 50000 {
+					return
+				}
+			}
+		}
+	}
+	return
+}
+
 func getDomainLabels(domain string) []string {
 	etld, _ := psl.PublicSuffix(domain)
 	if len(domain) > len(etld)+1 {
@@ -227,3 +412,24 @@ func parseRegexes(patsStr []string) (pats []*regexp.Regexp) {
 	}
 	return
 }
+
+var timeFormats = []string{
+	"2006-01-02T15:04:05.999999999Z07:00",
+	"2006-01-02T15:04:05Z07:00",
+	"2006-01-02T15:04Z07:00",
+	"2006-01-02",
+	"2006-01",
+	"2006",
+}
+
+// parseW3CTime parses datetime strings according to the specification at
+// https://www.w3.org/TR/NOTE-datetime.
+func parseW3CTime(text string) (t time.Time, err error) {
+	for _, layout := range timeFormats {
+		t, err = time.Parse(layout, text)
+		if err == nil {
+			return
+		}
+	}
+	return
+}
-- 
2.32.1 (Apple Git-133)