[PATCH] spiders: add wiki_images.py
Export this patch
- this spider scrapes image urls from the corresponding wikipedia
urls for each city
---
numbeo/numbeo/spiders/wiki_images.py | 45 ++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
create mode 100644 numbeo/numbeo/spiders/wiki_images.py
diff --git a/numbeo/numbeo/spiders/wiki_images.py b/numbeo/numbeo/spiders/wiki_images.py
new file mode 100644
index 0000000..f3c583e
--- /dev/null
+++ b/numbeo/numbeo/spiders/wiki_images.py
@@ -0,0 +1,45 @@
+import scrapy
+import sqlite3
+
+# Module-level database handles, opened when this module is imported.
+con = sqlite3.connect('acoli.db')
+cur = con.cursor()
+
+# ``cities`` holds one (city_id, wiki_url) row per city whose wiki_url
+# column is not NULL.  Cursor.execute() returns the cursor itself, so the
+# rows are fetched from ``cur`` directly.
+try:
+    cur.execute('''
+ SELECT city_id, wiki_url
+ FROM cities
+ WHERE wiki_url IS NOT NULL
+ '''
+    )
+    cities = cur.fetchall()
+except sqlite3.OperationalError:
+    # The query could not be run (presumably the table is not there yet);
+    # fall back to an empty list.
+    cities = []
+
+# Maps a URL to the city_id it belongs to.
+url_ids = {}
+
+
+def _wiki_url_for(city_id, wiki_path):
+    """Return the English Wikipedia URL to crawl for one city, or None.
+
+    A non-empty ``wiki_path`` is appended to the site root.  Otherwise the
+    URL is built from the city's name, region and country, looked up for
+    this specific ``city_id``.  None is returned when no usable name is
+    found.
+    """
+    if wiki_path:
+        return 'https://en.wikipedia.org' + wiki_path
+
+    # Restrict the lookup to the current city; without the WHERE clause
+    # fetchone() would return the same first row for every city.
+    row = cur.execute('''
+        SELECT city_name, region, country_name
+        FROM cities NATURAL JOIN COUNTRIES
+        WHERE city_id = ?
+        ''',
+        (city_id,),
+    ).fetchone()
+    if row is None:
+        return None
+
+    # Skip NULL/empty columns so str.join() never sees None.
+    words = ' '.join(part for part in row if part).split()
+    if not words:
+        return None
+    return 'https://en.wikipedia.org/wiki/' + '_'.join(words)
+
+
+class WikiImagesSpider(scrapy.Spider):
+    """Scrape the ``og:image`` URL from each city's Wikipedia page.
+
+    ``start_urls`` is built once, when the class body is executed, from the
+    module-level ``cities`` rows.  Each start URL is also recorded in the
+    module-level ``url_ids`` mapping so ``parse`` can find its city_id.
+    """
+
+    name = 'wiki_images'
+    allowed_domains = ['wikipedia.org']
+    start_urls = []
+    # Loop variables in a class body become class attributes, so they are
+    # underscore-prefixed and must never reuse ``name`` (the spider name).
+    for _city_id, _wiki_path in cities:
+        _url = _wiki_url_for(_city_id, _wiki_path)
+        if _url is None:
+            continue
+        url_ids[_url] = _city_id
+        start_urls.append(_url)
+
+    def parse(self, response):
+        """Yield one ``{'city_id', 'image_url'}`` item for a response.
+
+        ``image_url`` is the content of the page's ``og:image`` meta tag,
+        or None when the tag is absent.  Nothing is yielded (a warning is
+        logged) when the response cannot be matched to a city.
+        """
+        try:
+            meta = response.meta
+        except AttributeError:
+            # Response not tied to a request: no redirect history.
+            meta = {}
+
+        # After a redirect, response.url is the final address, not the one
+        # stored in url_ids.  The redirect middleware keeps the chain in
+        # meta['redirect_urls']; its first entry is the requested URL.
+        redirect_chain = meta.get('redirect_urls') or [response.url]
+        city_id = url_ids.get(redirect_chain[0])
+        if city_id is None:
+            city_id = url_ids.get(response.url)
+        if city_id is None:
+            self.logger.warning('no city_id recorded for %s', response.url)
+            return
+
+        wiki_img = response.xpath(
+            '//meta[@property="og:image"]/@content'
+        ).get()
+
+        yield {
+            'city_id': city_id,
+            'image_url': wiki_img,
+        }
--
2.33.0
thanks.
i'll reword the commit: correspondin -> corresponding