- scrapes Wikipedia urls for each city in acoli.db
---
Full url: 'https://en.wikipedia.org' + wiki_url
Note that there is an issue where some cities are not the first result
when searching for city/region/country. This is particularly true for
cities in regions whose abbreviations are common words, such as
Oregon (OR) and Indiana (IN).
This could be improved by introducing a 'regions' table and collecting
the full names of regions that are abbreviated.
numbeo/numbeo/spiders/wiki_urls.py | 54 ++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
create mode 100755 numbeo/numbeo/spiders/wiki_urls.py
diff --git a/numbeo/numbeo/spiders/wiki_urls.py b/numbeo/numbeo/spiders/wiki_urls.py
new file mode 100755
index 0000000..d30c404
--- /dev/null
+++ b/numbeo/numbeo/spiders/wiki_urls.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+import scrapy
+import sqlite3
+from urllib.parse import quote
+
+con = sqlite3.connect('acoli.db')
+cur = con.cursor()
+
+#cities = cur.execute('''
+# SELECT city_id, city_name, region, country_name
+# FROM cities NATURAL JOIN countries
+# ''').fetchall()
+
+try:
+ #con = sqlite3.connect('acoli.db')
+ #cur = con.cursor()
+
+ cities = cur.execute('''
+ SELECT city_id, city_name, region, country_name
+ FROM cities NATURAL JOIN countries
+ ''').fetchall()
+except sqlite3.OperationalError:
+ #print('acoli.db does not exist')
+ cities = []
+
+url_ids = {}
+
+class wikiUrlSpider(scrapy.Spider):
+ name = 'wiki_urls'
+ allowed_domains = ['wikipedia.com/']
+ start_urls = []
+ for city in cities:
+ url = 'https://en.wikipedia.org/w/index.php?search=' + \
+ '+'.join([quote(i).replace(' ', '+') \
+ for i in city[1:] if i != ''])
+ start_urls.append(url)
+ url_ids[url] = city[0]
+
+ def parse(self, response):
+ # Check if we have been redirected to a wiki page
+ if response.url.split('/')[-2] == 'wiki':
+ wiki_url = '/wiki/' + response.url.split('/')[-1]
+ else:
+ wiki_url = response.xpath(
+ '//li[@class="mw-search-result"]//a/@href').get()
+
+ start_url = response.request.meta.get('redirect_urls')[0]
+
+
+ yield {
+ 'city_id': url_ids[start_url],
+ 'wiki_url': wiki_url,
+ }
--
2.33.0
excellent -- applied
good catch with IN or OR in the spider