Rebecca Medrano: 4 patches

  spiders: add number of contributors to qol.py
  json2sqlite.py: update table columns
  spiders: scrape location data from wikipedia
  spiders: update wiki_data location scraping

 10 files changed, 173 insertions(+), 108 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26764/mbox | git am -3
- add minimum and maximum number of contributors from Numbeo Quality of Life
  page to qol spider
- update json2sqlite.py to replace '?' with NULL when inserting into
  quality_of_life table
---
 numbeo/json2sqlite.py        |  3 +++
 numbeo/numbeo/spiders/qol.py | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index a76a107..09130d1 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -130,6 +130,9 @@ cur.execute('''
 with open('qoli.json') as f:
     cities = json.load(f)
     for city in cities:
+        for key in city.keys():
+            if city[key] == '' or city[key] == '?':
+                city[key] = None
         city_id = cur.execute(f'''
                 SELECT city_id
                 FROM cities
diff --git a/numbeo/numbeo/spiders/qol.py b/numbeo/numbeo/spiders/qol.py
index 1e72f91..24b161f 100755
--- a/numbeo/numbeo/spiders/qol.py
+++ b/numbeo/numbeo/spiders/qol.py
@@ -41,6 +41,8 @@ class QOLSpider(scrapy.Spider):
             property_price_to_income_ratio = '?'
             traffic_commute_time_index = '?'
             pollution_index = '?'
+            max_contributors = ''
+            min_contributors = ''
         else:
             quality_of_life_index = response.xpath(
                 '//div/table/tr[contains(., "Quality of Life Index")]/td//text()').getall()[2].strip()
@@ -61,6 +63,13 @@ class QOLSpider(scrapy.Spider):
             pollution_index = response.xpath(
                 '//div/table/tr[contains(., "Pollution Index")]/td//text()').getall()[2]
 
+            max_contributors = response.xpath(
+                '//span[@class="reportees"]/text()[contains(., "Maximum")]') \
+                .get().split(':')[-1].strip()
+            min_contributors = response.xpath(
+                '//span[@class="reportees"]/text()[contains(., "Minimum")]') \
+                .get().split(':')[-1].strip()
+
         yield {
             'city': header[0].strip(),
             'region': header[1].strip() if len(header) > 2 else '',
@@ -76,6 +85,8 @@ class QOLSpider(scrapy.Spider):
             'property_price_to_income_ratio': property_price_to_income_ratio,
             'traffic_commute_time_index': traffic_commute_time_index,
             'pollution_index': pollution_index,
+            'max_contributors': max_contributors,
+            'min_contributors': min_contributors,
         }
         # else:
         #     with open('bad_urls.txt', 'a') as f:
--
2.34.0
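A minimal sketch for context (not part of the patch; the helper name is
illustrative): the json2sqlite.py change above maps placeholder values to
None so that sqlite3 binds them as SQL NULL in the parameterized inserts.

    # Sketch of the '?'/'' -> NULL normalization described in this patch.
    # A value of None is stored as SQL NULL when passed as a bound parameter.
    def normalize_placeholders(city):
        for key in city:
            if city[key] in ('', '?'):
                city[key] = None
        return city

    city = {'quality_of_life_index': '?', 'max_contributors': '123'}
    print(normalize_placeholders(city))
    # {'quality_of_life_index': None, 'max_contributors': '123'}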
- add contributors to quality_of_life and timeseries tables
- give timestamp columns unique names to facilitate natural joins
- spiders: qol.py and climate.py now import datetime.datetime instead of
  entire datetime module
---
 numbeo/json2sqlite.py            | 66 ++++++++++++++++----------------
 numbeo/numbeo/spiders/climate.py |  5 +--
 numbeo/numbeo/spiders/qol.py     |  5 +--
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index 09130d1..d2e80a0 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -45,7 +45,8 @@ cur.execute('''
         country_id INTEGER NOT NULL,
         city_url VARCHAR(200) NOT NULL UNIQUE,
         wiki_url VARCHAR(200),
-        FOREIGN KEY (country_id) REFERENCES countries(id)
+        FOREIGN KEY (country_id) REFERENCES countries(id),
+        CONSTRAINT unique_name UNIQUE (city_name, region, country_id)
     )
 ''')
 
@@ -87,7 +88,7 @@ with open('wiki_urls.json') as f:
 # This table contains the most recent QOL data for each city
 cur.execute('''
     CREATE TABLE IF NOT EXISTS quality_of_life (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        qol_id INTEGER PRIMARY KEY AUTOINCREMENT,
         city_id INTEGER NOT NULL UNIQUE,
         qol_timestamp TIMESTAMP,
         quality_of_life_index DECIMAL(5, 2),
@@ -99,15 +100,17 @@ cur.execute('''
         property_price_to_income_ratio DECIMAL(5, 2),
         traffic_commute_time_index DECIMAL(5, 2),
         pollution_index DECIMAL(5, 2),
-        FOREIGN KEY (city_id) REFERENCES cities(id)
+        min_contributors INTEGER,
+        max_contributors INTEGER,
+        FOREIGN KEY (city_id) REFERENCES cities(city_id)
     )
 ''')
 
 # Create time_series_quality_of_life table
 # This table contains time series QOL data for each city
 cur.execute('''
-    CREATE TABLE IF NOT EXISTS quality_of_life (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
+    CREATE TABLE IF NOT EXISTS time_series_quality_of_life (
+        qol_ts_id INTEGER PRIMARY KEY AUTOINCREMENT,
         city_id INTEGER NOT NULL,
         qol_timestamp TIMESTAMP,
         quality_of_life_index DECIMAL(5, 2),
@@ -118,8 +121,10 @@ cur.execute('''
         cost_of_living_index DECIMAL(5, 2),
         property_price_to_income_ratio DECIMAL(5, 2),
         traffic_commute_time_index DECIMAL(5, 2),
+        min_contributors INTEGER,
+        max_contributors INTEGER,
         pollution_index DECIMAL(5, 2),
-        FOREIGN KEY (city_id) REFERENCES cities(id),
+        FOREIGN KEY (city_id) REFERENCES cities(city_id),
         CONSTRAINT unique_record UNIQUE (city_id, qol_timestamp)
     )
 ''')
@@ -145,7 +150,7 @@ with open('qoli.json') as f:
             ''').fetchone()
         # Check if there is an existing record
         # if so, compare timestamps
-        if not ts or ts[0] < city['qol_timestamp']:
+        if not ts or ts[0] < city['timestamp']:
             # Insert into quality_of_life table or replace existing record
             cur.execute('''
                 REPLACE INTO quality_of_life (
@@ -159,11 +164,13 @@ with open('qoli.json') as f:
                     cost_of_living_index,
                     property_price_to_income_ratio,
                     traffic_commute_time_index,
-                    pollution_index
+                    pollution_index,
+                    min_contributors,
+                    max_contributors
                 )
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 ''', (city_id,
-                      city['qol_timestamp'],
+                      city['timestamp'],
                       city['quality_of_life_index'],
                       city['purchasing_power_index'],
                       city['safety_index'],
@@ -172,11 +179,13 @@ with open('qoli.json') as f:
                       city['cost_of_living_index'],
                       city['property_price_to_income_ratio'],
                       city['traffic_commute_time_index'],
-                      city['pollution_index'],)
+                      city['pollution_index'],
+                      city['min_contributors'],
+                      city['max_contributors'],)
            )
            # Insert into time series table or ignore if record exists
            cur.execute('''
-                INSERT OR IGNORE INTO quality_of_life (
+                INSERT OR IGNORE INTO time_series_quality_of_life (
                     city_id,
                     qol_timestamp,
                     quality_of_life_index,
@@ -187,11 +196,13 @@ with open('qoli.json') as f:
                     cost_of_living_index,
                     property_price_to_income_ratio,
                     traffic_commute_time_index,
-                    pollution_index
+                    pollution_index,
+                    min_contributors,
+                    max_contributors
                 )
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 ''', (city_id,
-                      city['qol_timestamp'],
+                      city['timestamp'],
                       city['quality_of_life_index'],
                       city['purchasing_power_index'],
                       city['safety_index'],
@@ -200,14 +211,16 @@ with open('qoli.json') as f:
                       city['cost_of_living_index'],
                       city['property_price_to_income_ratio'],
                       city['traffic_commute_time_index'],
-                      city['pollution_index'],)
+                      city['pollution_index'],
+                      city['min_contributors'],
+                      city['max_contributors'],)
            )
 
 
 # Create climate table
 cur.execute('''
     CREATE TABLE IF NOT EXISTS climate (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        climate_id INTEGER PRIMARY KEY AUTOINCREMENT,
         city_id INTEGER NOT NULL UNIQUE,
         climate_timestamp TIMESTAMP,
         jan_score DECIMAL(5, 2), feb_score DECIMAL(5, 2),
@@ -228,7 +241,7 @@ cur.execute('''
         oct_low VARCHAR(10), oct_high VARCHAR(10),
         nov_low VARCHAR(10), nov_high VARCHAR(10),
         dec_low VARCHAR(10), dec_high VARCHAR(10),
-        FOREIGN KEY (city_id) REFERENCES cities(id)
+        FOREIGN KEY (city_id) REFERENCES cities(city_id)
     )
 ''')
 
@@ -265,7 +278,7 @@ with open('climate.json') as f:
                 ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                 ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                 ?, ?, ?, ?, ?, ?, ?, ?, ?)
-            ''', (city_id, city['climate_timestamp'],
+            ''', (city_id, city['timestamp'],
                   city['jan_score'], city['feb_score'],
                   city['mar_score'], city['apr_score'],
                   city['may_score'], city['jun_score'],
@@ -291,7 +304,8 @@ cur.execute('''
     CREATE TABLE IF NOT EXISTS image_urls (
         id INTEGER PRIMARY KEY AUTOINCREMENT,
         city_id INTEGER NOT NULL UNIQUE,
-        image_url VARCHAR(500)
+        image_url VARCHAR(500),
+        FOREIGN KEY (city_id) REFERENCES cities(city_id)
     )
 ''')
 
@@ -307,17 +321,5 @@ with open('wiki_images.json') as f:
     )
 
 
-# uncomment to print data being added to db
-# consider implementing a flag that offers verbose output of the following
-#for row in cur.execute('''
-#    SELECT *
-#    FROM quality_of_life
-#    INNER JOIN cities ON city_id = cities.id
-#    INNER JOIN countries
-#        ON country_id = countries.id
-#    WHERE cities.region = 'CA'
-#    '''):
-#    print(row)
-
 con.commit()
 con.close()
diff --git a/numbeo/numbeo/spiders/climate.py b/numbeo/numbeo/spiders/climate.py
index 19b65d6..ada59bb 100755
--- a/numbeo/numbeo/spiders/climate.py
+++ b/numbeo/numbeo/spiders/climate.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3
 
-import datetime
+from datetime import datetime
 import json
 import scrapy
-from scrapy.http import FormRequest
 
 with open('qoli.json', 'r') as f:
     city_list = json.load(f)
@@ -42,7 +41,7 @@ class ClimateSpider(scrapy.Spider):
 
         yield {
             'city_url': city_url,
-            'climate_timestamp': datetime.datetime.now(),
+            'timestamp': datetime.now(),
             'jan_score': climate_scores[0].strip(),
             'jan_low': low_high[0][0].strip(),
             'jan_high': low_high[0][1].strip(),
diff --git a/numbeo/numbeo/spiders/qol.py b/numbeo/numbeo/spiders/qol.py
index 24b161f..0f44586 100755
--- a/numbeo/numbeo/spiders/qol.py
+++ b/numbeo/numbeo/spiders/qol.py
@@ -3,10 +3,9 @@
 # SPIDER # 3 - QUALITY OF LIFE
 # Use this spider AFTER numbeo_cities and numbeo_countries
 
-import datetime
+from datetime import datetime
 import json
 import scrapy
-from scrapy.http import FormRequest
 
 with open('cities.json', 'r') as f:
     city_list = json.load(f)
@@ -75,7 +74,7 @@ class QOLSpider(scrapy.Spider):
             'region': header[1].strip() if len(header) > 2 else '',
             'country': header[2].strip() if len(header) > 2 else header[1].strip(),
             'city_url': response.url.split('/')[-1],
-            'qol_timestamp': datetime.datetime.now(),
+            'timestamp': datetime.now(),
             'quality_of_life_index': quality_of_life_index,
             'purchasing_power_index': purchasing_power_index,
             'safety_index': safety_index,
--
2.34.0
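To illustrate why the per-table column names matter, here is a hedged sketch
against the schema above (not code from the patch): with distinct primary-key
names (qol_id, climate_id) and distinct timestamp names, city_id is the only
column name these tables share, so a NATURAL JOIN pairs rows by city alone.

    import sqlite3

    con = sqlite3.connect('acoli.db')  # db file used by the spiders in this repo
    cur = con.cursor()

    # city_id is the only shared column name, so NATURAL JOIN matches on it only
    for row in cur.execute('''
        SELECT city_name, quality_of_life_index, jan_score
        FROM cities
        NATURAL JOIN quality_of_life
        NATURAL JOIN climate
        '''):
        print(row)

    con.close()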
- rename wiki_images.py to wiki_data.py and scrape location coordinates
- update json2sqlite.py to include latitude and longitude columns in
  'cities' table
---
 numbeo/json2sqlite.py                | 15 +++++-
 numbeo/numbeo/spiders/wiki_data.py   | 68 ++++++++++++++++++++++++++++
 numbeo/numbeo/spiders/wiki_images.py | 45 ------------------
 numbeo/numbeo/spiders/wiki_urls.py   |  1 -
 4 files changed, 82 insertions(+), 47 deletions(-)
 create mode 100644 numbeo/numbeo/spiders/wiki_data.py
 delete mode 100644 numbeo/numbeo/spiders/wiki_images.py

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index d2e80a0..bd79961 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -43,6 +43,8 @@ cur.execute('''
         city_name VARCHAR(50) NOT NULL,
         region VARCHAR(50),
         country_id INTEGER NOT NULL,
+        latitude DECIMAL(3, 6),
+        longitude DECIMAL(3, 6),
         city_url VARCHAR(200) NOT NULL UNIQUE,
         wiki_url VARCHAR(200),
         FOREIGN KEY (country_id) REFERENCES countries(id),
@@ -82,6 +84,17 @@ with open('wiki_urls.json') as f:
             WHERE city_id = "{city['city_id']}"
             '''
         )
+# Add coordinates to cities table from wiki_urls.json
+with open('wiki_data.json') as f:
+    cities = json.load(f)
+    for city in cities:
+        cur.execute(f'''
+            UPDATE cities
+            SET latitude = {city['latitude']},
+                longitude = {city['longitude']}
+            WHERE city_id = "{city['city_id']}"
+            '''
+        )
 
 
 # Create quality_of_life table
@@ -310,7 +323,7 @@ cur.execute('''
 ''')
 
 # Fill image_urls table from images.json
-with open('wiki_images.json') as f:
+with open('wiki_data.json') as f:
     cities = json.load(f)
     for city in cities:
         cur.execute('''
diff --git a/numbeo/numbeo/spiders/wiki_data.py b/numbeo/numbeo/spiders/wiki_data.py
new file mode 100644
index 0000000..699bb99
--- /dev/null
+++ b/numbeo/numbeo/spiders/wiki_data.py
@@ -0,0 +1,68 @@
+import scrapy
+import sqlite3
+
+con = sqlite3.connect('acoli.db')
+cur = con.cursor()
+
+try:
+    cities = cur.execute('''
+        SELECT city_id, wiki_url
+        FROM cities
+        WHERE wiki_url IS NOT NULL
+        '''
+    ).fetchall()
+except sqlite3.OperationalError:
+    cities = []
+
+url_ids = {}
+
+
+class WikiDataSpider(scrapy.Spider):
+    name = 'wiki_data'
+    allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
+    start_urls = []
+    for city in cities:
+        if city[1]:
+            url = 'https://en.wikipedia.org' + city[1]
+        else:
+            name = cur.execute('''
+                SELECT city_name, region, country_name
+                FROM cities NATURAL JOIN COUNTRIES
+                '''
+            ).fetchone()
+            url = 'https://en.wikipedia.org/wiki/' + '_'.join(' '.join(name).split())
+        url_ids[url] = city[0]
+        start_urls.append(url)
+
+    def parse(self, response):
+        wiki_img = response.xpath(
+            '//meta[@property="og:image"]/@content'
+        ).get()
+
+        geo_url = 'https:' + response.xpath(
+            '//a/@href[contains(., "geohack")]').get()
+
+        if geo_url:
+            request = scrapy.Request(url=geo_url, callback=self.parse_geo)
+            request.meta['city_id'] = url_ids[response.url]
+            request.meta['image_url'] = wiki_img
+            yield request
+        else:
+            yield {
+                'city_id': url_ids[response.url],
+                'image_url': wiki_img,
+                'latitude': '',
+                'longitude': '',
+            }
+
+    def parse_geo(self, response):
+        latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
+        ).get()
+        longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
+        ).get()
+        yield {
+            'city_id': response.meta['city_id'],
+            'image_url': response.meta['image_url'],
+            'latitude': latitude,
+            'longitude': longitude,
+        }
diff --git a/numbeo/numbeo/spiders/wiki_images.py b/numbeo/numbeo/spiders/wiki_images.py
deleted file mode 100644
index f3c583e..0000000
--- a/numbeo/numbeo/spiders/wiki_images.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import scrapy
-import sqlite3
-
-con = sqlite3.connect('acoli.db')
-cur = con.cursor()
-
-try:
-    cities = cur.execute('''
-        SELECT city_id, wiki_url
-        FROM cities
-        WHERE wiki_url IS NOT NULL
-        '''
-    ).fetchall()
-except sqlite3.OperationalError:
-    cities = []
-
-url_ids = {}
-
-
-class WikiImagesSpider(scrapy.Spider):
-    name = 'wiki_images'
-    allowed_domains = ['wikipedia.org']
-    start_urls = []
-    for city in cities:
-        if city[1]:
-            url = 'https://en.wikipedia.org' + city[1]
-        else:
-            name = cur.execute('''
-                SELECT city_name, region, country_name
-                FROM cities NATURAL JOIN COUNTRIES
-                '''
-            ).fetchone()
-            url = 'https://en.wikipedia.org/wiki/' + '_'.join(' '.join(name).split())
-        url_ids[url] = city[0]
-        start_urls.append(url)
-
-    def parse(self, response):
-        wiki_img = response.xpath(
-            '//meta[@property="og:image"]/@content'
-        ).get()
-
-        yield {
-            'city_id': url_ids[response.url],
-            'image_url': wiki_img,
-        }
diff --git a/numbeo/numbeo/spiders/wiki_urls.py b/numbeo/numbeo/spiders/wiki_urls.py
index d30c404..9d74ea4 100755
--- a/numbeo/numbeo/spiders/wiki_urls.py
+++ b/numbeo/numbeo/spiders/wiki_urls.py
@@ -47,7 +47,6 @@ class wikiUrlSpider(scrapy.Spider):
 
         start_url = response.request.meta.get('redirect_urls')[0]
 
-
         yield {
             'city_id': url_ids[start_url],
             'wiki_url': wiki_url,
--
2.34.0
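A design note on the coordinate import above, as a hedged sketch (an
alternative, not what the patch commits): the UPDATE interpolates the scraped
values into the SQL string with an f-string; the same step written with bound
parameters would look roughly like this, mapping empty strings to NULL first.

    import json
    import sqlite3

    con = sqlite3.connect('acoli.db')
    cur = con.cursor()

    # Sketch: same UPDATE as in json2sqlite.py, but with bound parameters
    with open('wiki_data.json') as f:
        for city in json.load(f):
            lat = city['latitude'] or None   # '' -> NULL
            lon = city['longitude'] or None
            cur.execute('''
                UPDATE cities
                SET latitude = ?, longitude = ?
                WHERE city_id = ?
                ''', (lat, lon, city['city_id']))

    con.commit()
    con.close()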
- get coordinates directly from start_urls; this fixes an issue where data
  was getting mixed up when scraping concurrently
---
 numbeo/numbeo/spiders/wiki_data.py | 62 +++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/numbeo/numbeo/spiders/wiki_data.py b/numbeo/numbeo/spiders/wiki_data.py
index 699bb99..a4d19f7 100644
--- a/numbeo/numbeo/spiders/wiki_data.py
+++ b/numbeo/numbeo/spiders/wiki_data.py
@@ -17,9 +17,20 @@ except sqlite3.OperationalError:
 url_ids = {}
 
 
+# Convert coordinates in degree/minute/second form to decimal
+def toDecimalCoordinate(coordinate):
+    coordinate = coordinate.replace('′', '°').replace('″', '°').split('°')
+    decimal = int(coordinate[0]) + \
+              int(coordinate[1]) / 60 + \
+              int(coordinate[2]) / 3600
+    if coordinate[3] in ['W','S']:
+        decimal = -decimal
+    return decimal
+
+
 class WikiDataSpider(scrapy.Spider):
     name = 'wiki_data'
-    allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
+    allowed_domains = ['wikipedia.org']
     start_urls = []
     for city in cities:
         if city[1]:
@@ -34,35 +45,40 @@ class WikiDataSpider(scrapy.Spider):
         url_ids[url] = city[0]
         start_urls.append(url)
 
+    def toDecimalCoordinate(coordinate):
+        coordinate = re.split('°′″')
+        decimal = coordinate[0] + \
+                  coordinate[1] / 60 + \
+                  coordinate[2] / 3600
+        if coordinate[3] in ['W','S']:
+            decimal = -decimal
+        return decimal
+
     def parse(self, response):
         wiki_img = response.xpath(
             '//meta[@property="og:image"]/@content'
         ).get()
+
+        latitude = toDecimalCoordinate(
+            response.xpath('//span[@class="latitude"]/text()').get()
+        )
 
-        geo_url = 'https:' + response.xpath(
-            '//a/@href[contains(., "geohack")]').get()
+        longitude = toDecimalCoordinate(
+            response.xpath('//span[@class="longitude"]/text()').get()
+        )
 
-        if geo_url:
-            request = scrapy.Request(url=geo_url, callback=self.parse_geo)
-            request.meta['city_id'] = url_ids[response.url]
-            request.meta['image_url'] = wiki_img
-            yield request
-        else:
-            yield {
-                'city_id': url_ids[response.url],
-                'image_url': wiki_img,
-                'latitude': '',
-                'longitude': '',
-            }
-
-    def parse_geo(self, response):
-        latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
-        ).get()
-        longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
-        ).get()
         yield {
-            'city_id': response.meta['city_id'],
-            'image_url': response.meta['image_url'],
+            'city_id': url_ids[response.url],
+            'image_url': wiki_img,
             'latitude': latitude,
             'longitude': longitude,
         }
+
+    def toDecimalCoordinate(coordinate):
+        coordinate = re.split('°′″')
+        decimal = coordinate[0] + \
+                  coordinate[1] / 60 + \
+                  coordinate[2] / 3600
+        if coordinate[3] in ['W','S']:
+            decimal = -decimal
+        return decimal
--
2.34.0
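For reference, a standalone sketch of the degree/minute/second conversion that
the module-level toDecimalCoordinate() above performs, with illustrative
values (it mirrors the committed helper and assumes whole-number seconds plus
a trailing N/S/E/W letter):

    # A DMS string such as 48°51′24″N splits into [degrees, minutes, seconds, hemisphere]
    def to_decimal(coordinate):
        parts = coordinate.replace('′', '°').replace('″', '°').split('°')
        decimal = int(parts[0]) + int(parts[1]) / 60 + int(parts[2]) / 3600
        if parts[3] in ('W', 'S'):
            decimal = -decimal
        return decimal

    print(to_decimal('48°51′24″N'))   # 48.8566...
    print(to_decimal('122°25′00″W'))  # -122.4166...

Coordinates that Wikipedia renders without a seconds field would need a length
check before indexing the seconds part.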
Andrei K. <akhartch@mail.ccsf.edu>
alright! i've applied all 4 commits.