Rebecca Medrano: 1 spiders: add timestamps to climate.py and qol.py 2 files changed, 21 insertions(+), 15 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26648/mbox | git am -3
Learn more about email & git
- add timestamps to climate.py and qol.py output - the output of climate.py now includes rows with null values for cities without data --- numbeo/numbeo/spiders/climate.py | 28 ++++++++++++++++------------ numbeo/numbeo/spiders/qol.py | 8 +++++--- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/numbeo/numbeo/spiders/climate.py b/numbeo/numbeo/spiders/climate.py index 96af499..da8308f 100755 --- a/numbeo/numbeo/spiders/climate.py +++ b/numbeo/numbeo/spiders/climate.py @@ -3,10 +3,10 @@ # SPIDER # 3 - QUALITY OF LIFE # Use this spider AFTER numbeo_cities and numbeo_countries +import datetime import json import scrapy from scrapy.http import FormRequest -import sqlite3 with open('qoli.json', 'r') as f: city_list = json.load(f) @@ -26,22 +26,26 @@ class QOLSpider(scrapy.Spider): header = header.replace(' Climate in ', '').split(',') #enough_data = response.xpath('//div[@class="no-much-data"]/p/text()').get() #if enough_data != None and 'There are no data for' in enough_data: - con = sqlite3.connect('acoli.db') - cur = con.cursor() - city_id = cur.execute('SELECT id FROM cities WHERE city_name = "' + header[0] + '"').fetchone()[0] + city_url = response.url.split('/')[-1] # Table 2 - Climate Score By Month climate_scores = [] low_high = [] - for i in range(2,14): - climate_scores.append( - response.xpath('//table[position()=2]/tr[position()=' + - str(i) + ']/td/text()').getall()[1]) - low_high.append( - response.xpath('//table[position()=3]/tr[position()=' + - str(i) + ']/td/text()').getall()[1:]) + try: + for i in range(2,14): + climate_scores.append( + response.xpath('//table[position()=2]/tr[position()=' + + str(i) + ']/td/text()').getall()[1]) + low_high.append( + response.xpath('//table[position()=3]/tr[position()=' + + str(i) + ']/td/text()').getall()[1:]) + except: + for i in range(2,14): + climate_scores.append('') + low_high.append(['','']) yield { - 'city_id': city_id, + 'city_url': city_url, + 'climate_timestamp': datetime.datetime.now(), 
'jan_score': climate_scores[0].strip(), 'jan_low': low_high[0][0].strip(), 'jan_high': low_high[0][1].strip(), diff --git a/numbeo/numbeo/spiders/qol.py b/numbeo/numbeo/spiders/qol.py index e3e8d1b..1e72f91 100755 --- a/numbeo/numbeo/spiders/qol.py +++ b/numbeo/numbeo/spiders/qol.py @@ -3,6 +3,7 @@ # SPIDER # 3 - QUALITY OF LIFE # Use this spider AFTER numbeo_cities and numbeo_countries +import datetime import json import scrapy from scrapy.http import FormRequest @@ -65,6 +66,7 @@ class QOLSpider(scrapy.Spider): 'region': header[1].strip() if len(header) > 2 else '', 'country': header[2].strip() if len(header) > 2 else header[1].strip(), 'city_url': response.url.split('/')[-1], + 'qol_timestamp': datetime.datetime.now(), 'quality_of_life_index': quality_of_life_index, 'purchasing_power_index': purchasing_power_index, 'safety_index': safety_index, @@ -75,6 +77,6 @@ class QOLSpider(scrapy.Spider): 'traffic_commute_time_index': traffic_commute_time_index, 'pollution_index': pollution_index, } - else: - with open('bad_urls.txt', 'a') as f: - f.write(response.url) +# else: +# with open('bad_urls.txt', 'a') as f: +# f.write(response.url) -- 2.33.0
Andrei K. <akhartch@mail.ccsf.edu>
cool, timestamps! thanks - i've applied this patch.