~akspecs/numbeo-scraping-dev

spiders: add timestamps to climate.py and qol.py v1 APPLIED

Rebecca Medrano: 1
 spiders: add timestamps to climate.py and qol.py

 2 files changed, 21 insertions(+), 15 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26648/mbox | git am -3
Learn more about email & git

[PATCH] spiders: add timestamps to climate.py and qol.py Export this patch

- add timestamps to climate.py and qol.py output
- the output of climate.py now includes rows with null values for cities
  without data
---
 numbeo/numbeo/spiders/climate.py | 28 ++++++++++++++++------------
 numbeo/numbeo/spiders/qol.py     |  8 +++++---
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/numbeo/numbeo/spiders/climate.py b/numbeo/numbeo/spiders/climate.py
index 96af499..da8308f 100755
--- a/numbeo/numbeo/spiders/climate.py
+++ b/numbeo/numbeo/spiders/climate.py
@@ -3,10 +3,10 @@
# SPIDER # 3 - QUALITY OF LIFE
# Use this spider AFTER numbeo_cities and numbeo_countries

import datetime
import json
import scrapy
from scrapy.http import FormRequest
import sqlite3

with open('qoli.json', 'r') as f:
    city_list = json.load(f)
@@ -26,22 +26,26 @@ class QOLSpider(scrapy.Spider):
          header = header.replace(' Climate in ', '').split(',')
          #enough_data = response.xpath('//div[@class="no-much-data"]/p/text()').get()
          #if enough_data != None and 'There are no  data for' in enough_data:
          con = sqlite3.connect('acoli.db')
          cur = con.cursor()
          city_id = cur.execute('SELECT id FROM cities WHERE city_name = "' + header[0] + '"').fetchone()[0]
          city_url = response.url.split('/')[-1]
          # Table 2 - Climate Score By Month
          climate_scores = []
          low_high = []
          for i in range(2,14):
              climate_scores.append(
                      response.xpath('//table[position()=2]/tr[position()=' +
                          str(i) + ']/td/text()').getall()[1])
              low_high.append(
                      response.xpath('//table[position()=3]/tr[position()=' +
                          str(i) + ']/td/text()').getall()[1:])
          try:
              for i in range(2,14):
                  climate_scores.append(
                          response.xpath('//table[position()=2]/tr[position()=' +
                              str(i) + ']/td/text()').getall()[1])
                  low_high.append(
                          response.xpath('//table[position()=3]/tr[position()=' +
                              str(i) + ']/td/text()').getall()[1:])
          except:
              for i in range(2,14):
                  climate_scores.append('')
                  low_high.append(['',''])

          yield {
              'city_id': city_id,
              'city_url': city_url,
              'climate_timestamp': datetime.datetime.now(),
              'jan_score': climate_scores[0].strip(),
              'jan_low': low_high[0][0].strip(),
              'jan_high': low_high[0][1].strip(),
diff --git a/numbeo/numbeo/spiders/qol.py b/numbeo/numbeo/spiders/qol.py
index e3e8d1b..1e72f91 100755
--- a/numbeo/numbeo/spiders/qol.py
+++ b/numbeo/numbeo/spiders/qol.py
@@ -3,6 +3,7 @@
# SPIDER # 3 - QUALITY OF LIFE
# Use this spider AFTER numbeo_cities and numbeo_countries

import datetime
import json
import scrapy
from scrapy.http import FormRequest
@@ -65,6 +66,7 @@ class QOLSpider(scrapy.Spider):
              'region': header[1].strip() if len(header) > 2 else '',
              'country': header[2].strip() if len(header) > 2 else header[1].strip(),
              'city_url': response.url.split('/')[-1],
              'qol_timestamp': datetime.datetime.now(),
              'quality_of_life_index': quality_of_life_index,
              'purchasing_power_index': purchasing_power_index,
              'safety_index': safety_index,
@@ -75,6 +77,6 @@ class QOLSpider(scrapy.Spider):
              'traffic_commute_time_index': traffic_commute_time_index,
              'pollution_index': pollution_index,
          }
        else:
            with open('bad_urls.txt', 'a') as f:
                f.write(response.url)
#        else:
#            with open('bad_urls.txt', 'a') as f:
#                f.write(response.url)
-- 
2.33.0
cool, timestamps!
thanks - i've applied this patch.