~akspecs/numbeo-scraping-dev

spiders: add number of contributors to qol.py v1 APPLIED

Rebecca Medrano: 4
 spiders: add number of contributors to qol.py
 json2sqlite.py: update table columns
 spiders: scrape location data from wikipedia
 spiders: update wiki_data location scraping

 10 files changed, 173 insertions(+), 108 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26764/mbox | git am -3
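
If you would rather inspect the patchset before applying it, the mbox is
plain text; a short Python sketch (standard library only, same URL as the
snippet above):

    import mailbox
    import tempfile
    import urllib.request

    url = 'https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26764/mbox'
    with tempfile.NamedTemporaryFile(suffix='.mbox', delete=False) as tmp:
        tmp.write(urllib.request.urlopen(url).read())
    # one message per commit; subjects carry the [PATCH n/4] numbering
    for message in mailbox.mbox(tmp.name):
        print(message['Subject'])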

[PATCH 1/4] spiders: add number of contributors to qol.py

 - add minimum and maximum number of contributors from the Numbeo
   Quality of Life page to the qol spider
 - update json2sqlite.py to replace '?' with NULL when inserting
   into the quality_of_life table
---
 numbeo/json2sqlite.py        |  3 +++
 numbeo/numbeo/spiders/qol.py | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index a76a107..09130d1 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -130,6 +130,9 @@ cur.execute('''
 with open('qoli.json') as f:
     cities = json.load(f)
     for city in cities:
+        for key in city.keys():
+            if city[key] == '' or city[key] == '?':
+                city[key] = None
         city_id = cur.execute(f'''
                               SELECT city_id
                               FROM cities
diff --git a/numbeo/numbeo/spiders/qol.py b/numbeo/numbeo/spiders/qol.py
index 1e72f91..24b161f 100755
--- a/numbeo/numbeo/spiders/qol.py
+++ b/numbeo/numbeo/spiders/qol.py
@@ -41,6 +41,8 @@ class QOLSpider(scrapy.Spider):
               property_price_to_income_ratio = '?'
               traffic_commute_time_index = '?'
               pollution_index = '?'
+              max_contributors = ''
+              min_contributors = ''
           else:
             quality_of_life_index = response.xpath(
                 '//div/table/tr[contains(., "Quality of Life Index")]/td//text()').getall()[2].strip()
@@ -61,6 +63,13 @@ class QOLSpider(scrapy.Spider):
             pollution_index = response.xpath(
                 '//div/table/tr[contains(., "Pollution Index")]/td//text()').getall()[2]
 
+            max_contributors = response.xpath(
+                '//span[@class="reportees"]/text()[contains(., "Maximum")]') \
+                .get().split(':')[-1].strip()
+            min_contributors = response.xpath(
+                '//span[@class="reportees"]/text()[contains(., "Minimum")]') \
+                .get().split(':')[-1].strip()
+
           yield {
               'city': header[0].strip(),
               'region': header[1].strip() if len(header) > 2 else '',
@@ -76,6 +85,8 @@ class QOLSpider(scrapy.Spider):
               'property_price_to_income_ratio': property_price_to_income_ratio,
               'traffic_commute_time_index': traffic_commute_time_index,
               'pollution_index': pollution_index,
+              'max_contributors': max_contributors,
+              'min_contributors': min_contributors,
           }
 #        else:
 #            with open('bad_urls.txt', 'a') as f:
-- 
2.34.0
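
A quick aside on the hunks above: the '' / '?' to NULL rewrite is a small
normalization step that could live in one helper. The following is only a
sketch; the function name is illustrative and appears nowhere in the patch:

    def normalize_missing(record, markers=('', '?')):
        """Map Numbeo's missing-value markers to None (NULL in SQLite)."""
        return {key: (None if value in markers else value)
                for key, value in record.items()}

    row = {'city': 'Oslo', 'pollution_index': '?', 'max_contributors': ''}
    print(normalize_missing(row))
    # {'city': 'Oslo', 'pollution_index': None, 'max_contributors': None}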

[PATCH 2/4] json2sqlite.py: update table columns

 - add contributor counts to the quality_of_life and time series tables

 - give timestamp columns unique names to facilitate natural joins

 - spiders: qol.py and climate.py now import datetime.datetime
   instead of the entire datetime module
---
 numbeo/json2sqlite.py            | 66 ++++++++++++++++----------------
 numbeo/numbeo/spiders/climate.py |  5 +--
 numbeo/numbeo/spiders/qol.py     |  5 +--
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index 09130d1..d2e80a0 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -45,7 +45,8 @@ cur.execute('''
                 country_id INTEGER NOT NULL,
                 city_url VARCHAR(200) NOT NULL UNIQUE,
                 wiki_url VARCHAR(200),
-                FOREIGN KEY (country_id) REFERENCES countries(id)
+                FOREIGN KEY (country_id) REFERENCES countries(id),
+                CONSTRAINT unique_name UNIQUE (city_name, region, country_id)
             )
             ''')

@@ -87,7 +88,7 @@ with open('wiki_urls.json') as f:
 # This table contains the most recent QOL data for each city
 cur.execute('''
             CREATE TABLE IF NOT EXISTS quality_of_life (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                qol_id INTEGER PRIMARY KEY AUTOINCREMENT,
                 city_id INTEGER NOT NULL UNIQUE,
                 qol_timestamp TIMESTAMP,
                 quality_of_life_index DECIMAL(5, 2),
@@ -99,15 +100,17 @@ cur.execute('''
                 property_price_to_income_ratio DECIMAL(5, 2),
                 traffic_commute_time_index DECIMAL(5, 2),
                 pollution_index DECIMAL(5, 2),
-                FOREIGN KEY (city_id) REFERENCES cities(id)
+                min_contributors INTEGER,
+                max_contributors INTEGER,
+                FOREIGN KEY (city_id) REFERENCES cities(city_id)
             )
             ''')
 
 # Create time_series_quality_of_life table
 # This table contains time series QOL data for each city
 cur.execute('''
-            CREATE TABLE IF NOT EXISTS quality_of_life (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
+            CREATE TABLE IF NOT EXISTS time_series_quality_of_life (
+                qol_ts_id INTEGER PRIMARY KEY AUTOINCREMENT,
                 city_id INTEGER NOT NULL,
                 qol_timestamp TIMESTAMP,
                 quality_of_life_index DECIMAL(5, 2),
@@ -118,8 +121,10 @@ cur.execute('''
                 cost_of_living_index DECIMAL(5, 2),
                 property_price_to_income_ratio DECIMAL(5, 2),
                 traffic_commute_time_index DECIMAL(5, 2),
+                min_contributors INTEGER,
+                max_contributors INTEGER,
                 pollution_index DECIMAL(5, 2),
-                FOREIGN KEY (city_id) REFERENCES cities(id),
+                FOREIGN KEY (city_id) REFERENCES cities(city_id),
                 CONSTRAINT unique_record UNIQUE (city_id, qol_timestamp)
             )
             ''')
@@ -145,7 +150,7 @@ with open('qoli.json') as f:
                          ''').fetchone()
         # Check if there is an existing record
         # if so, compare timestamps
-        if not ts or ts[0] < city['qol_timestamp']:
+        if not ts or ts[0] < city['timestamp']:
             # Insert into quality_of_life table or replace existing record
             cur.execute('''
                         REPLACE INTO quality_of_life (
@@ -159,11 +164,13 @@ with open('qoli.json') as f:
                             cost_of_living_index,
                             property_price_to_income_ratio,
                             traffic_commute_time_index,
-                            pollution_index
+                            pollution_index,
+                            min_contributors,
+                            max_contributors
                         )
-                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                         ''', (city_id,
-                              city['qol_timestamp'],
+                              city['timestamp'],
                               city['quality_of_life_index'],
                               city['purchasing_power_index'],
                               city['safety_index'],
@@ -172,11 +179,13 @@ with open('qoli.json') as f:
                               city['cost_of_living_index'],
                               city['property_price_to_income_ratio'],
                               city['traffic_commute_time_index'],
-                              city['pollution_index'],)
+                              city['pollution_index'],
+                              city['min_contributors'],
+                              city['max_contributors'],)
             )
         # Insert into time series table or ignore if record exists
         cur.execute('''
-                    INSERT OR IGNORE INTO quality_of_life (
+                    INSERT OR IGNORE INTO time_series_quality_of_life (
                         city_id,
                         qol_timestamp,
                         quality_of_life_index,
@@ -187,11 +196,13 @@ with open('qoli.json') as f:
                         cost_of_living_index,
                         property_price_to_income_ratio,
                         traffic_commute_time_index,
-                        pollution_index
+                        pollution_index,
+                        min_contributors,
+                        max_contributors
                     )
-                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                     ''', (city_id,
-                          city['qol_timestamp'],
+                          city['timestamp'],
                           city['quality_of_life_index'],
                           city['purchasing_power_index'],
                           city['safety_index'],
@@ -200,14 +211,16 @@ with open('qoli.json') as f:
                           city['cost_of_living_index'],
                           city['property_price_to_income_ratio'],
                           city['traffic_commute_time_index'],
-                          city['pollution_index'],)
+                          city['pollution_index'],
+                          city['min_contributors'],
+                          city['max_contributors'],)
         )
 
 
 # Create climate table
 cur.execute('''
             CREATE TABLE IF NOT EXISTS climate (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                climate_id INTEGER PRIMARY KEY AUTOINCREMENT,
                 city_id INTEGER NOT NULL UNIQUE,
                 climate_timestamp TIMESTAMP,
                 jan_score DECIMAL(5, 2), feb_score DECIMAL(5, 2),
@@ -228,7 +241,7 @@ cur.execute('''
                 oct_low VARCHAR(10), oct_high VARCHAR(10),
                 nov_low VARCHAR(10), nov_high VARCHAR(10),
                 dec_low VARCHAR(10), dec_high VARCHAR(10),
-                FOREIGN KEY (city_id) REFERENCES cities(id)
+                FOREIGN KEY (city_id) REFERENCES cities(city_id)
             )
             ''')

@@ -265,7 +278,7 @@ with open('climate.json') as f:
                             ?, ?, ?, ?, ?, ?, ?, ?, ?,
                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-                    ''', (city_id, city['climate_timestamp'],
+                    ''', (city_id, city['timestamp'],
                           city['jan_score'], city['feb_score'],
                           city['mar_score'], city['apr_score'],
                           city['may_score'], city['jun_score'],
@@ -291,7 +304,8 @@ cur.execute('''
             CREATE TABLE IF NOT EXISTS image_urls (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             city_id INTEGER NOT NULL UNIQUE,
-            image_url VARCHAR(500)
+            image_url VARCHAR(500),
+            FOREIGN KEY (city_id) REFERENCES cities(city_id)
             )
             ''')

@@ -307,17 +321,5 @@ with open('wiki_images.json') as f:
         )
 
 
-# uncomment to print data being added to db
-# consider implementing a flag that offers verbose output of the following
-#for row in cur.execute('''
-#                       SELECT *
-#                       FROM quality_of_life
-#                           INNER JOIN cities ON city_id = cities.id
-#                               INNER JOIN countries
-#                               ON country_id = countries.id
-#                       WHERE cities.region = 'CA'
-#                       '''):
-#    print(row)
-
 con.commit()
 con.close()
diff --git a/numbeo/numbeo/spiders/climate.py b/numbeo/numbeo/spiders/climate.py
index 19b65d6..ada59bb 100755
--- a/numbeo/numbeo/spiders/climate.py
+++ b/numbeo/numbeo/spiders/climate.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3
 
-import datetime
+from datetime import datetime
 import json
 import scrapy
-from scrapy.http import FormRequest
 
 with open('qoli.json', 'r') as f:
     city_list = json.load(f)
@@ -42,7 +41,7 @@ class ClimateSpider(scrapy.Spider):

           yield {
               'city_url': city_url,
-              'climate_timestamp': datetime.datetime.now(),
+              'timestamp': datetime.now(),
               'jan_score': climate_scores[0].strip(),
               'jan_low': low_high[0][0].strip(),
               'jan_high': low_high[0][1].strip(),
diff --git a/numbeo/numbeo/spiders/qol.py b/numbeo/numbeo/spiders/qol.py
index 24b161f..0f44586 100755
--- a/numbeo/numbeo/spiders/qol.py
+++ b/numbeo/numbeo/spiders/qol.py
@@ -3,10 +3,9 @@
 # SPIDER # 3 - QUALITY OF LIFE
 # Use this spider AFTER numbeo_cities and numbeo_countries
 
-import datetime
+from datetime import datetime
 import json
 import scrapy
-from scrapy.http import FormRequest
 
 with open('cities.json', 'r') as f:
     city_list = json.load(f)
@@ -75,7 +74,7 @@ class QOLSpider(scrapy.Spider):
               'region': header[1].strip() if len(header) > 2 else '',
               'country': header[2].strip() if len(header) > 2 else header[1].strip(),
               'city_url': response.url.split('/')[-1],
-              'qol_timestamp': datetime.datetime.now(),
+              'timestamp': datetime.now(),
               'quality_of_life_index': quality_of_life_index,
               'purchasing_power_index': purchasing_power_index,
               'safety_index': safety_index,
-- 
2.34.0
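
A note on the renames above (id becoming qol_id, climate_id, qol_ts_id):
NATURAL JOIN matches on every column name two tables share, so tables that
all carry a plain id (or a same-named timestamp) would be joined on it and
silently drop rows. A minimal sketch against an in-memory database, with
the schemas abbreviated:

    import sqlite3

    con = sqlite3.connect(':memory:')
    cur = con.cursor()
    cur.execute('CREATE TABLE quality_of_life (city_id INTEGER, qol_timestamp TIMESTAMP)')
    cur.execute('CREATE TABLE climate (city_id INTEGER, climate_timestamp TIMESTAMP)')
    cur.execute("INSERT INTO quality_of_life VALUES (1, '2021-11-20 10:00:00')")
    cur.execute("INSERT INTO climate VALUES (1, '2021-11-21 09:30:00')")
    # city_id is the only shared column name, so the join keys on it alone
    for row in cur.execute('SELECT * FROM quality_of_life NATURAL JOIN climate'):
        print(row)   # (1, '2021-11-20 10:00:00', '2021-11-21 09:30:00')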

[PATCH 3/4] spiders: scrape location data from wikipedia

 - rename wiki_images.py to wiki_data.py and scrape location
   coordinates

 - update json2sqlite.py to include latitude and longitude columns
   in the 'cities' table
---
 numbeo/json2sqlite.py                | 15 +++++-
 numbeo/numbeo/spiders/wiki_data.py   | 68 ++++++++++++++++++++++++++++
 numbeo/numbeo/spiders/wiki_images.py | 45 ------------------
 numbeo/numbeo/spiders/wiki_urls.py   |  1 -
 4 files changed, 82 insertions(+), 47 deletions(-)
 create mode 100644 numbeo/numbeo/spiders/wiki_data.py
 delete mode 100644 numbeo/numbeo/spiders/wiki_images.py

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index d2e80a0..bd79961 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -43,6 +43,8 @@ cur.execute('''
                 city_name VARCHAR(50) NOT NULL,
                 region VARCHAR(50),
                 country_id INTEGER NOT NULL,
+                latitude DECIMAL(3, 6),
+                longitude DECIMAL(3, 6),
                 city_url VARCHAR(200) NOT NULL UNIQUE,
                 wiki_url VARCHAR(200),
                 FOREIGN KEY (country_id) REFERENCES countries(id),
@@ -82,6 +84,17 @@ with open('wiki_urls.json') as f:
                     WHERE city_id = "{city['city_id']}"
                     '''
         )
+# Add coordinates to cities table from wiki_urls.json
+with open('wiki_data.json') as f:
+    cities = json.load(f)
+    for city in cities:
+        cur.execute(f'''
+                    UPDATE cities
+                    SET latitude = {city['latitude']},
+                    longitude = {city['longitude']}
+                    WHERE city_id = "{city['city_id']}"
+                    '''
+        )
 
 
 # Create quality_of_life table
@@ -310,7 +323,7 @@ cur.execute('''
             ''')
 
 # Fill image_urls table from images.json
-with open('wiki_images.json') as f:
+with open('wiki_data.json') as f:
     cities = json.load(f)
     for city in cities:
         cur.execute('''
diff --git a/numbeo/numbeo/spiders/wiki_data.py b/numbeo/numbeo/spiders/wiki_data.py
new file mode 100644
index 0000000..699bb99
--- /dev/null
+++ b/numbeo/numbeo/spiders/wiki_data.py
@@ -0,0 +1,68 @@
+import scrapy
+import sqlite3
+
+con = sqlite3.connect('acoli.db')
+cur = con.cursor()
+
+try:
+    cities = cur.execute('''
+                         SELECT city_id, wiki_url
+                         FROM cities
+                         WHERE wiki_url IS NOT NULL
+                         '''
+    ).fetchall()
+except sqlite3.OperationalError:
+    cities = []
+
+url_ids = {}
+
+
+class WikiDataSpider(scrapy.Spider):
+    name = 'wiki_data'
+    allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
+    start_urls = []
+    for city in cities:
+        if city[1]:
+            url = 'https://en.wikipedia.org' + city[1]
+        else:
+            name = cur.execute('''
+                               SELECT city_name, region, country_name
+                               FROM cities NATURAL JOIN COUNTRIES
+                               '''
+            ).fetchone()
+            url = 'https://en.wikipedia.org/wiki/' + '_'.join(' '.join(name).split())
+        url_ids[url] = city[0]
+        start_urls.append(url)
+
+    def parse(self, response):
+        wiki_img = response.xpath(
+            '//meta[@property="og:image"]/@content'
+        ).get()
+
+        geo_url = 'https:' + response.xpath(
+                '//a/@href[contains(., "geohack")]').get()
+
+        if geo_url:
+            request = scrapy.Request(url=geo_url, callback=self.parse_geo)
+            request.meta['city_id'] = url_ids[response.url]
+            request.meta['image_url'] = wiki_img
+            yield request
+        else:
+            yield {
+                'city_id': url_ids[response.url],
+                'image_url': wiki_img,
+                'latitude': '',
+                'longitude': '',
+            }
+
+    def parse_geo(self, response):
+        latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
+                   ).get()
+        longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
+                    ).get()
+        yield {
+            'city_id': response.meta['city_id'],
+            'image_url': response.meta['image_url'],
+            'latitude': latitude,
+            'longitude': longitude,
+        }
diff --git a/numbeo/numbeo/spiders/wiki_images.py b/numbeo/numbeo/spiders/wiki_images.py
deleted file mode 100644
index f3c583e..0000000
--- a/numbeo/numbeo/spiders/wiki_images.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import scrapy
-import sqlite3
-
-con = sqlite3.connect('acoli.db')
-cur = con.cursor()
-
-try:
-    cities = cur.execute('''
-                         SELECT city_id, wiki_url
-                         FROM cities
-                         WHERE wiki_url IS NOT NULL
-                         '''
-    ).fetchall()
-except sqlite3.OperationalError:
-    cities = []
-
-url_ids = {}
-
-
-class WikiImagesSpider(scrapy.Spider):
-    name = 'wiki_images'
-    allowed_domains = ['wikipedia.org']
-    start_urls = []
-    for city in cities:
-        if city[1]:
-            url = 'https://en.wikipedia.org' + city[1]
-        else:
-            name = cur.execute('''
-                               SELECT city_name, region, country_name
-                               FROM cities NATURAL JOIN COUNTRIES
-                               '''
-            ).fetchone()
-            url = 'https://en.wikipedia.org/wiki/' + '_'.join(' '.join(name).split())
-        url_ids[url] = city[0]
-        start_urls.append(url)
-
-    def parse(self, response):
-        wiki_img = response.xpath(
-            '//meta[@property="og:image"]/@content'
-        ).get()
-
-        yield {
-            'city_id': url_ids[response.url],
-            'image_url': wiki_img,
-        }
diff --git a/numbeo/numbeo/spiders/wiki_urls.py b/numbeo/numbeo/spiders/wiki_urls.py
index d30c404..9d74ea4 100755
--- a/numbeo/numbeo/spiders/wiki_urls.py
+++ b/numbeo/numbeo/spiders/wiki_urls.py
@@ -47,7 +47,6 @@ class wikiUrlSpider(scrapy.Spider):

         start_url = response.request.meta.get('redirect_urls')[0]
 
-
         yield {
             'city_id': url_ids[start_url],
             'wiki_url': wiki_url,
-- 
2.34.0
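
Two review notes on this one: the added comment says wiki_urls.json where
the code actually reads wiki_data.json, and the UPDATE interpolates the
coordinates straight into the SQL, so an empty latitude from the
no-geohack fallback renders as "SET latitude = ," and raises
sqlite3.OperationalError. A parameterized sketch (same wiki_data.json
shape assumed, not part of the applied patch):

    import json
    import sqlite3

    con = sqlite3.connect('acoli.db')
    cur = con.cursor()
    with open('wiki_data.json') as f:
        for city in json.load(f):
            # '' from the spider becomes NULL instead of malformed SQL
            cur.execute('UPDATE cities SET latitude = ?, longitude = ? WHERE city_id = ?',
                        (city['latitude'] or None,
                         city['longitude'] or None,
                         city['city_id']))
    con.commit()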

[PATCH 4/4] spiders: update wiki_data location scraping

 - get coordinates directly from the pages in start_urls; this fixes
   an issue where data from different cities got mixed up when
   scraping concurrently
---
 numbeo/numbeo/spiders/wiki_data.py | 62 +++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/numbeo/numbeo/spiders/wiki_data.py b/numbeo/numbeo/spiders/wiki_data.py
index 699bb99..a4d19f7 100644
--- a/numbeo/numbeo/spiders/wiki_data.py
+++ b/numbeo/numbeo/spiders/wiki_data.py
@@ -17,9 +17,20 @@ except sqlite3.OperationalError:
 url_ids = {}
 
 
+# Convert coordinates in degree/minute/second form to decimal
+def toDecimalCoordinate(coordinate):
+    coordinate = coordinate.replace('′', '°').replace('″', '°').split('°')
+    decimal = int(coordinate[0]) + \
+              int(coordinate[1]) / 60 + \
+              int(coordinate[2]) / 3600
+    if coordinate[3] in ['W','S']:
+        decimal = -decimal
+    return decimal
+
+
 class WikiDataSpider(scrapy.Spider):
     name = 'wiki_data'
-    allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
+    allowed_domains = ['wikipedia.org']
     start_urls = []
     for city in cities:
         if city[1]:
@@ -34,35 +45,40 @@ class WikiDataSpider(scrapy.Spider):
         url_ids[url] = city[0]
         start_urls.append(url)
 
+    def toDecimalCoordinate(coordinate):
+        coordinate = re.split('°′″')
+        decimal = coordinate[0] + \
+                  coordinate[1] / 60 + \
+                  coordinate[2] / 3600
+        if coordinate[3] in ['W','S']:
+            decimal = -decimal
+        return decimal
+
     def parse(self, response):
         wiki_img = response.xpath(
             '//meta[@property="og:image"]/@content'
         ).get()
+
+        latitude = toDecimalCoordinate(
+                       response.xpath('//span[@class="latitude"]/text()').get()
+                   )
 
-        geo_url = 'https:' + response.xpath(
-                '//a/@href[contains(., "geohack")]').get()
+        longitude = toDecimalCoordinate(
+                        response.xpath('//span[@class="longitude"]/text()').get()
+                    )
 
-        if geo_url:
-            request = scrapy.Request(url=geo_url, callback=self.parse_geo)
-            request.meta['city_id'] = url_ids[response.url]
-            request.meta['image_url'] = wiki_img
-            yield request
-        else:
-            yield {
-                'city_id': url_ids[response.url],
-                'image_url': wiki_img,
-                'latitude': '',
-                'longitude': '',
-            }
-
-    def parse_geo(self, response):
-        latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
-                   ).get()
-        longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
-                    ).get()
         yield {
-            'city_id': response.meta['city_id'],
-            'image_url': response.meta['image_url'],
+            'city_id': url_ids[response.url],
+            'image_url': wiki_img,
             'latitude': latitude,
             'longitude': longitude,
         }
+
+    def toDecimalCoordinate(coordinate):
+        coordinate = re.split('°′″')
+        decimal = coordinate[0] + \
+                  coordinate[1] / 60 + \
+                  coordinate[2] / 3600
+        if coordinate[3] in ['W','S']:
+            decimal = -decimal
+        return decimal
-- 
2.34.0
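
One more review note: the two class-level toDecimalCoordinate definitions
remaining in this diff look like leftovers. Calls inside parse() resolve to
the new module-level function, so the methods are dead code, and they would
fail if ever called (re is never imported, and re.split is given a pattern
but no string). The module-level version also assumes degrees, minutes and
seconds are all present, while Wikipedia coordinates sometimes omit seconds
(e.g. "10°45′E"), which would crash the int() conversion. A more defensive
sketch of the same conversion:

    def to_decimal(coordinate):
        # split "59°54′50″N" (or "10°45′E") into numeric parts + hemisphere
        parts = coordinate.replace('′', '°').replace('″', '°').split('°')
        hemisphere = parts[-1]
        d, m, s = (list(map(float, parts[:-1])) + [0.0, 0.0])[:3]
        return (-1 if hemisphere in ('W', 'S') else 1) * (d + m / 60 + s / 3600)

    print(to_decimal('59°54′50″N'))   # 59.9138...
    print(to_decimal('10°45′E'))      # 10.75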
alright!  i've applied all 4 commits.