~akspecs/numbeo-scraping-dev

json2sqlite: add qol timestamps, timeseries table v1 APPLIED

Rebecca Medrano: 1
 json2sqlite: add qol timestamps, timeseries table

 1 files changed, 77 insertions(+), 11 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26590/mbox | git am -3
Learn more about email & git

[PATCH] json2sqlite: add qol timestamps, timeseries table Export this patch

- add column in quality_of_life table for timestamps from qoli.json
- add time_series_quality_of_life table to keep timeseries data
- ensure quality_of_life table contains most recent records
---
 numbeo/json2sqlite.py | 88 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 77 insertions(+), 11 deletions(-)

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index d173b47..27deb32 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -15,7 +15,7 @@ con = sqlite3.connect('acoli.db')  # consider parametrizing this
cur = con.cursor()


# Create countries table from countries.json
# Create countries table
cur.execute('''
            CREATE TABLE IF NOT EXISTS countries (
                country_id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -24,18 +24,19 @@ cur.execute('''
            )
            ''')

# Fill countries table with data from countries.json
with open('countries.json') as f:
    countries = json.load(f)
    for country in countries[1:]:
        cur.execute('''
                    INSERT OR IGNORE INTO countries (country_name, country_url)
                    VALUES (?, ?)
                    ''', (country['country_name'], country['country_url'],)
        )
    for country in countries:
        if country['country_name'] != '\n':
            cur.execute('''
                        INSERT OR IGNORE INTO countries (country_name, country_url)
                        VALUES (?, ?)
                        ''', (country['country_name'], country['country_url'],)
            )


# Create cities table from qoli.json
# note: NOT cities.json - qoli has correct urls
# Create cities table
cur.execute('''
            CREATE TABLE IF NOT EXISTS cities (
                city_id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -47,6 +48,8 @@ cur.execute('''
            )
            ''')

# Fill cities table with data from qoli.json
# note: NOT cities.json - qoli has correct urls
with open('qoli.json') as f:
    cities = json.load(f)
    for city in cities:
@@ -68,7 +71,8 @@ with open('qoli.json') as f:
        )


# Create quality_of_life table from qoli.json
# Create quality_of_life table
# This table contains the most recent QOL data for each city
cur.execute('''
            CREATE TABLE IF NOT EXISTS quality_of_life (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -87,6 +91,30 @@ cur.execute('''
            )
            ''')

# Create time_series_quality_of_life table
# This table contains time series QOL data for each city
cur.execute('''
            CREATE TABLE IF NOT EXISTS quality_of_life (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                city_id INTEGER NOT NULL,
                qol_timestamp TIMESTAMP,
                quality_of_life_index DECIMAL(5, 2),
                purchasing_power_index DECIMAL(5, 2),
                safety_index DECIMAL(5, 2),
                health_care_index DECIMAL(5, 2),
                climate_index DECIMAL(5, 2),
                cost_of_living_index DECIMAL(5, 2),
                property_price_to_income_ratio DECIMAL(5, 2),
                traffic_commute_time_index DECIMAL(5, 2),
                pollution_index DECIMAL(5, 2),
                FOREIGN KEY (city_id) REFERENCES cities(id),
                CONSTRAINT unique_record UNIQUE (city_id, qol_timestamp)
            )
            ''')


# Fill quality_of_life and time_series_quality_of_life tables with
# data from qoli.json
with open('qoli.json') as f:
    cities = json.load(f)
    for city in cities:
@@ -95,6 +123,43 @@ with open('qoli.json') as f:
                              FROM cities
                              WHERE city_name = "{city['city']}"
                              ''').fetchone()[0]
        ts = cur.execute(f'''
                         SELECT qol_timestamp
                         FROM quality_of_life
                         WHERE city_id = "{city_id}"
                         ''').fetchone()
        # Check if there is an existing record
        # if so, compare timestamps
        if not ts or ts[0] < city['qol_timestamp']:
            # Insert into quality_of_life table or replace existing record
            cur.execute('''
                        REPLACE INTO quality_of_life (
                            city_id,
                            qol_timestamp,
                            quality_of_life_index,
                            purchasing_power_index,
                            safety_index,
                            health_care_index,
                            climate_index,
                            cost_of_living_index,
                            property_price_to_income_ratio,
                            traffic_commute_time_index,
                            pollution_index
                        )
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        ''', (city_id,
                              city['qol_timestamp'],
                              city['quality_of_life_index'],
                              city['purchasing_power_index'],
                              city['safety_index'],
                              city['health_care_index'],
                              city['climate_index'],
                              city['cost_of_living_index'],
                              city['property_price_to_income_ratio'],
                              city['traffic_commute_time_index'],
                              city['pollution_index'],)
            )
        # Insert into time series table or ignore if record exists
        cur.execute('''
                    INSERT OR IGNORE INTO quality_of_life (
                        city_id,
@@ -124,7 +189,7 @@ with open('qoli.json') as f:
        )


# Create climate table from climate.json
# Create climate table
cur.execute('''
            CREATE TABLE IF NOT EXISTS climate (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -152,6 +217,7 @@ cur.execute('''
            )
            ''')

# Fill climate table with data from climate.json
with open('climate.json') as f:
    cities = json.load(f)
    for city in cities:
-- 
2.33.0
lgtm.  applied.