~akspecs/numbeo-scraping-dev

json2sqlite.py: add image_urls table v1 APPLIED

Rebecca Medrano: 1
 json2sqlite.py: add image_urls table

 2 files changed, 33 insertions(+), 4 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26671/mbox | git am -3
Learn more about email & git

[PATCH] json2sqlite.py: add image_urls table Export this patch

 - add image_url table to the database to store urls to wikipedia
   images
 - add wiki_url column in cities table

spiders/climate.py: minor correction (class name)
---
 numbeo/json2sqlite.py            | 32 ++++++++++++++++++++++++++++++++
 numbeo/numbeo/spiders/climate.py |  5 +----
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/numbeo/json2sqlite.py b/numbeo/json2sqlite.py
index 27deb32..a76a107 100755
--- a/numbeo/json2sqlite.py
+++ b/numbeo/json2sqlite.py
@@ -44,6 +44,7 @@ cur.execute('''
                region VARCHAR(50),
                country_id INTEGER NOT NULL,
                city_url VARCHAR(200) NOT NULL UNIQUE,
                wiki_url VARCHAR(200),
                FOREIGN KEY (country_id) REFERENCES countries(id)
            )
            ''')
@@ -70,6 +71,17 @@ with open('qoli.json') as f:
                          city['city_url'],)
        )

# Add wikipedia urls to cities table from wiki_urls.json
with open('wiki_urls.json') as f:
    cities = json.load(f)
    for city in cities:
        cur.execute(f'''
                    UPDATE cities
                    SET wiki_url = "{city['wiki_url']}"
                    WHERE city_id = "{city['city_id']}"
                    '''
        )


# Create quality_of_life table
# This table contains the most recent QOL data for each city
@@ -271,6 +283,26 @@ with open('climate.json') as f:
                          city['dec_low'], city['dec_high'],)
        )

# Create image_urls table
cur.execute('''
            CREATE TABLE IF NOT EXISTS image_urls (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            city_id INTEGER NOT NULL UNIQUE,
            image_url VARCHAR(500)
            )
            ''')

# Fill image_urls table from images.json
with open('wiki_images.json') as f:
    cities = json.load(f)
    for city in cities:
        cur.execute('''
                    INSERT OR IGNORE INTO image_urls (
                    city_id, image_url)
                    VALUES (?, ?)
                    ''', (city['city_id'], city['image_url'],)
        )


# uncomment to print data being added to db
# consider implementing a flag that offers verbose output of the following
diff --git a/numbeo/numbeo/spiders/climate.py b/numbeo/numbeo/spiders/climate.py
index da8308f..19b65d6 100755
--- a/numbeo/numbeo/spiders/climate.py
+++ b/numbeo/numbeo/spiders/climate.py
@@ -1,8 +1,5 @@
#!/usr/bin/env python3

# SPIDER # 3 - QUALITY OF LIFE
# Use this spider AFTER numbeo_cities and numbeo_countries

import datetime
import json
import scrapy
@@ -11,7 +8,7 @@ from scrapy.http import FormRequest
with open('qoli.json', 'r') as f:
    city_list = json.load(f)

class QOLSpider(scrapy.Spider):
class ClimateSpider(scrapy.Spider):
    name = 'climate'
    allowed_domains = ['numbeo.com/']
    start_urls = []
-- 
2.33.0