~akspecs/numbeo-scraping-dev

spiders: add pollution.py v1 APPLIED

Rebecca Medrano: 1
 spiders: add pollution.py

 1 file changed, 111 insertions(+), 0 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26672/mbox | git am -3
Learn more about email & git

[PATCH] spiders: add pollution.py Export this patch

 - scrapes data from numbeo pollution page
---
 numbeo/numbeo/spiders/pollution.py | 111 +++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 numbeo/numbeo/spiders/pollution.py

diff --git a/numbeo/numbeo/spiders/pollution.py b/numbeo/numbeo/spiders/pollution.py
new file mode 100644
index 0000000..b2c7b13
--- /dev/null
+++ b/numbeo/numbeo/spiders/pollution.py
@@ -0,0 +1,111 @@
import datetime
import scrapy
import sqlite3

# Load the (city_id, city_url) pairs that drive the crawl.  This runs at
# import time; if the query fails (sqlite3.OperationalError, e.g. the
# `cities` table does not exist yet) the spider simply gets no start URLs.
cities = []
con = None
try:
    con = sqlite3.connect('acoli.db')
    cities = con.execute('''
                         SELECT city_id, city_url
                         FROM cities
                         ''').fetchall()
except sqlite3.OperationalError:
    cities = []
finally:
    # The rows are fully fetched above, so the connection is no longer
    # needed; close it instead of leaking the handle for the whole crawl.
    if con is not None:
        con.close()

# Maps each start URL to the city_id it was built from; filled in by the
# spider class body and read back in its parse() callback.
url_ids = {}

class PollutionSpider(scrapy.Spider):
    """Scrape the numbeo.com pollution page of every known city.

    One start URL is generated per row of the module-level ``cities`` list,
    and ``parse`` yields a single item per response.

    The pollution page contains two tables of data, "Pollution" and
    "Cleanliness".  The indices in these tables are inverses of each other
    (they add up to 100), so only the latter is scraped (higher = better).
    """

    name = 'pollution'
    allowed_domains = ['numbeo.com']
    start_urls = []
    # Built in the class body so that url_ids and start_urls stay in step.
    for city in cities:
        url = 'https://www.numbeo.com/pollution/in/' + city[1]
        url_ids[url] = city[0]
        start_urls.append(url)

    # (item key, row label on the page) for each scraped index, in the
    # order the keys appear in the yielded item.
    # NOTE(review): 'garbage_disposal_statisfaction' is misspelled, but it is
    # the key this spider has always emitted; it is kept as-is because
    # consumers outside this file may rely on it -- confirm before renaming.
    _INDEX_FIELDS = (
        ('air_quality', 'Air quality'),
        ('drinking_water_quality',
         'Drinking Water Quality and Accessibility'),
        ('garbage_disposal_statisfaction', 'Garbage Disposal Satisfaction'),
        ('cleanliness', 'Clean and Tidy'),
        ('noise_and_light', 'Quiet and No Problem with Night Lights'),
        ('water_quality', 'Water Quality'),
        ('comfort', 'Comfortable to Spend Time in the City'),
        ('parks_quality', 'Quality of Green and Parks'),
    )

    @staticmethod
    def _index_value(response, label):
        """Return the index-value text of the row labelled *label*, or None."""
        return response.xpath(
            '//tr[td[text()="{}"]]//td[@class="indexValueTd"]/text()'
            .format(label)
        ).get()

    @staticmethod
    def _city_id(response):
        """Return the city_id registered for the URL this response came from.

        Raises:
            KeyError: if neither the response URL nor any URL it was
                redirected from is a known start URL.
        """
        if response.url in url_ids:
            return url_ids[response.url]
        # After a redirect response.url is the final URL, which was never
        # registered; the redirect middleware records the URLs the request
        # went through under the 'redirect_urls' meta key.
        for previous_url in response.meta.get('redirect_urls', []):
            if previous_url in url_ids:
                return url_ids[previous_url]
        raise KeyError(response.url)

    def parse(self, response):
        """Yield one item of pollution data for the city page in *response*.

        Values are the raw text found on the page.  Fields are '' when the
        page carries the "no-much-data" notice or when the WHO values, the
        exp-scale cell or the contributor count cannot be found; an index
        row that is missing from an otherwise populated page yields None.
        """
        # WHO widget: the second cell of each row, pm10 first, then pm2.5.
        who_pollution_levels = response.xpath(
            '//table[@class="who_pollution_data_widget"]//tr//td[2]/text()'
        ).getall()
        # Pad so that a missing or partially filled table yields '' rather
        # than an IndexError.
        pm10, pm2_5 = (who_pollution_levels + ['', ''])[:2]

        if response.xpath('//div[@class="no-much-data"]').get():
            pollution_exp_scale = contributors = ''
            indices = {key: '' for key, _ in self._INDEX_FIELDS}
        else:
            # pollution exp scale
            # don't fully understand what this is, but might want it later
            pollution_exp_scale = (response.xpath(
                '//tr[td[text()="Pollution Exp Scale: "]]//td[2]/text()'
            ).get() or '').strip()

            indices = {
                key: self._index_value(response, label)
                for key, label in self._INDEX_FIELDS
            }

            # number of contributors - add to other spiders?
            # The count is taken as the second whitespace-separated token
            # of the "reportees" span.
            reportees = (response.xpath(
                '//span[@class="reportees"]/text()').get() or '').split()
            contributors = reportees[1] if len(reportees) > 1 else ''

        item = {
            'city_id': self._city_id(response),
            'pollution_timestamp': datetime.datetime.now(),
            'pm10': pm10,
            'pm2_5': pm2_5,
            'pollution_exp_scale': pollution_exp_scale,
        }
        item.update(indices)
        item['contributors'] = contributors
        yield item
-- 
2.33.0
applied