Rebecca Medrano: 1 spiders: add pollution.py 1 files changed, 111 insertions(+), 0 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26672/mbox | git am -3
Learn more about email & git
"""Scrapy spider that scrapes data from the Numbeo pollution page.

Submitted as ``numbeo/numbeo/spiders/pollution.py`` ("spiders: add
pollution.py").  One item is yielded per city listed in the ``cities``
table of the local ``acoli.db`` SQLite database.
"""
import contextlib
import datetime
import sqlite3

import scrapy

# Prefix joined with each ``city_url`` value to build a start URL.
_POLLUTION_URL = 'https://www.numbeo.com/pollution/in/'

# XPath template: the index value cell of the table row whose label cell has
# exactly the given text.
_INDEX_XPATH = '//tr[td[text()="{}"]]//td[@class="indexValueTd"]/text()'

# The pollution page contains two tables of data: Pollution and Cleanliness.
# The indices in these tables are inverses of each other (add up to 100), so
# only the latter is used (where higher indices = better).
#
# Maps the yielded item key to the row label on the page.  Insertion order is
# the order of the fields in the yielded item.
_CLEANLINESS_ROWS = {
    'air_quality': 'Air quality',
    'drinking_water_quality': 'Drinking Water Quality and Accessibility',
    # NOTE(review): "statisfaction" is a typo for "satisfaction".  The key is
    # kept as originally emitted because downstream consumers may already
    # depend on it; rename it together with them.
    'garbage_disposal_statisfaction': 'Garbage Disposal Satisfaction',
    'cleanliness': 'Clean and Tidy',
    # noise/light pollution
    'noise_and_light': 'Quiet and No Problem with Night Lights',
    'water_quality': 'Water Quality',
    'comfort': 'Comfortable to Spend Time in the City',
    'parks_quality': 'Quality of Green and Parks',
}


def _load_cities(db_path='acoli.db'):
    """Return all ``(city_id, city_url)`` rows from the ``cities`` table.

    The connection is always closed before returning.  If the query fails
    with ``sqlite3.OperationalError`` (for example because the table does not
    exist yet) an empty list is returned so the module can still be imported.
    """
    try:
        with contextlib.closing(sqlite3.connect(db_path)) as con:
            return con.execute('''
                SELECT city_id, city_url
                FROM cities
                ''').fetchall()
    except sqlite3.OperationalError:
        return []


cities = _load_cities()

# Start URL -> city_id, used by the spider to label each scraped item.
url_ids = {_POLLUTION_URL + city_url: city_id for city_id, city_url in cities}


def _index_value(response, label):
    """Return the index value text for the row labelled *label*, or None."""
    return response.xpath(_INDEX_XPATH.format(label)).get()


def _lookup_city_id(response):
    """Return the city_id for *response*, or None if it cannot be matched.

    The final URL is tried first.  If the request was redirected, the URL that
    was originally requested (first entry of the ``redirect_urls`` request
    meta key) is tried as well.
    """
    if response.url in url_ids:
        return url_ids[response.url]
    redirect_chain = response.meta.get('redirect_urls') or []
    if redirect_chain:
        return url_ids.get(redirect_chain[0])
    return None


class PollutionSpider(scrapy.Spider):
    """Scrape WHO pollution levels and cleanliness indices for every city."""

    name = 'pollution'
    allowed_domains = ['numbeo.com']
    start_urls = [_POLLUTION_URL + city_url for _, city_url in cities]

    def parse(self, response):
        """Yield one dict of pollution data for the city page *response*.

        Fields that the page does not provide are emitted as ``''`` (WHO
        levels, pages flagged as having too little data, unparsable
        contributor count) or ``None`` (a cleanliness row that is absent).
        Responses that cannot be matched to a known city are logged and
        skipped.
        """
        city_id = _lookup_city_id(response)
        if city_id is None and response.url not in url_ids:
            self.logger.warning('No city_id known for %s', response.url)
            return

        # pm10 and pm2_5, in that order; pad so a short or missing widget
        # yields '' instead of raising IndexError.
        who_pollution_levels = response.xpath(
            '//table[@class="who_pollution_data_widget"]//tr//td[2]/text()'
        ).getall()
        pm10, pm2_5 = (who_pollution_levels + ['', ''])[:2]

        if response.xpath('//div[@class="no-much-data"]').get():
            # The page carries the "no-much-data" marker: emit blanks.
            pollution_exp_scale = ''
            indices = dict.fromkeys(_CLEANLINESS_ROWS, '')
            contributors = ''
        else:
            # pollution exp scale
            # don't fully understand what this is, but might want it later
            pollution_exp_scale = response.xpath(
                '//tr[td[text()="Pollution Exp Scale: "]]//td[2]/text()'
            ).get(default='').strip()

            indices = {
                key: _index_value(response, label)
                for key, label in _CLEANLINESS_ROWS.items()
            }

            # number of contributors: second whitespace-separated word of the
            # "reportees" span text.
            # TODO: add to other spiders?
            reportees_words = response.xpath(
                '//span[@class="reportees"]/text()'
            ).get(default='').split()
            contributors = (
                reportees_words[1] if len(reportees_words) > 1 else ''
            )

        item = {
            'city_id': city_id,
            'pollution_timestamp': datetime.datetime.now(),
            'pm10': pm10,
            'pm2_5': pm2_5,
            'pollution_exp_scale': pollution_exp_scale,
        }
        item.update(indices)
        item['contributors'] = contributors
        yield item
Andrei K. <akhartch@mail.ccsf.edu> applied