~akspecs/numbeo-scraping-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
1

[PATCH] spiders: add cost_of_living.py

Rebecca Medrano <rmedran4@mail.ccsf.edu>
Details
Message ID
<20211120045545.110522-1-rmedran4@mail.ccsf.edu>
DKIM signature
missing
Download raw message
Patch: +343 -0
 - scrapes data from the numbeo cost_of_living page
---
 numbeo/numbeo/spiders/cost_of_living.py | 343 ++++++++++++++++++++++++
 1 file changed, 343 insertions(+)
 create mode 100644 numbeo/numbeo/spiders/cost_of_living.py

diff --git a/numbeo/numbeo/spiders/cost_of_living.py b/numbeo/numbeo/spiders/cost_of_living.py
new file mode 100644
index 0000000..43fb20f
--- /dev/null
+++ b/numbeo/numbeo/spiders/cost_of_living.py
@@ -0,0 +1,343 @@
import datetime
import scrapy
import sqlite3

try:
    con = sqlite3.connect('acoli.db')
    cur = con.cursor()
    cities = cur.execute('''
                         SELECT city_id, city_url
                         FROM cities
                         ''').fetchall()
except sqlite3.OperationalError:
    cities = []

url_ids = {}

class CostOfLivingSpider(scrapy.Spider):
    name = 'cost_of_living'
    allowed_domains = ['numbeo.com']
    start_urls = []
    for city in cities:
        url = 'https://www.numbeo.com/cost-of-living/in/' + \
               city[1] + \
               '?displayCurrency=USD'
              # Keep everything in USD for comparison
        url_ids[url] = city[0]
        start_urls.append(url)

    def parse(self, response):
        # Summary
        # Family of 4 cost
        family_monthly_cost = (response.xpath(
            '//li[contains(., "Family of four estimated monthly costs")]//span[@class="emp_number"]/text()'
        ).get() or '').replace(',','').strip('$')
        # Single person cost
        single_person_monthly_cost = (response.xpath(
            '//li[contains(., "A single person estimated monthly costs are")]//span[@class="emp_number"]/text()'
        ).get() or '').replace(',','').strip('$')


        # Restaurant
        #    - Meal, Inexpensive Restaurant
        #    - Meal for 2 People, Mid-range Restaurant, Three-course
        #    - McMeal at McDonalds (or Equivalent Combo Meal)
        #    - Domestic Beer (1 pint draught)
        #    - Imported Beer (12 oz small bottle)
        #    - Cappuccino (regular)
        #    - Coke/Pepsi (12 oz small bottle)
        #    - Water (12 oz small bottle)

        # Markets
        #     - Milk (regular), (1 gallon)
        #     - Loaf of Fresh White Bread (1 lb)
        #     - Rice (white), (1 lb)
        #     - Eggs (regular) (12)
        #     - Local Cheese (1 lb)
        #     - Chicken Fillets (1 lb)
        #     - Beef Rounds (1 lb) (or Equivalent Back Leg Red Meat)
        #     - Apples (1 lb)
        #     - Banana (1 lb)
        #     - Oranges (1 lb)
        #     - Tomato (1 lb)
        #     - Potato (1 lb)
        #     - Onion (1 lb)
        #     - Lettuce (1 head)
        #     - Water (1.5 liter bottle)
        #     - Bottle of Wine (Mid-Range)
        #     - Domestic Beer (0.5 liter bottle)
        #     - Imported Beer (12 oz small bottle)
        #     - Cigarettes 20 Pack (Marlboro)

        # Transportation
        #     - One-way Ticket (Local Transport)
        #     - Monthly Pass (Regular Price)
        #     - Taxi Start (Normal Tariff)
        #     - Taxi 1 mile (Normal Tariff)
        #     - Taxi 1hour Waiting (Normal Tariff)
        #     - Gasoline (1 gallon)
        #     - Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)
        #     - Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)

        # Utilities (Monthly)
        #     - Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment
        #     - 1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)
        #     - Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)

        # Sports and Leisure
        #     - Fitness Club, Monthly Fee for 1 Adult
        #     - Tennis Court Rent (1 Hour on Weekend)
        #     - Cinema, International Release, 1 Seat

        # Childcare
        #     - Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child
        #     - International Primary School, Yearly for 1 Child

        # Clothing and Shoes
        #     - 1 Pair of Jeans (Levis 501 Or Similar)
        #     - 1 Summer Dress in a Chain Store (Zara, H&M, ...)
        #     - 1 Pair of Nike Running Shoes (Mid-Range)
        #     - 1 Pair of Men Leather Business Shoes

        # Rent Per Month
        #     - Apartment (1 bedroom) in City Centre
        #     - Apartment (1 bedroom) Outside of Centre
        #     - Apartment (3 bedrooms) in City Centre
        #     - Apartment (3 bedrooms) Outside of Centre

        # Buy Apartment Price
        #     - Price per Square Feet to Buy Apartment in City Centre
        #     - Price per Square Feet to Buy Apartment Outside of Centre

        # Salaries And Financing
        #     - Average Monthly Net Salary (After Tax)
        #     - Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate
        data_names = [i.strip() for i in response.xpath(
            '//table[@class="data_wide_table new_bar_table"]//tr//td[1]/text()'
        ).getall()]

        data_dict = {}

        # data_dict: { key: [average, low, high] }
        for index in data_names:
            name = index if 'Non-Alcoholic' not in index \
                             and 'Buffalo' not in index\
                         else index.replace('Non-Alcoholic ', '')\
                         .replace('Buffalo', 'Beef')
            if name not in data_dict.keys():
                data_dict[name] = []
            num = len(data_dict[name]) // 3 + 1
            data_dict[name].append((response.xpath(
                                         '//tr[td[contains(., "' + index + \
                                         '")]][' + str(num) + ']//td[2]//span/text()'
                                     ).get() or '').strip('\xa0$').replace(',',''))
            data_dict[name].append((response.xpath(
                                         '//tr[td[contains(., "' + index + \
                                         '")]][' + str(num) + \
                                         ']//span[@class="barTextLeft"]/text()'
                                     ).get() or '').strip().replace(',',''))
            data_dict[name].append((response.xpath(
                                         '//tr[td[contains(., "' + index + \
                                         '")]][' + str(num) + \
                                         ']//span[@class="barTextRight"]/text()'
                                     ).get() or '').strip().replace(',',''))


#        data_values_avg = [i.strip('\xa0$') \
#                             for i in response.xpath(
#                             '//table[@class="data_wide_table new_bar_table"]//tr//td[2]//span/text()'
#                             ).getall()]
#        data_values_low = [i.strip() \
#                           for i in response.xpath(
#                           '//table[@class="data_wide_table new_bar_table"]//span[@class="barTextLeft"]/text()'
#                          ).getall()]
#        data_values_low.insert(-1, '')
#        data_values_high = [i.strip() \
#                           for i in response.xpath(
#                           '//table[@class="data_wide_table new_bar_table"]//span[@class="barTextRight"]/text()'
#                          ).getall()]
#        data_values_high.insert(-1, '')
#
#        data_dict = dict(zip(data_names, data_values_avg))

        # Contributors
        contributions = response.xpath(
                            '//div[@class="align_like_price_table"]/text()'
                        ).get().split()[3]
        contributors = response.xpath(
                            '//div[@class="align_like_price_table"]/text()'
                        ).get().split()[-3]
        print(data_dict.keys())

        yield {
            'city_id': url_ids[response.url],
            'col_timestamp': datetime.datetime.now(),
            'family_monthly_cost': family_monthly_cost,
            'single_person_monthly_cost': single_person_monthly_cost,
            'meal_inexpensive': data_dict['Meal, Inexpensive Restaurant'][0],
            'meal_inexpensive_low': data_dict['Meal, Inexpensive Restaurant'][1],
            'meal_inexpensive_high': data_dict['Meal, Inexpensive Restaurant'][2],
            'meal_for_two_midrange': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][0],
            'meal_for_two_midrange_low': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][1],
            'meal_for_two_midrange_high': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][2],
            'combo_meal': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][0],
            'combo_meal_low': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][1],
            'combo_meal_high': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][2],
            'domestic_beer_restaurant': data_dict['Domestic Beer (1 pint draught)'][0],
            'domestic_beer_restaurant_low': data_dict['Domestic Beer (1 pint draught)'][1],
            'domestic_beer_restaurant_high': data_dict['Domestic Beer (1 pint draught)'][2],
            'imported_beer_restaurant': data_dict['Imported Beer (12 oz small bottle)'][0],
            'imported_beer_restaurant_low': data_dict['Imported Beer (12 oz small bottle)'][1],
            'imported_beer_restaurant_high': data_dict['Imported Beer (12 oz small bottle)'][2],
            'cappuccino': data_dict['Cappuccino (regular)'][0],
            'cappuccino_low': data_dict['Cappuccino (regular)'][1],
            'cappuccino_high': data_dict['Cappuccino (regular)'][2],
            'coke_pepsi': data_dict['Coke/Pepsi (12 oz small bottle)'][0],
            'coke_pepsi_low': data_dict['Coke/Pepsi (12 oz small bottle)'][1],
            'coke_pepsi_high': data_dict['Coke/Pepsi (12 oz small bottle)'][2],
            'water_restaurant': data_dict['Water (12 oz small bottle)'][0],
            'water_restaurant_low': data_dict['Water (12 oz small bottle)'][1],
            'water_restaurant_high': data_dict['Water (12 oz small bottle)'][2],
            'milk': data_dict['Milk (regular), (1 gallon)'][0],
            'milk_low': data_dict['Milk (regular), (1 gallon)'][1],
            'milk_high': data_dict['Milk (regular), (1 gallon)'][2],
            'bread': data_dict['Loaf of Fresh White Bread (1 lb)'][0],
            'bread_low': data_dict['Loaf of Fresh White Bread (1 lb)'][1],
            'bread_high': data_dict['Loaf of Fresh White Bread (1 lb)'][2],
            'rice': data_dict['Rice (white), (1 lb)'][0],
            'rice_low': data_dict['Rice (white), (1 lb)'][1],
            'rice_high': data_dict['Rice (white), (1 lb)'][2],
            'eggs': data_dict['Eggs (regular) (12)'][0],
            'eggs_low': data_dict['Eggs (regular) (12)'][1],
            'eggs_high': data_dict['Eggs (regular) (12)'][2],
            'cheese': data_dict['Local Cheese (1 lb)'][0],
            'cheese_low': data_dict['Local Cheese (1 lb)'][1],
            'cheese_high': data_dict['Local Cheese (1 lb)'][2],
            'chicken': data_dict['Chicken Fillets (1 lb)'][0],
            'chicken_low': data_dict['Chicken Fillets (1 lb)'][1],
            'chicken_high': data_dict['Chicken Fillets (1 lb)'][2],
            'beef_round': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][0],
            'beef_round_low': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][1],
            'beef_round_high': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][2],
            'apples': data_dict['Apples (1 lb)'][0],
            'apples_low': data_dict['Apples (1 lb)'][1],
            'apples_high': data_dict['Apples (1 lb)'][2],
            'bananas': data_dict['Banana (1 lb)'][0],
            'bananas_low': data_dict['Banana (1 lb)'][1],
            'bananas_high': data_dict['Banana (1 lb)'][2],
            'oranges': data_dict['Oranges (1 lb)'][0],
            'oranges_low': data_dict['Oranges (1 lb)'][1],
            'oranges_high': data_dict['Oranges (1 lb)'][2],
            'tomato': data_dict['Tomato (1 lb)'][0],
            'tomato_low': data_dict['Tomato (1 lb)'][1],
            'tomato_high': data_dict['Tomato (1 lb)'][2],
            'potato': data_dict['Potato (1 lb)'][0],
            'potato_low': data_dict['Potato (1 lb)'][1],
            'potato_high': data_dict['Potato (1 lb)'][2],
            'onion': data_dict['Onion (1 lb)'][0],
            'onion_low': data_dict['Onion (1 lb)'][1],
            'onion_high': data_dict['Onion (1 lb)'][2],
            'lettuce': data_dict['Lettuce (1 head)'][0],
            'lettuce_low': data_dict['Lettuce (1 head)'][1],
            'lettuce_high': data_dict['Lettuce (1 head)'][2],
            'water_market': data_dict['Water (1.5 liter bottle)'][0],
            'water_market_low': data_dict['Water (1.5 liter bottle)'][1],
            'water_market_high': data_dict['Water (1.5 liter bottle)'][2],
            'wine': data_dict['Bottle of Wine (Mid-Range)'][0],
            'wine_low': data_dict['Bottle of Wine (Mid-Range)'][1],
            'wine_high': data_dict['Bottle of Wine (Mid-Range)'][2],
            'domestic_beer_market': data_dict['Domestic Beer (0.5 liter bottle)'][0],
            'domestic_beer_market_low': data_dict['Domestic Beer (0.5 liter bottle)'][1],
            'domestic_beer_market_high': data_dict['Domestic Beer (0.5 liter bottle)'][2],
            'imported_beer_market': data_dict['Imported Beer (12 oz small bottle)'][3],
            'imported_beer_market_low': data_dict['Imported Beer (12 oz small bottle)'][4],
            'imported_beer_market_high': data_dict['Imported Beer (12 oz small bottle)'][5],
            'cigarettes': data_dict['Cigarettes 20 Pack (Marlboro)'][0],
            'cigarettes_low': data_dict['Cigarettes 20 Pack (Marlboro)'][1],
            'cigarettes_high': data_dict['Cigarettes 20 Pack (Marlboro)'][2],
            'one_way_ticket': data_dict['One-way Ticket (Local Transport)'][0],
            'one_way_ticket_low': data_dict['One-way Ticket (Local Transport)'][1],
            'one_way_ticket_high': data_dict['One-way Ticket (Local Transport)'][2],
            'monthly_pass': data_dict['Monthly Pass (Regular Price)'][0],
            'monthly_pass_low': data_dict['Monthly Pass (Regular Price)'][1],
            'monthly_pass_high': data_dict['Monthly Pass (Regular Price)'][2],
            'taxi_start': data_dict['Taxi Start (Normal Tariff)'][0],
            'taxi_start_low': data_dict['Taxi Start (Normal Tariff)'][1],
            'taxi_start_high': data_dict['Taxi Start (Normal Tariff)'][2],
            'taxi_per_mile': data_dict['Taxi 1 mile (Normal Tariff)'][0],
            'taxi_per_mile_low': data_dict['Taxi 1 mile (Normal Tariff)'][1],
            'taxi_per_mile_high': data_dict['Taxi 1 mile (Normal Tariff)'][2],
            'taxi_per_hour': data_dict['Taxi 1hour Waiting (Normal Tariff)'][0],
            'taxi_per_hour_low': data_dict['Taxi 1hour Waiting (Normal Tariff)'][1],
            'taxi_per_hour_high': data_dict['Taxi 1hour Waiting (Normal Tariff)'][2],
            'gasoline': data_dict['Gasoline (1 gallon)'][0],
            'gasoline_low': data_dict['Gasoline (1 gallon)'][1],
            'gasoline_high': data_dict['Gasoline (1 gallon)'][2],
            'volkswagen_golf': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][0],
            'volkswagen_golf_low': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][1],
            'volkswagen_golf_high': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][2],
            'toyota_corolla': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][0],
            'toyota_corolla_low': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][1],
            'toyota_corolla_high': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][2],
            'basic_utilities': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][0],
            'basic_utilities_low': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][1],
            'basic_utilities_high': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][2],
            'prepaid_mobile_per_minute': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][0],
            'prepaid_mobile_per_minute_low': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][1],
            'prepaid_mobile_per_minute_high': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][2],
            'internet': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][0],
            'internet_low': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][1],
            'internet_high': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][2],
            'fitness_club': data_dict['Fitness Club, Monthly Fee for 1 Adult'][0],
            'fitness_club_low': data_dict['Fitness Club, Monthly Fee for 1 Adult'][1],
            'fitness_club_high': data_dict['Fitness Club, Monthly Fee for 1 Adult'][2],
            'tennis_court_per_hour': data_dict['Tennis Court Rent (1 Hour on Weekend)'][0],
            'tennis_court_per_hour_low': data_dict['Tennis Court Rent (1 Hour on Weekend)'][1],
            'tennis_court_per_hour_high': data_dict['Tennis Court Rent (1 Hour on Weekend)'][2],
            'cinema': data_dict['Cinema, International Release, 1 Seat'][0],
            'cinema_low': data_dict['Cinema, International Release, 1 Seat'][1],
            'cinema_high': data_dict['Cinema, International Release, 1 Seat'][2],
            'preschool_monthly': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][0],
            'preschool_monthly_low': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][1],
            'preschool_monthly_high': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][2],
            'primary_school_yearly': data_dict['International Primary School, Yearly for 1 Child'][0],
            'primary_school_yearly_low': data_dict['International Primary School, Yearly for 1 Child'][1],
            'primary_school_yearly_high': data_dict['International Primary School, Yearly for 1 Child'][2],
            'jeans': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][0],
            'jeans_low': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][1],
            'jeans_high': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][2],
            'dress': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][0],
            'dress_low': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][1],
            'dress_high': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][2],
            'running_shoes': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][0],
            'running_shoes_low': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][1],
            'running_shoes_high': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][2],
            'business_shoes': data_dict['1 Pair of Men Leather Business Shoes'][0],
            'business_shoes_low': data_dict['1 Pair of Men Leather Business Shoes'][1],
            'business_shoes_high': data_dict['1 Pair of Men Leather Business Shoes'][2],
            'rent_one_br_city_center': data_dict['Apartment (1 bedroom) in City Centre'][0],
            'rent_one_br_city_center_low': data_dict['Apartment (1 bedroom) in City Centre'][1],
            'rent_one_br_city_center_high': data_dict['Apartment (1 bedroom) in City Centre'][2],
            'rent_one_br_outside_center': data_dict['Apartment (1 bedroom) Outside of Centre'][0],
            'rent_one_br_outside_center_low': data_dict['Apartment (1 bedroom) Outside of Centre'][1],
            'rent_one_br_outside_center_high': data_dict['Apartment (1 bedroom) Outside of Centre'][2],
            'rent_three_br_city_center': data_dict['Apartment (3 bedrooms) in City Centre'][0],
            'rent_three_br_city_center_low': data_dict['Apartment (3 bedrooms) in City Centre'][1],
            'rent_three_br_city_center_high': data_dict['Apartment (3 bedrooms) in City Centre'][2],
            'rent_three_br_outside_center': data_dict['Apartment (3 bedrooms) Outside of Centre'][0],
            'rent_three_br_outside_center_low': data_dict['Apartment (3 bedrooms) Outside of Centre'][1],
            'rent_three_br_outside_center_high': data_dict['Apartment (3 bedrooms) Outside of Centre'][2],
            'buy_city_center_per_sqft': data_dict['Price per Square Feet to Buy Apartment in City Centre'][0],
            'buy_city_center_per_sqft_low': data_dict['Price per Square Feet to Buy Apartment in City Centre'][1],
            'buy_city_center_per_sqft_high': data_dict['Price per Square Feet to Buy Apartment in City Centre'][2],
            'buy_outside_center_per_sqft': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][0],
            'buy_outside_center_per_sqft_low': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][1],
            'buy_outside_center_per_sqft_high': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][2],
            'monthly_salary': data_dict['Average Monthly Net Salary (After Tax)'][0],
            'mortgage_rate': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][0],
            'mortgage_rate_low': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][1],
            'mortgage_rate_high': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][2],
            'contributions': contributions,
            'contributors': contributors,
        }

-- 
2.33.0
Details
Message ID
<CFUERCJCNBYI.29FKETPZPWC9@nano>
In-Reply-To
<20211120045545.110522-1-rmedran4@mail.ccsf.edu> (view parent)
DKIM signature
missing
Download raw message
applied
Reply to thread Export thread (mbox)