Rebecca Medrano: 1 spiders: add cost_of_living.py 1 files changed, 343 insertions(+), 0 deletions(-)
Copy & paste the following snippet into your terminal to import this patchset into git:
curl -s https://lists.sr.ht/~akspecs/numbeo-scraping-dev/patches/26701/mbox | git am -3Learn more about email & git
- scrapes data from the numbeo cost_of_living page --- numbeo/numbeo/spiders/cost_of_living.py | 343 ++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 numbeo/numbeo/spiders/cost_of_living.py diff --git a/numbeo/numbeo/spiders/cost_of_living.py b/numbeo/numbeo/spiders/cost_of_living.py new file mode 100644 index 0000000..43fb20f --- /dev/null +++ b/numbeo/numbeo/spiders/cost_of_living.py @@ -0,0 +1,343 @@ +import datetime +import scrapy +import sqlite3 + +try: + con = sqlite3.connect('acoli.db') + cur = con.cursor() + cities = cur.execute(''' + SELECT city_id, city_url + FROM cities + ''').fetchall() +except sqlite3.OperationalError: + cities = [] + +url_ids = {} + +class CostOfLivingSpider(scrapy.Spider): + name = 'cost_of_living' + allowed_domains = ['numbeo.com'] + start_urls = [] + for city in cities: + url = 'https://www.numbeo.com/cost-of-living/in/' + \ + city[1] + \ + '?displayCurrency=USD' + # Keep everything in USD for comparison + url_ids[url] = city[0] + start_urls.append(url) + + def parse(self, response): + # Summary + # Family of 4 cost + family_monthly_cost = (response.xpath( + '//li[contains(., "Family of four estimated monthly costs")]//span[@class="emp_number"]/text()' + ).get() or '').replace(',','').strip('$') + # Single person cost + single_person_monthly_cost = (response.xpath( + '//li[contains(., "A single person estimated monthly costs are")]//span[@class="emp_number"]/text()' + ).get() or '').replace(',','').strip('$') + + + # Restaurant + # - Meal, Inexpensive Restaurant + # - Meal for 2 People, Mid-range Restaurant, Three-course + # - McMeal at McDonalds (or Equivalent Combo Meal) + # - Domestic Beer (1 pint draught) + # - Imported Beer (12 oz small bottle) + # - Cappuccino (regular) + # - Coke/Pepsi (12 oz small bottle) + # - Water (12 oz small bottle) + + # Markets + # - Milk (regular), (1 gallon) + # - Loaf of Fresh White Bread (1 lb) + # - Rice (white), (1 lb) + # - Eggs (regular) (12) + # - Local Cheese (1 lb) + # - Chicken Fillets (1 lb) + # - Beef Rounds (1 lb) (or Equivalent Back Leg Red Meat) + # - Apples (1 lb) + # - Banana (1 lb) + # - Oranges (1 lb) + # - Tomato (1 lb) + # - Potato (1 lb) + # - Onion (1 lb) + # - Lettuce (1 head) + # - Water (1.5 liter bottle) + # - Bottle of Wine (Mid-Range) + # - Domestic Beer (0.5 liter bottle) + # - Imported Beer (12 oz small bottle) + # - Cigarettes 20 Pack (Marlboro) + + # Transportation + # - One-way Ticket (Local Transport) + # - Monthly Pass (Regular Price) + # - Taxi Start (Normal Tariff) + # - Taxi 1 mile (Normal Tariff) + # - Taxi 1hour Waiting (Normal Tariff) + # - Gasoline (1 gallon) + # - Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car) + # - Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car) + + # Utilities (Monthly) + # - Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment + # - 1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans) + # - Internet (60 Mbps or More, Unlimited Data, Cable/ADSL) + + # Sports and Leisure + # - Fitness Club, Monthly Fee for 1 Adult + # - Tennis Court Rent (1 Hour on Weekend) + # - Cinema, International Release, 1 Seat + + # Childcare + # - Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child + # - International Primary School, Yearly for 1 Child + + # Clothing and Shoes + # - 1 Pair of Jeans (Levis 501 Or Similar) + # - 1 Summer Dress in a Chain Store (Zara, H&M, ...) + # - 1 Pair of Nike Running Shoes (Mid-Range) + # - 1 Pair of Men Leather Business Shoes + + # Rent Per Month + # - Apartment (1 bedroom) in City Centre + # - Apartment (1 bedroom) Outside of Centre + # - Apartment (3 bedrooms) in City Centre + # - Apartment (3 bedrooms) Outside of Centre + + # Buy Apartment Price + # - Price per Square Feet to Buy Apartment in City Centre + # - Price per Square Feet to Buy Apartment Outside of Centre + + # Salaries And Financing + # - Average Monthly Net Salary (After Tax) + # - Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate + data_names = [i.strip() for i in response.xpath( + '//table[@class="data_wide_table new_bar_table"]//tr//td[1]/text()' + ).getall()] + + data_dict = {} + + # data_dict: { key: [average, low, high] } + for index in data_names: + name = index if 'Non-Alcoholic' not in index \ + and 'Buffalo' not in index\ + else index.replace('Non-Alcoholic ', '')\ + .replace('Buffalo', 'Beef') + if name not in data_dict.keys(): + data_dict[name] = [] + num = len(data_dict[name]) // 3 + 1 + data_dict[name].append((response.xpath( + '//tr[td[contains(., "' + index + \ + '")]][' + str(num) + ']//td[2]//span/text()' + ).get() or '').strip('\xa0$').replace(',','')) + data_dict[name].append((response.xpath( + '//tr[td[contains(., "' + index + \ + '")]][' + str(num) + \ + ']//span[@class="barTextLeft"]/text()' + ).get() or '').strip().replace(',','')) + data_dict[name].append((response.xpath( + '//tr[td[contains(., "' + index + \ + '")]][' + str(num) + \ + ']//span[@class="barTextRight"]/text()' + ).get() or '').strip().replace(',','')) + + +# data_values_avg = [i.strip('\xa0$') \ +# for i in response.xpath( +# '//table[@class="data_wide_table new_bar_table"]//tr//td[2]//span/text()' +# ).getall()] +# data_values_low = [i.strip() \ +# for i in response.xpath( +# '//table[@class="data_wide_table new_bar_table"]//span[@class="barTextLeft"]/text()' +# ).getall()] +# data_values_low.insert(-1, '') +# data_values_high = [i.strip() \ +# for i in response.xpath( +# '//table[@class="data_wide_table new_bar_table"]//span[@class="barTextRight"]/text()' +# ).getall()] +# data_values_high.insert(-1, '') +# +# data_dict = dict(zip(data_names, data_values_avg)) + + # Contributors + contributions = response.xpath( + '//div[@class="align_like_price_table"]/text()' + ).get().split()[3] + contributors = response.xpath( + '//div[@class="align_like_price_table"]/text()' + ).get().split()[-3] + print(data_dict.keys()) + + yield { + 'city_id': url_ids[response.url], + 'col_timestamp': datetime.datetime.now(), + 'family_monthly_cost': family_monthly_cost, + 'single_person_monthly_cost': single_person_monthly_cost, + 'meal_inexpensive': data_dict['Meal, Inexpensive Restaurant'][0], + 'meal_inexpensive_low': data_dict['Meal, Inexpensive Restaurant'][1], + 'meal_inexpensive_high': data_dict['Meal, Inexpensive Restaurant'][2], + 'meal_for_two_midrange': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][0], + 'meal_for_two_midrange_low': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][1], + 'meal_for_two_midrange_high': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][2], + 'combo_meal': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][0], + 'combo_meal_low': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][1], + 'combo_meal_high': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][2], + 'domestic_beer_restaurant': data_dict['Domestic Beer (1 pint draught)'][0], + 'domestic_beer_restaurant_low': data_dict['Domestic Beer (1 pint draught)'][1], + 'domestic_beer_restaurant_high': data_dict['Domestic Beer (1 pint draught)'][2], + 'imported_beer_restaurant': data_dict['Imported Beer (12 oz small bottle)'][0], + 'imported_beer_restaurant_low': data_dict['Imported Beer (12 oz small bottle)'][1], + 'imported_beer_restaurant_high': data_dict['Imported Beer (12 oz small bottle)'][2], + 'cappuccino': data_dict['Cappuccino (regular)'][0], + 'cappuccino_low': data_dict['Cappuccino (regular)'][1], + 'cappuccino_high': data_dict['Cappuccino (regular)'][2], + 'coke_pepsi': data_dict['Coke/Pepsi (12 oz small bottle)'][0], + 'coke_pepsi_low': data_dict['Coke/Pepsi (12 oz small bottle)'][1], + 'coke_pepsi_high': data_dict['Coke/Pepsi (12 oz small bottle)'][2], + 'water_restaurant': data_dict['Water (12 oz small bottle)'][0], + 'water_restaurant_low': data_dict['Water (12 oz small bottle)'][1], + 'water_restaurant_high': data_dict['Water (12 oz small bottle)'][2], + 'milk': data_dict['Milk (regular), (1 gallon)'][0], + 'milk_low': data_dict['Milk (regular), (1 gallon)'][1], + 'milk_high': data_dict['Milk (regular), (1 gallon)'][2], + 'bread': data_dict['Loaf of Fresh White Bread (1 lb)'][0], + 'bread_low': data_dict['Loaf of Fresh White Bread (1 lb)'][1], + 'bread_high': data_dict['Loaf of Fresh White Bread (1 lb)'][2], + 'rice': data_dict['Rice (white), (1 lb)'][0], + 'rice_low': data_dict['Rice (white), (1 lb)'][1], + 'rice_high': data_dict['Rice (white), (1 lb)'][2], + 'eggs': data_dict['Eggs (regular) (12)'][0], + 'eggs_low': data_dict['Eggs (regular) (12)'][1], + 'eggs_high': data_dict['Eggs (regular) (12)'][2], + 'cheese': data_dict['Local Cheese (1 lb)'][0], + 'cheese_low': data_dict['Local Cheese (1 lb)'][1], + 'cheese_high': data_dict['Local Cheese (1 lb)'][2], + 'chicken': data_dict['Chicken Fillets (1 lb)'][0], + 'chicken_low': data_dict['Chicken Fillets (1 lb)'][1], + 'chicken_high': data_dict['Chicken Fillets (1 lb)'][2], + 'beef_round': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][0], + 'beef_round_low': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][1], + 'beef_round_high': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][2], + 'apples': data_dict['Apples (1 lb)'][0], + 'apples_low': data_dict['Apples (1 lb)'][1], + 'apples_high': data_dict['Apples (1 lb)'][2], + 'bananas': data_dict['Banana (1 lb)'][0], + 'bananas_low': data_dict['Banana (1 lb)'][1], + 'bananas_high': data_dict['Banana (1 lb)'][2], + 'oranges': data_dict['Oranges (1 lb)'][0], + 'oranges_low': data_dict['Oranges (1 lb)'][1], + 'oranges_high': data_dict['Oranges (1 lb)'][2], + 'tomato': data_dict['Tomato (1 lb)'][0], + 'tomato_low': data_dict['Tomato (1 lb)'][1], + 'tomato_high': data_dict['Tomato (1 lb)'][2], + 'potato': data_dict['Potato (1 lb)'][0], + 'potato_low': data_dict['Potato (1 lb)'][1], + 'potato_high': data_dict['Potato (1 lb)'][2], + 'onion': data_dict['Onion (1 lb)'][0], + 'onion_low': data_dict['Onion (1 lb)'][1], + 'onion_high': data_dict['Onion (1 lb)'][2], + 'lettuce': data_dict['Lettuce (1 head)'][0], + 'lettuce_low': data_dict['Lettuce (1 head)'][1], + 'lettuce_high': data_dict['Lettuce (1 head)'][2], + 'water_market': data_dict['Water (1.5 liter bottle)'][0], + 'water_market_low': data_dict['Water (1.5 liter bottle)'][1], + 'water_market_high': data_dict['Water (1.5 liter bottle)'][2], + 'wine': data_dict['Bottle of Wine (Mid-Range)'][0], + 'wine_low': data_dict['Bottle of Wine (Mid-Range)'][1], + 'wine_high': data_dict['Bottle of Wine (Mid-Range)'][2], + 'domestic_beer_market': data_dict['Domestic Beer (0.5 liter bottle)'][0], + 'domestic_beer_market_low': data_dict['Domestic Beer (0.5 liter bottle)'][1], + 'domestic_beer_market_high': data_dict['Domestic Beer (0.5 liter bottle)'][2], + 'imported_beer_market': data_dict['Imported Beer (12 oz small bottle)'][3], + 'imported_beer_market_low': data_dict['Imported Beer (12 oz small bottle)'][4], + 'imported_beer_market_high': data_dict['Imported Beer (12 oz small bottle)'][5], + 'cigarettes': data_dict['Cigarettes 20 Pack (Marlboro)'][0], + 'cigarettes_low': data_dict['Cigarettes 20 Pack (Marlboro)'][1], + 'cigarettes_high': data_dict['Cigarettes 20 Pack (Marlboro)'][2], + 'one_way_ticket': data_dict['One-way Ticket (Local Transport)'][0], + 'one_way_ticket_low': data_dict['One-way Ticket (Local Transport)'][1], + 'one_way_ticket_high': data_dict['One-way Ticket (Local Transport)'][2], + 'monthly_pass': data_dict['Monthly Pass (Regular Price)'][0], + 'monthly_pass_low': data_dict['Monthly Pass (Regular Price)'][1], + 'monthly_pass_high': data_dict['Monthly Pass (Regular Price)'][2], + 'taxi_start': data_dict['Taxi Start (Normal Tariff)'][0], + 'taxi_start_low': data_dict['Taxi Start (Normal Tariff)'][1], + 'taxi_start_high': data_dict['Taxi Start (Normal Tariff)'][2], + 'taxi_per_mile': data_dict['Taxi 1 mile (Normal Tariff)'][0], + 'taxi_per_mile_low': data_dict['Taxi 1 mile (Normal Tariff)'][1], + 'taxi_per_mile_high': data_dict['Taxi 1 mile (Normal Tariff)'][2], + 'taxi_per_hour': data_dict['Taxi 1hour Waiting (Normal Tariff)'][0], + 'taxi_per_hour_low': data_dict['Taxi 1hour Waiting (Normal Tariff)'][1], + 'taxi_per_hour_high': data_dict['Taxi 1hour Waiting (Normal Tariff)'][2], + 'gasoline': data_dict['Gasoline (1 gallon)'][0], + 'gasoline_low': data_dict['Gasoline (1 gallon)'][1], + 'gasoline_high': data_dict['Gasoline (1 gallon)'][2], + 'volkswagen_golf': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][0], + 'volkswagen_golf_low': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][1], + 'volkswagen_golf_high': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][2], + 'toyota_corolla': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][0], + 'toyota_corolla_low': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][1], + 'toyota_corolla_high': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][2], + 'basic_utilities': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][0], + 'basic_utilities_low': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][1], + 'basic_utilities_high': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][2], + 'prepaid_mobile_per_minute': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][0], + 'prepaid_mobile_per_minute_low': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][1], + 'prepaid_mobile_per_minute_high': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][2], + 'internet': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][0], + 'internet_low': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][1], + 'internet_high': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][2], + 'fitness_club': data_dict['Fitness Club, Monthly Fee for 1 Adult'][0], + 'fitness_club_low': data_dict['Fitness Club, Monthly Fee for 1 Adult'][1], + 'fitness_club_high': data_dict['Fitness Club, Monthly Fee for 1 Adult'][2], + 'tennis_court_per_hour': data_dict['Tennis Court Rent (1 Hour on Weekend)'][0], + 'tennis_court_per_hour_low': data_dict['Tennis Court Rent (1 Hour on Weekend)'][1], + 'tennis_court_per_hour_high': data_dict['Tennis Court Rent (1 Hour on Weekend)'][2], + 'cinema': data_dict['Cinema, International Release, 1 Seat'][0], + 'cinema_low': data_dict['Cinema, International Release, 1 Seat'][1], + 'cinema_high': data_dict['Cinema, International Release, 1 Seat'][2], + 'preschool_monthly': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][0], + 'preschool_monthly_low': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][1], + 'preschool_monthly_high': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][2], + 'primary_school_yearly': data_dict['International Primary School, Yearly for 1 Child'][0], + 'primary_school_yearly_low': data_dict['International Primary School, Yearly for 1 Child'][1], + 'primary_school_yearly_high': data_dict['International Primary School, Yearly for 1 Child'][2], + 'jeans': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][0], + 'jeans_low': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][1], + 'jeans_high': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][2], + 'dress': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][0], + 'dress_low': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][1], + 'dress_high': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][2], + 'running_shoes': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][0], + 'running_shoes_low': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][1], + 'running_shoes_high': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][2], + 'business_shoes': data_dict['1 Pair of Men Leather Business Shoes'][0], + 'business_shoes_low': data_dict['1 Pair of Men Leather Business Shoes'][1], + 'business_shoes_high': data_dict['1 Pair of Men Leather Business Shoes'][2], + 'rent_one_br_city_center': data_dict['Apartment (1 bedroom) in City Centre'][0], + 'rent_one_br_city_center_low': data_dict['Apartment (1 bedroom) in City Centre'][1], + 'rent_one_br_city_center_high': data_dict['Apartment (1 bedroom) in City Centre'][2], + 'rent_one_br_outside_center': data_dict['Apartment (1 bedroom) Outside of Centre'][0], + 'rent_one_br_outside_center_low': data_dict['Apartment (1 bedroom) Outside of Centre'][1], + 'rent_one_br_outside_center_high': data_dict['Apartment (1 bedroom) Outside of Centre'][2], + 'rent_three_br_city_center': data_dict['Apartment (3 bedrooms) in City Centre'][0], + 'rent_three_br_city_center_low': data_dict['Apartment (3 bedrooms) in City Centre'][1], + 'rent_three_br_city_center_high': data_dict['Apartment (3 bedrooms) in City Centre'][2], + 'rent_three_br_outside_center': data_dict['Apartment (3 bedrooms) Outside of Centre'][0], + 'rent_three_br_outside_center_low': data_dict['Apartment (3 bedrooms) Outside of Centre'][1], + 'rent_three_br_outside_center_high': data_dict['Apartment (3 bedrooms) Outside of Centre'][2], + 'buy_city_center_per_sqft': data_dict['Price per Square Feet to Buy Apartment in City Centre'][0], + 'buy_city_center_per_sqft_low': data_dict['Price per Square Feet to Buy Apartment in City Centre'][1], + 'buy_city_center_per_sqft_high': data_dict['Price per Square Feet to Buy Apartment in City Centre'][2], + 'buy_outside_center_per_sqft': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][0], + 'buy_outside_center_per_sqft_low': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][1], + 'buy_outside_center_per_sqft_high': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][2], + 'monthly_salary': data_dict['Average Monthly Net Salary (After Tax)'][0], + 'mortgage_rate': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][0], + 'mortgage_rate_low': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][1], + 'mortgage_rate_high': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][2], + 'contributions': contributions, + 'contributors': contributors, + } + -- 2.33.0
Andrei K. <akhartch@mail.ccsf.edu>applied