- scrapes data from the numbeo cost_of_living page
---
numbeo/numbeo/spiders/cost_of_living.py | 343 ++++++++++++++++++++++++
1 file changed, 343 insertions(+)
create mode 100644 numbeo/numbeo/spiders/cost_of_living.py
diff --git a/numbeo/numbeo/spiders/cost_of_living.py b/numbeo/numbeo/spiders/cost_of_living.py
new file mode 100644
index 0000000..43fb20f
--- /dev/null
+++ b/numbeo/numbeo/spiders/cost_of_living.py
@@ -0,0 +1,343 @@
+import datetime
+import scrapy
+import sqlite3
+
+try:
+ con = sqlite3.connect('acoli.db')
+ cur = con.cursor()
+ cities = cur.execute('''
+ SELECT city_id, city_url
+ FROM cities
+ ''').fetchall()
+except sqlite3.OperationalError:
+ cities = []
+
+url_ids = {}
+
+class CostOfLivingSpider(scrapy.Spider):
+ name = 'cost_of_living'
+ allowed_domains = ['numbeo.com']
+ start_urls = []
+ for city in cities:
+ url = 'https://www.numbeo.com/cost-of-living/in/' + \
+ city[1] + \
+ '?displayCurrency=USD'
+ # Keep everything in USD for comparison
+ url_ids[url] = city[0]
+ start_urls.append(url)
+
+ def parse(self, response):
+ # Summary
+ # Family of 4 cost
+ family_monthly_cost = (response.xpath(
+ '//li[contains(., "Family of four estimated monthly costs")]//span[@class="emp_number"]/text()'
+ ).get() or '').replace(',','').strip('$')
+ # Single person cost
+ single_person_monthly_cost = (response.xpath(
+ '//li[contains(., "A single person estimated monthly costs are")]//span[@class="emp_number"]/text()'
+ ).get() or '').replace(',','').strip('$')
+
+
+ # Restaurant
+ # - Meal, Inexpensive Restaurant
+ # - Meal for 2 People, Mid-range Restaurant, Three-course
+ # - McMeal at McDonalds (or Equivalent Combo Meal)
+ # - Domestic Beer (1 pint draught)
+ # - Imported Beer (12 oz small bottle)
+ # - Cappuccino (regular)
+ # - Coke/Pepsi (12 oz small bottle)
+ # - Water (12 oz small bottle)
+
+ # Markets
+ # - Milk (regular), (1 gallon)
+ # - Loaf of Fresh White Bread (1 lb)
+ # - Rice (white), (1 lb)
+ # - Eggs (regular) (12)
+ # - Local Cheese (1 lb)
+ # - Chicken Fillets (1 lb)
+ # - Beef Rounds (1 lb) (or Equivalent Back Leg Red Meat)
+ # - Apples (1 lb)
+ # - Banana (1 lb)
+ # - Oranges (1 lb)
+ # - Tomato (1 lb)
+ # - Potato (1 lb)
+ # - Onion (1 lb)
+ # - Lettuce (1 head)
+ # - Water (1.5 liter bottle)
+ # - Bottle of Wine (Mid-Range)
+ # - Domestic Beer (0.5 liter bottle)
+ # - Imported Beer (12 oz small bottle)
+ # - Cigarettes 20 Pack (Marlboro)
+
+ # Transportation
+ # - One-way Ticket (Local Transport)
+ # - Monthly Pass (Regular Price)
+ # - Taxi Start (Normal Tariff)
+ # - Taxi 1 mile (Normal Tariff)
+ # - Taxi 1hour Waiting (Normal Tariff)
+ # - Gasoline (1 gallon)
+ # - Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)
+ # - Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)
+
+ # Utilities (Monthly)
+ # - Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment
+ # - 1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)
+ # - Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)
+
+ # Sports and Leisure
+ # - Fitness Club, Monthly Fee for 1 Adult
+ # - Tennis Court Rent (1 Hour on Weekend)
+ # - Cinema, International Release, 1 Seat
+
+ # Childcare
+ # - Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child
+ # - International Primary School, Yearly for 1 Child
+
+ # Clothing and Shoes
+ # - 1 Pair of Jeans (Levis 501 Or Similar)
+ # - 1 Summer Dress in a Chain Store (Zara, H&M, ...)
+ # - 1 Pair of Nike Running Shoes (Mid-Range)
+ # - 1 Pair of Men Leather Business Shoes
+
+ # Rent Per Month
+ # - Apartment (1 bedroom) in City Centre
+ # - Apartment (1 bedroom) Outside of Centre
+ # - Apartment (3 bedrooms) in City Centre
+ # - Apartment (3 bedrooms) Outside of Centre
+
+ # Buy Apartment Price
+ # - Price per Square Feet to Buy Apartment in City Centre
+ # - Price per Square Feet to Buy Apartment Outside of Centre
+
+ # Salaries And Financing
+ # - Average Monthly Net Salary (After Tax)
+ # - Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate
+ data_names = [i.strip() for i in response.xpath(
+ '//table[@class="data_wide_table new_bar_table"]//tr//td[1]/text()'
+ ).getall()]
+
+ data_dict = {}
+
+ # data_dict: { key: [average, low, high] }
+ for index in data_names:
+ name = index if 'Non-Alcoholic' not in index \
+ and 'Buffalo' not in index\
+ else index.replace('Non-Alcoholic ', '')\
+ .replace('Buffalo', 'Beef')
+ if name not in data_dict.keys():
+ data_dict[name] = []
+ num = len(data_dict[name]) // 3 + 1
+ data_dict[name].append((response.xpath(
+ '//tr[td[contains(., "' + index + \
+ '")]][' + str(num) + ']//td[2]//span/text()'
+ ).get() or '').strip('\xa0$').replace(',',''))
+ data_dict[name].append((response.xpath(
+ '//tr[td[contains(., "' + index + \
+ '")]][' + str(num) + \
+ ']//span[@class="barTextLeft"]/text()'
+ ).get() or '').strip().replace(',',''))
+ data_dict[name].append((response.xpath(
+ '//tr[td[contains(., "' + index + \
+ '")]][' + str(num) + \
+ ']//span[@class="barTextRight"]/text()'
+ ).get() or '').strip().replace(',',''))
+
+
+# data_values_avg = [i.strip('\xa0$') \
+# for i in response.xpath(
+# '//table[@class="data_wide_table new_bar_table"]//tr//td[2]//span/text()'
+# ).getall()]
+# data_values_low = [i.strip() \
+# for i in response.xpath(
+# '//table[@class="data_wide_table new_bar_table"]//span[@class="barTextLeft"]/text()'
+# ).getall()]
+# data_values_low.insert(-1, '')
+# data_values_high = [i.strip() \
+# for i in response.xpath(
+# '//table[@class="data_wide_table new_bar_table"]//span[@class="barTextRight"]/text()'
+# ).getall()]
+# data_values_high.insert(-1, '')
+#
+# data_dict = dict(zip(data_names, data_values_avg))
+
+ # Contributors
+ contributions = response.xpath(
+ '//div[@class="align_like_price_table"]/text()'
+ ).get().split()[3]
+ contributors = response.xpath(
+ '//div[@class="align_like_price_table"]/text()'
+ ).get().split()[-3]
+ print(data_dict.keys())
+
+ yield {
+ 'city_id': url_ids[response.url],
+ 'col_timestamp': datetime.datetime.now(),
+ 'family_monthly_cost': family_monthly_cost,
+ 'single_person_monthly_cost': single_person_monthly_cost,
+ 'meal_inexpensive': data_dict['Meal, Inexpensive Restaurant'][0],
+ 'meal_inexpensive_low': data_dict['Meal, Inexpensive Restaurant'][1],
+ 'meal_inexpensive_high': data_dict['Meal, Inexpensive Restaurant'][2],
+ 'meal_for_two_midrange': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][0],
+ 'meal_for_two_midrange_low': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][1],
+ 'meal_for_two_midrange_high': data_dict['Meal for 2 People, Mid-range Restaurant, Three-course'][2],
+ 'combo_meal': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][0],
+ 'combo_meal_low': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][1],
+ 'combo_meal_high': data_dict['McMeal at McDonalds (or Equivalent Combo Meal)'][2],
+ 'domestic_beer_restaurant': data_dict['Domestic Beer (1 pint draught)'][0],
+ 'domestic_beer_restaurant_low': data_dict['Domestic Beer (1 pint draught)'][1],
+ 'domestic_beer_restaurant_high': data_dict['Domestic Beer (1 pint draught)'][2],
+ 'imported_beer_restaurant': data_dict['Imported Beer (12 oz small bottle)'][0],
+ 'imported_beer_restaurant_low': data_dict['Imported Beer (12 oz small bottle)'][1],
+ 'imported_beer_restaurant_high': data_dict['Imported Beer (12 oz small bottle)'][2],
+ 'cappuccino': data_dict['Cappuccino (regular)'][0],
+ 'cappuccino_low': data_dict['Cappuccino (regular)'][1],
+ 'cappuccino_high': data_dict['Cappuccino (regular)'][2],
+ 'coke_pepsi': data_dict['Coke/Pepsi (12 oz small bottle)'][0],
+ 'coke_pepsi_low': data_dict['Coke/Pepsi (12 oz small bottle)'][1],
+ 'coke_pepsi_high': data_dict['Coke/Pepsi (12 oz small bottle)'][2],
+ 'water_restaurant': data_dict['Water (12 oz small bottle)'][0],
+ 'water_restaurant_low': data_dict['Water (12 oz small bottle)'][1],
+ 'water_restaurant_high': data_dict['Water (12 oz small bottle)'][2],
+ 'milk': data_dict['Milk (regular), (1 gallon)'][0],
+ 'milk_low': data_dict['Milk (regular), (1 gallon)'][1],
+ 'milk_high': data_dict['Milk (regular), (1 gallon)'][2],
+ 'bread': data_dict['Loaf of Fresh White Bread (1 lb)'][0],
+ 'bread_low': data_dict['Loaf of Fresh White Bread (1 lb)'][1],
+ 'bread_high': data_dict['Loaf of Fresh White Bread (1 lb)'][2],
+ 'rice': data_dict['Rice (white), (1 lb)'][0],
+ 'rice_low': data_dict['Rice (white), (1 lb)'][1],
+ 'rice_high': data_dict['Rice (white), (1 lb)'][2],
+ 'eggs': data_dict['Eggs (regular) (12)'][0],
+ 'eggs_low': data_dict['Eggs (regular) (12)'][1],
+ 'eggs_high': data_dict['Eggs (regular) (12)'][2],
+ 'cheese': data_dict['Local Cheese (1 lb)'][0],
+ 'cheese_low': data_dict['Local Cheese (1 lb)'][1],
+ 'cheese_high': data_dict['Local Cheese (1 lb)'][2],
+ 'chicken': data_dict['Chicken Fillets (1 lb)'][0],
+ 'chicken_low': data_dict['Chicken Fillets (1 lb)'][1],
+ 'chicken_high': data_dict['Chicken Fillets (1 lb)'][2],
+ 'beef_round': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][0],
+ 'beef_round_low': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][1],
+ 'beef_round_high': data_dict['Beef Round (1 lb) (or Equivalent Back Leg Red Meat)'][2],
+ 'apples': data_dict['Apples (1 lb)'][0],
+ 'apples_low': data_dict['Apples (1 lb)'][1],
+ 'apples_high': data_dict['Apples (1 lb)'][2],
+ 'bananas': data_dict['Banana (1 lb)'][0],
+ 'bananas_low': data_dict['Banana (1 lb)'][1],
+ 'bananas_high': data_dict['Banana (1 lb)'][2],
+ 'oranges': data_dict['Oranges (1 lb)'][0],
+ 'oranges_low': data_dict['Oranges (1 lb)'][1],
+ 'oranges_high': data_dict['Oranges (1 lb)'][2],
+ 'tomato': data_dict['Tomato (1 lb)'][0],
+ 'tomato_low': data_dict['Tomato (1 lb)'][1],
+ 'tomato_high': data_dict['Tomato (1 lb)'][2],
+ 'potato': data_dict['Potato (1 lb)'][0],
+ 'potato_low': data_dict['Potato (1 lb)'][1],
+ 'potato_high': data_dict['Potato (1 lb)'][2],
+ 'onion': data_dict['Onion (1 lb)'][0],
+ 'onion_low': data_dict['Onion (1 lb)'][1],
+ 'onion_high': data_dict['Onion (1 lb)'][2],
+ 'lettuce': data_dict['Lettuce (1 head)'][0],
+ 'lettuce_low': data_dict['Lettuce (1 head)'][1],
+ 'lettuce_high': data_dict['Lettuce (1 head)'][2],
+ 'water_market': data_dict['Water (1.5 liter bottle)'][0],
+ 'water_market_low': data_dict['Water (1.5 liter bottle)'][1],
+ 'water_market_high': data_dict['Water (1.5 liter bottle)'][2],
+ 'wine': data_dict['Bottle of Wine (Mid-Range)'][0],
+ 'wine_low': data_dict['Bottle of Wine (Mid-Range)'][1],
+ 'wine_high': data_dict['Bottle of Wine (Mid-Range)'][2],
+ 'domestic_beer_market': data_dict['Domestic Beer (0.5 liter bottle)'][0],
+ 'domestic_beer_market_low': data_dict['Domestic Beer (0.5 liter bottle)'][1],
+ 'domestic_beer_market_high': data_dict['Domestic Beer (0.5 liter bottle)'][2],
+ 'imported_beer_market': data_dict['Imported Beer (12 oz small bottle)'][3],
+ 'imported_beer_market_low': data_dict['Imported Beer (12 oz small bottle)'][4],
+ 'imported_beer_market_high': data_dict['Imported Beer (12 oz small bottle)'][5],
+ 'cigarettes': data_dict['Cigarettes 20 Pack (Marlboro)'][0],
+ 'cigarettes_low': data_dict['Cigarettes 20 Pack (Marlboro)'][1],
+ 'cigarettes_high': data_dict['Cigarettes 20 Pack (Marlboro)'][2],
+ 'one_way_ticket': data_dict['One-way Ticket (Local Transport)'][0],
+ 'one_way_ticket_low': data_dict['One-way Ticket (Local Transport)'][1],
+ 'one_way_ticket_high': data_dict['One-way Ticket (Local Transport)'][2],
+ 'monthly_pass': data_dict['Monthly Pass (Regular Price)'][0],
+ 'monthly_pass_low': data_dict['Monthly Pass (Regular Price)'][1],
+ 'monthly_pass_high': data_dict['Monthly Pass (Regular Price)'][2],
+ 'taxi_start': data_dict['Taxi Start (Normal Tariff)'][0],
+ 'taxi_start_low': data_dict['Taxi Start (Normal Tariff)'][1],
+ 'taxi_start_high': data_dict['Taxi Start (Normal Tariff)'][2],
+ 'taxi_per_mile': data_dict['Taxi 1 mile (Normal Tariff)'][0],
+ 'taxi_per_mile_low': data_dict['Taxi 1 mile (Normal Tariff)'][1],
+ 'taxi_per_mile_high': data_dict['Taxi 1 mile (Normal Tariff)'][2],
+ 'taxi_per_hour': data_dict['Taxi 1hour Waiting (Normal Tariff)'][0],
+ 'taxi_per_hour_low': data_dict['Taxi 1hour Waiting (Normal Tariff)'][1],
+ 'taxi_per_hour_high': data_dict['Taxi 1hour Waiting (Normal Tariff)'][2],
+ 'gasoline': data_dict['Gasoline (1 gallon)'][0],
+ 'gasoline_low': data_dict['Gasoline (1 gallon)'][1],
+ 'gasoline_high': data_dict['Gasoline (1 gallon)'][2],
+ 'volkswagen_golf': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][0],
+ 'volkswagen_golf_low': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][1],
+ 'volkswagen_golf_high': data_dict['Volkswagen Golf 1.4 90 KW Trendline (Or Equivalent New Car)'][2],
+ 'toyota_corolla': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][0],
+ 'toyota_corolla_low': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][1],
+ 'toyota_corolla_high': data_dict['Toyota Corolla Sedan 1.6l 97kW Comfort (Or Equivalent New Car)'][2],
+ 'basic_utilities': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][0],
+ 'basic_utilities_low': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][1],
+ 'basic_utilities_high': data_dict['Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment'][2],
+ 'prepaid_mobile_per_minute': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][0],
+ 'prepaid_mobile_per_minute_low': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][1],
+ 'prepaid_mobile_per_minute_high': data_dict['1 min. of Prepaid Mobile Tariff Local (No Discounts or Plans)'][2],
+ 'internet': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][0],
+ 'internet_low': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][1],
+ 'internet_high': data_dict['Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)'][2],
+ 'fitness_club': data_dict['Fitness Club, Monthly Fee for 1 Adult'][0],
+ 'fitness_club_low': data_dict['Fitness Club, Monthly Fee for 1 Adult'][1],
+ 'fitness_club_high': data_dict['Fitness Club, Monthly Fee for 1 Adult'][2],
+ 'tennis_court_per_hour': data_dict['Tennis Court Rent (1 Hour on Weekend)'][0],
+ 'tennis_court_per_hour_low': data_dict['Tennis Court Rent (1 Hour on Weekend)'][1],
+ 'tennis_court_per_hour_high': data_dict['Tennis Court Rent (1 Hour on Weekend)'][2],
+ 'cinema': data_dict['Cinema, International Release, 1 Seat'][0],
+ 'cinema_low': data_dict['Cinema, International Release, 1 Seat'][1],
+ 'cinema_high': data_dict['Cinema, International Release, 1 Seat'][2],
+ 'preschool_monthly': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][0],
+ 'preschool_monthly_low': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][1],
+ 'preschool_monthly_high': data_dict['Preschool (or Kindergarten), Full Day, Private, Monthly for 1 Child'][2],
+ 'primary_school_yearly': data_dict['International Primary School, Yearly for 1 Child'][0],
+ 'primary_school_yearly_low': data_dict['International Primary School, Yearly for 1 Child'][1],
+ 'primary_school_yearly_high': data_dict['International Primary School, Yearly for 1 Child'][2],
+ 'jeans': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][0],
+ 'jeans_low': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][1],
+ 'jeans_high': data_dict['1 Pair of Jeans (Levis 501 Or Similar)'][2],
+ 'dress': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][0],
+ 'dress_low': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][1],
+ 'dress_high': data_dict['1 Summer Dress in a Chain Store (Zara, H&M, ...)'][2],
+ 'running_shoes': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][0],
+ 'running_shoes_low': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][1],
+ 'running_shoes_high': data_dict['1 Pair of Nike Running Shoes (Mid-Range)'][2],
+ 'business_shoes': data_dict['1 Pair of Men Leather Business Shoes'][0],
+ 'business_shoes_low': data_dict['1 Pair of Men Leather Business Shoes'][1],
+ 'business_shoes_high': data_dict['1 Pair of Men Leather Business Shoes'][2],
+ 'rent_one_br_city_center': data_dict['Apartment (1 bedroom) in City Centre'][0],
+ 'rent_one_br_city_center_low': data_dict['Apartment (1 bedroom) in City Centre'][1],
+ 'rent_one_br_city_center_high': data_dict['Apartment (1 bedroom) in City Centre'][2],
+ 'rent_one_br_outside_center': data_dict['Apartment (1 bedroom) Outside of Centre'][0],
+ 'rent_one_br_outside_center_low': data_dict['Apartment (1 bedroom) Outside of Centre'][1],
+ 'rent_one_br_outside_center_high': data_dict['Apartment (1 bedroom) Outside of Centre'][2],
+ 'rent_three_br_city_center': data_dict['Apartment (3 bedrooms) in City Centre'][0],
+ 'rent_three_br_city_center_low': data_dict['Apartment (3 bedrooms) in City Centre'][1],
+ 'rent_three_br_city_center_high': data_dict['Apartment (3 bedrooms) in City Centre'][2],
+ 'rent_three_br_outside_center': data_dict['Apartment (3 bedrooms) Outside of Centre'][0],
+ 'rent_three_br_outside_center_low': data_dict['Apartment (3 bedrooms) Outside of Centre'][1],
+ 'rent_three_br_outside_center_high': data_dict['Apartment (3 bedrooms) Outside of Centre'][2],
+ 'buy_city_center_per_sqft': data_dict['Price per Square Feet to Buy Apartment in City Centre'][0],
+ 'buy_city_center_per_sqft_low': data_dict['Price per Square Feet to Buy Apartment in City Centre'][1],
+ 'buy_city_center_per_sqft_high': data_dict['Price per Square Feet to Buy Apartment in City Centre'][2],
+ 'buy_outside_center_per_sqft': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][0],
+ 'buy_outside_center_per_sqft_low': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][1],
+ 'buy_outside_center_per_sqft_high': data_dict['Price per Square Feet to Buy Apartment Outside of Centre'][2],
+ 'monthly_salary': data_dict['Average Monthly Net Salary (After Tax)'][0],
+ 'mortgage_rate': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][0],
+ 'mortgage_rate_low': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][1],
+ 'mortgage_rate_high': data_dict['Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate'][2],
+ 'contributions': contributions,
+ 'contributors': contributors,
+ }
+
--
2.33.0