diff --git a/etl/README.md b/etl/README.md index 0a5ea2505..06c8d2b19 100644 --- a/etl/README.md +++ b/etl/README.md @@ -8,6 +8,7 @@ The scripts in this directory are used to extract, transform and load (ETL) the - :penguin: [Making data available to Ubuntu](#penguin-making-data-available-to-ubuntu) - :new_moon: [Creating a Colouring London database from scratch](#new_moon-creating-a-colouring-london-database-from-scratch) - :full_moon: [Updating the Colouring London database with new OS data](#full_moon-updating-the-colouring-london-database-with-new-os-data) +- ⚡ [Adding EPC data](#-adding-epc-data) # :arrow_down: Downloading Ordnance Survey data @@ -175,4 +176,32 @@ Mark buildings with geometries not present in the update as demolished. **TODO:** Update this after PR [#794](https://github.com/colouring-cities/colouring-london/pull/794) -Run the Colouring London [deployment scripts](https://github.com/colouring-cities/colouring-london-config#deployment). \ No newline at end of file +Run the Colouring London [deployment scripts](https://github.com/colouring-cities/colouring-london-config#deployment). + +# ⚡ Adding EPC data + +Download the EPC data. + +``` +git clone https://github.com/iagw/colouring-cities +``` + +Copy `gla-epc-subset.zstd.parquet` into `colouring-london/etl`. + +``` +cp /path/to/gla-epc-subset.zstd.parquet /path/to/colouring-london/etl +``` + +Run a conversion to csv (make sure you have an up to date Python 3 environment and pip installation and run `pip install -r requirements.txt` first if you haven't already). + +``` +python clean_epc_data.py +``` + +This should have created a csv in the `/etl` dir called `'gla-epc-subset.csv'`. 
"""Convert the GLA EPC parquet subset into a CSV that Postgres can load.

Instructions
1. Download the GLA EPC data from GitHub in parquet format:
   github.com/iagw/colouring-cities/blob/master/gla-epc-subset.zstd.parquet
2. Place the file in `colouring-london/etl`
3. Run this script to convert it to CSV for easy loading into Postgres
"""
import numpy as np
import pandas as pd
from epc_cleaning_functions import floor_level_to_int, construction_to_int

gla = pd.read_parquet('gla-epc-subset.zstd.parquet')

# Blank out invalid CURRENT_ENERGY_RATING values. NaN is used instead of
# None because some pandas versions treat `replace(scalar, None)` as
# "no replacement value given" and pad-fill from the previous row instead.
gla = gla.replace('INVALID!', np.nan)

# Normalise the free-text columns to integers (or None when unparseable).
gla['FLOOR_LEVEL'] = gla['FLOOR_LEVEL'].apply(floor_level_to_int)
gla['CONSTRUCTION_AGE_BAND'] = (
    gla['CONSTRUCTION_AGE_BAND'].apply(construction_to_int)
)

# Drop rows with missing or non-finite values. Infinities are converted to
# NaN explicitly rather than through the `mode.use_inf_as_null` option,
# which is deprecated and no longer exists in recent pandas releases.
gla = gla.replace([np.inf, -np.inf], np.nan).dropna()

# After dropna the columns may still be float typed; store them as ints.
for column in ('CONSTRUCTION_AGE_BAND', 'FLOOR_LEVEL', 'UPRN'):
    gla[column] = gla[column].astype(int)

# The DataFrame index is written as the first CSV column.
gla.to_csv('gla-epc-subset.csv')
def floor_level_to_int(lvl):
    """Convert differently formatted floor level values to ints.

    Some assumptions are made so that every building populated from EPC
    data gets an int floor automatically: any 'ground' text is 0, any
    'basement' text is -1, 'mid floor' is 1 and 'top floor' is 2.
    Incorrect assumptions can be updated later via the Colouring London
    interface.

    Args:
        lvl: The raw floor level: None, an int, a float, or a string such
            as '01', '3rd', '20+', '21st or above' or 'Ground'.

    Returns:
        The floor level as an int, or None when the value is missing
        (None / NaN) or cannot be interpreted.
    """
    if lvl is None:
        return None
    if isinstance(lvl, int):
        return lvl
    if isinstance(lvl, float):
        # NaN and infinities are not integral, so they fall through to None.
        return int(lvl) if lvl.is_integer() else None
    if not isinstance(lvl, str):
        return None

    text = lvl.replace('or above', '').replace('+', '').strip()
    try:
        return int(text)
    except ValueError:
        pass

    lowered = text.lower()
    if 'ground' in lowered:
        return 0
    if 'basement' in lowered:
        return -1
    if lowered == 'mid floor':
        return 1
    if lowered == 'top floor':
        return 2

    if len(text) > 1 and text[0] == '0':
        # Zero-padded value with trailing text: keep the digit after the 0.
        candidate = text[1]
    else:
        candidate = text
        for ordinal in ('st', 'nd', 'rd', 'th'):
            candidate = candidate.replace(ordinal, '')

    try:
        return int(candidate)
    except ValueError:
        # Unrecognised text (e.g. 'NODATA!', 'first', '') is not a floor.
        return None


def construction_to_int(year):
    """Convert an EPC construction age band to a single representative year.

    Args:
        year: The raw age band: None, an int, a float, or a string such as
            'England and Wales: before 1900',
            'England and Wales: 1991-1996',
            'England and Wales: 2007 onwards' or '1950'.

    Returns:
        An int year, or None when the value is missing (None / NaN), is
        one of the 'NO DATA!' / 'INVALID!' markers, or cannot be parsed.
        'before X' and 'X onwards' give X; a range 'A-B' gives the midpoint
        rounded with the built-in round() (ties go to the even year).
    """
    if year is None:
        return None
    if isinstance(year, int):
        return year
    if isinstance(year, float):
        # NaN and infinities are not integral, so they fall through to None.
        return int(year) if year.is_integer() else None
    if not isinstance(year, str):
        return None
    if year in ('NO DATA!', 'INVALID!'):
        return None

    try:
        if 'before' in year:
            return int(year.split('before ')[-1])
        if '-' in year:
            bounds = [float(part) for part in year.split(' ')[-1].split('-')]
            return round(sum(bounds) / 2)
        if 'onwards' in year:
            return int(year.split('onwards')[0].split()[-1])
        return int(year)
    except (ValueError, IndexError):
        # Free text that matches none of the known formats.
        return None
'1950'] + expected = [1900, None, 1994, None, 2007, None, 1950] + for date, ex in zip(test_dates, expected): + assert construction_to_int(date) == ex \ No newline at end of file