Trial of clustering on OpenMap and MasterMap data¶
This is an explorative notebook comparing simple morphometric clustering of Edinburgh based on OS OpenMap and OS MasterMap building footprints.
Note that since OS MasterMap license does not allow a release of derived open data products, OS MasterMap cannot be used in the final version of spatial signatures anyway. Consider this notebook a sandbox to play at.
import geopandas as gpd
import momepy as mm
mastermap = gpd.read_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh.pq')
openmap = gpd.read_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_om.pq')
mastermap_tess = gpd.read_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_mm_tess.pq')
openmap_tess = gpd.read_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_om_tess.pq')
Get small sample of morphometric data¶
We first measure a small sample of morphometric characters.
mastermap['area'] = mastermap.area
openmap['area'] = openmap.area
openmap_tess['area_t'] = openmap_tess.area
mastermap_tess['area_t'] = mastermap_tess.area
openmap_tess['peri_t'] = openmap_tess.length
mastermap_tess['peri_t'] = mastermap_tess.length
openmap_tess['eri'] = mm.EquivalentRectangularIndex(openmap_tess, 'area_t', 'peri_t').series
mastermap_tess['eri'] = mm.EquivalentRectangularIndex(mastermap_tess, 'area_t', 'peri_t').series
openmap_tess['car'] = mm.AreaRatio(openmap_tess, openmap, 'area_t', 'area', 'uID').series
mastermap_tess['car'] = mm.AreaRatio(mastermap_tess, mastermap, 'area_t', 'area', 'uID').series
mastermap_tess['xID'] = range(len(mastermap_tess))
openmap_tess['xID'] = range(len(openmap_tess))
openmap = openmap.merge(openmap_tess[['uID', 'xID']], on='uID', how='left')
mastermap = mastermap.merge(mastermap_tess[['uID', 'xID']], on='uID', how='left')
from libpysal.weights import Queen
om_w1 = Queen.from_dataframe(openmap_tess, idVariable='xID')
mm_w1 = Queen.from_dataframe(mastermap_tess, idVariable='xID')
/opt/conda/lib/python3.7/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected:
There are 6 disconnected components.
There are 5 islands with ids: 2497, 16639, 54931, 54961, 98954.
warnings.warn(message)
om_w3 = mm.sw_high(k=3, weights=om_w1)
mm_w3 = mm.sw_high(k=3, weights=mm_w1)
openmap_tess.columns
Index(['uID', 'geometry', 'eID', 'area_t', 'peri_t', 'eri', 'car', 'xID'], dtype='object')
Let’s get some contextual information to use as a clustering input. In this case, interquartile range of each character within 3 topological steps on tessellation.
openmap_tess['area_t_IQR_3steps'] = mm.Range(openmap_tess, 'area_t', om_w3, 'xID', rng=(25, 75)).series
100%|██████████| 74736/74736 [00:46<00:00, 1594.96it/s]
openmap_tess['peri_t_IQR_3steps'] = mm.Range(openmap_tess, 'peri_t', om_w3, 'xID', rng=(25, 75)).series
openmap_tess['eri_IQR_3steps'] = mm.Range(openmap_tess, 'eri', om_w3, 'xID', rng=(25, 75)).series
openmap_tess['car_IQR_3steps'] = mm.Range(openmap_tess, 'car', om_w3, 'xID', rng=(25, 75)).series
100%|██████████| 74736/74736 [00:46<00:00, 1600.21it/s]
100%|██████████| 74736/74736 [00:46<00:00, 1600.31it/s]
100%|██████████| 74736/74736 [00:46<00:00, 1591.84it/s]
mastermap_tess['area_t_IQR_3steps'] = mm.Range(mastermap_tess, 'area_t', mm_w3, 'xID', rng=(25, 75)).series
mastermap_tess['peri_t_IQR_3steps'] = mm.Range(mastermap_tess, 'peri_t', mm_w3, 'xID', rng=(25, 75)).series
mastermap_tess['eri_IQR_3steps'] = mm.Range(mastermap_tess, 'eri', mm_w3, 'xID', rng=(25, 75)).series
mastermap_tess['car_IQR_3steps'] = mm.Range(mastermap_tess, 'car', mm_w3, 'xID', rng=(25, 75)).series
100%|██████████| 234373/234373 [02:26<00:00, 1599.08it/s]
100%|██████████| 234373/234373 [02:26<00:00, 1599.26it/s]
100%|██████████| 234373/234373 [02:26<00:00, 1601.02it/s]
100%|██████████| 234373/234373 [02:26<00:00, 1599.36it/s]
K-means clustering¶
To keep the comparison simple, we’ll use k-means with few arbitrary k
options.
OpenMap¶
from sklearn import preprocessing
from sklearn.cluster import KMeans
Before clustering, we need to normalize data.
scaler = preprocessing.StandardScaler()
cols = [col for col in openmap_tess.columns if 'IQR' in col]
cols
['area_t_IQR_3steps', 'peri_t_IQR_3steps', 'eri_IQR_3steps', 'car_IQR_3steps']
om_data = scaler.fit_transform(openmap_tess[cols].fillna(0))
kmeans = KMeans(n_clusters=5, random_state=0).fit(om_data)
openmap_tess['cluster'] = kmeans.labels_
openmap_tess['centroid'] = openmap_tess.centroid
openmap_tess.set_geometry('centroid').plot('cluster', categorical=True, figsize=(18, 18), legend=True, markersize=.2)
<AxesSubplot:>
om_data = scaler.fit_transform(openmap_tess[cols].fillna(0))
kmeans = KMeans(n_clusters=2, random_state=0).fit(om_data)
openmap_tess['cluster2'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('cluster2', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Set1')
<AxesSubplot:>
kmeans = KMeans(n_clusters=3, random_state=0).fit(om_data)
openmap_tess['cluster3'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('cluster3', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
kmeans = KMeans(n_clusters=4, random_state=0).fit(om_data)
openmap_tess['cluster4'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('cluster4', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
kmeans = KMeans(n_clusters=8, random_state=0).fit(om_data)
openmap_tess['cluster8'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('cluster8', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
|### MasterMap
And the same for Master Map.
om_data = scaler.fit_transform(mastermap_tess[cols].fillna(0))
kmeans = KMeans(n_clusters=5, random_state=0).fit(om_data)
mastermap_tess['cluster'] = kmeans.labels_
mastermap_tess['centroid'] = mastermap_tess.centroid
mastermap_tess.set_geometry('centroid').plot('cluster', categorical=True, figsize=(18, 18), legend=True, markersize=.2)
<AxesSubplot:>
kmeans = KMeans(n_clusters=2, random_state=0).fit(om_data)
mastermap_tess['cluster2'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('cluster2', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Set1')
<AxesSubplot:>
kmeans = KMeans(n_clusters=3, random_state=0).fit(om_data)
mastermap_tess['cluster3'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('cluster3', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
kmeans = KMeans(n_clusters=4, random_state=0).fit(om_data)
mastermap_tess['cluster4'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('cluster4', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Set1')
<AxesSubplot:>
kmeans = KMeans(n_clusters=8, random_state=0).fit(om_data)
mastermap_tess['cluster8'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('cluster8', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
mastermap_tess.to_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_mm_tess_clus.pq')
openmap_tess.to_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_om_tess_clus.pq')
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata. This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec
This metadata specification does not yet make stability promises. We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.
To further ignore this warning, you can do:
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
"""Entry point for launching an IPython kernel.
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata. This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec
This metadata specification does not yet make stability promises. We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.
To further ignore this warning, you can do:
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
MasterMap tessellation is more granular and resulting clusters are less geographically defined. OpenMap ones are easier to interpret and understand.
Using Distance Band¶
We can try the same with 500m distance band instead of 3 topological steps.
mastermap_tess = gpd.read_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_mm_tess_clus.pq')
openmap_tess = gpd.read_parquet('../../urbangrammar_samba/OS_MasterMap_Buildings_sample/edinburgh_om_tess_clus.pq')
mastermap_tess
uID | geometry | eID | area_t | peri_t | eri | car | xID | area_t_IQR_3steps | peri_t_IQR_3steps | eri_IQR_3steps | car_IQR_3steps | area | cluster | centroid | cluster2 | cluster3 | cluster4 | cluster8 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 157286.0 | POLYGON ((323326.909 676984.873, 323325.867 67... | 2 | 873.804255 | 225.934777 | 0.562063 | 0.180165 | 0 | 666.197748 | 100.129941 | 0.371123 | 0.079977 | 157.428753 | 0 | POINT (323326.775 676992.956) | 0 | 1 | 3 | 6 |
1 | 157303.0 | POLYGON ((323475.057 676961.525, 323473.298 67... | 2 | 802.289855 | 176.174746 | 0.665618 | 0.198437 | 1 | 350.832797 | 47.721229 | 0.106494 | 0.090254 | 159.203951 | 2 | POINT (323484.885 676978.164) | 0 | 0 | 1 | 2 |
2 | 157304.0 | POLYGON ((323471.159 676973.522, 323472.390 67... | 2 | 724.075620 | 170.645266 | 0.673583 | 0.221255 | 2 | 433.027807 | 53.880101 | 0.097254 | 0.088632 | 160.205504 | 2 | POINT (323463.722 676979.828) | 0 | 0 | 1 | 2 |
3 | 157305.0 | POLYGON ((323432.776 676953.938, 323430.273 67... | 2 | 1439.420846 | 214.100873 | 0.727294 | 0.130804 | 3 | 480.652087 | 49.794723 | 0.111968 | 0.071090 | 188.282499 | 2 | POINT (323442.141 676990.387) | 0 | 0 | 1 | 2 |
4 | 157306.0 | POLYGON ((323414.241 676950.610, 323411.606 67... | 2 | 1123.509438 | 241.121390 | 0.588882 | 0.166086 | 4 | 468.249654 | 44.169215 | 0.099367 | 0.057433 | 186.598752 | 2 | POINT (323418.964 676980.755) | 0 | 0 | 1 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
234632 | NaN | POLYGON ((334889.340 661707.240, 334904.310 66... | 4411 | 598.206900 | 105.744247 | 1.045748 | NaN | 234368 | 647.961908 | 74.645377 | 0.240902 | NaN | NaN | 0 | POINT (334908.250 661691.303) | 0 | 0 | 1 | 0 |
234633 | NaN | POLYGON ((334959.150 671384.620, 334954.920 67... | 4412 | 335.125800 | 80.540303 | 0.946900 | NaN | 234369 | 1994.516597 | 118.467083 | 0.159378 | NaN | NaN | 0 | POINT (334946.846 671370.363) | 0 | 0 | 1 | 0 |
234634 | NaN | POLYGON ((334961.130 665328.300, 334975.440 66... | 4413 | 268.215000 | 66.224449 | 0.990422 | NaN | 234370 | 639.680357 | 132.029742 | 0.243493 | NaN | NaN | 0 | POINT (334974.263 665328.501) | 0 | 0 | 1 | 0 |
234635 | NaN | POLYGON ((335051.250 661914.606, 335054.630 66... | 4414 | 10.646340 | 15.569358 | 0.840680 | NaN | 234371 | 801.079377 | 89.024253 | 0.145541 | NaN | NaN | 2 | POINT (335052.377 661911.624) | 0 | 0 | 1 | 0 |
234636 | NaN | POLYGON ((317295.320 673640.330, 317301.780 67... | 4415 | 60.414850 | 37.445401 | 0.830310 | NaN | 234372 | 43034.147237 | 854.079116 | 0.111146 | NaN | NaN | 3 | POINT (317303.187 673642.388) | 1 | 2 | 0 | 3 |
234637 rows × 19 columns
from libpysal.weights import DistanceBand
# Generating new ID as the xID got messed up along the way. ehm.
openmap_tess['dbID'] = range(len(openmap_tess))
om500 = DistanceBand.from_dataframe(openmap_tess, 500, geom_col='centroid', ids='dbID')
/opt/conda/lib/python3.7/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected:
There are 70 disconnected components.
There are 57 islands with ids: 98, 284, 285, 287, 288, 578, 579, 719, 725, 726, 743, 744, 756, 787, 797, 805, 825, 845, 8778, 22150, 22162, 22169, 22175, 22191, 22192, 22210, 22213, 22223, 22224, 22237, 22241, 22242, 22259, 22268, 22282, 22285, 22290, 22292, 22294, 22295, 22296, 22297, 22298, 22299, 22300, 22301, 22305, 22306, 22310, 25018, 25038, 25054, 25065, 25092, 25150, 55446, 73772.
warnings.warn(message)
# Beware, this requires a ton of memory to compute
mastermap_tess['dbID'] = range(len(mastermap_tess))
mm500 = DistanceBand.from_dataframe(mastermap_tess, 500, geom_col='centroid', ids='dbID')
/opt/conda/lib/python3.7/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected:
There are 56 disconnected components.
There are 46 islands with ids: 449, 558, 1211, 1214, 1215, 1217, 2354, 2492, 2875, 2885, 2908, 2909, 2920, 2933, 2946, 2977, 3073, 3234, 3250, 74272, 74276, 74292, 74302, 74304, 74310, 74325, 74350, 74361, 74363, 74364, 74368, 74369, 74428, 74442, 74547, 74592, 74603, 74645, 74646, 82031, 82096, 82174, 82237, 196625, 229787, 233749.
warnings.warn(message)
openmap_tess['area_t_IQR_500'] = mm.Range(openmap_tess, 'area_t', om500, 'dbID', rng=(25, 75)).series
openmap_tess['peri_t_IQR_500'] = mm.Range(openmap_tess, 'peri_t', om500, 'dbID', rng=(25, 75)).series
openmap_tess['eri_IQR_500'] = mm.Range(openmap_tess, 'eri', om500, 'dbID', rng=(25, 75)).series
openmap_tess['car_IQR_500'] = mm.Range(openmap_tess, 'car', om500, 'dbID', rng=(25, 75)).series
100%|██████████| 74884/74884 [00:56<00:00, 1331.96it/s]
100%|██████████| 74884/74884 [00:56<00:00, 1331.43it/s]
100%|██████████| 74884/74884 [00:56<00:00, 1333.95it/s]
100%|██████████| 74884/74884 [00:56<00:00, 1328.20it/s]
mastermap_tess['area_t_IQR_500'] = mm.Range(mastermap_tess, 'area_t', mm500, 'dbID', rng=(25, 75)).series
mastermap_tess['peri_t_IQR_500'] = mm.Range(mastermap_tess, 'peri_t', mm500, 'dbID', rng=(25, 75)).series
mastermap_tess['eri_IQR_500'] = mm.Range(mastermap_tess, 'eri', mm500, 'dbID', rng=(25, 75)).series
mastermap_tess['car_IQR_500'] = mm.Range(mastermap_tess, 'car', mm500, 'dbID', rng=(25, 75)).series
100%|██████████| 234637/234637 [04:18<00:00, 906.35it/s]
100%|██████████| 234637/234637 [04:22<00:00, 894.79it/s]
100%|██████████| 234637/234637 [04:21<00:00, 897.58it/s]
100%|██████████| 234637/234637 [04:23<00:00, 889.07it/s]
OpenMap distance K means¶
cols = [col for col in openmap_tess.columns if 'IQR_500' in col]
data = scaler.fit_transform(openmap_tess[cols].fillna(0))
kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
openmap_tess['dist_cluster2'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('dist_cluster2', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Set1')
<AxesSubplot:>
kmeans = KMeans(n_clusters=5, random_state=0).fit(data)
openmap_tess['dist_cluster5'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('dist_cluster5', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
kmeans = KMeans(n_clusters=8, random_state=0).fit(data)
openmap_tess['dist_cluster8'] = kmeans.labels_
openmap_tess.set_geometry('centroid').plot('dist_cluster8', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
MasterMap distance K means¶
data = scaler.fit_transform(mastermap_tess[cols].fillna(0))
kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
mastermap_tess['dist_cluster2'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('dist_cluster2', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Set1')
<AxesSubplot:>
kmeans = KMeans(n_clusters=5, random_state=0).fit(data)
mastermap_tess['dist_cluster5'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('dist_cluster5', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
kmeans = KMeans(n_clusters=8, random_state=0).fit(data)
mastermap_tess['dist_cluster8'] = kmeans.labels_
mastermap_tess.set_geometry('centroid').plot('dist_cluster8', categorical=True, figsize=(18, 18), legend=True, markersize=.2, cmap='Accent')
<AxesSubplot:>
Again, OpenMap results seem to be easier to understand from small to large k
. However, MasterMap results for k=8
look quite reasonable as well. Not that it matters much :).