Second-level signatures

Some signature types are further subdivided to get a more fine-grained classification. This notebook generates second-level signatures for cluster 9 (city centres) and cluster 2 (residential neighbourhoods).

The method mirrors top-level clustering.

import dask.dataframe
import pandas as pd
import numpy as np

from clustergram import Clustergram
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import urbangrammar_graphics as ugg
import dask_geopandas
from utils.dask_geopandas import dask_dissolve
standardized_form = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index('hindex')
stand_fn = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")
data = dask.dataframe.multi.concat([standardized_form, stand_fn], axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)
data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])
%time data = data.compute()
CPU times: user 2min 41s, sys: 1min 29s, total: 4min 11s
Wall time: 2min 46s
# Top-level (k = 10) signature labels for every tessellation cell.
labels = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq")

Sub-cluster cluster 9 - city centres

# Subset the feature matrix to cells assigned to top-level cluster 9 and
# fit a clustergram across k = 1..24.
# NOTE(review): n_init=1000 here vs n_init=100 for cluster 2 below —
# presumably a cost trade-off between subset sizes; confirm it is intentional.
data9 = data.loc[labels.kmeans10gb == 9]
cgram = Clustergram(range(1, 25), method='kmeans', n_init=1000, random_state=42)
cgram.fit(data9)
K=1 fitted in 138.56104564666748 seconds.
K=2 fitted in 250.19275641441345 seconds.
K=3 fitted in 496.8422074317932 seconds.
K=4 fitted in 516.2055711746216 seconds.
K=5 fitted in 524.4590675830841 seconds.
K=6 fitted in 557.9513728618622 seconds.
K=7 fitted in 614.9245867729187 seconds.
K=8 fitted in 675.608170747757 seconds.
K=9 fitted in 727.5412681102753 seconds.
K=10 fitted in 805.7417376041412 seconds.
K=11 fitted in 864.23623919487 seconds.
K=12 fitted in 940.5778124332428 seconds.
K=13 fitted in 1001.939873456955 seconds.
K=14 fitted in 1127.1891009807587 seconds.
K=15 fitted in 1266.9384486675262 seconds.
K=16 fitted in 1347.8543231487274 seconds.
K=17 fitted in 1421.443199634552 seconds.
K=18 fitted in 1487.2952530384064 seconds.
K=19 fitted in 1551.0777575969696 seconds.
K=20 fitted in 1595.5207443237305 seconds.
K=21 fitted in 1636.6767675876617 seconds.
K=22 fitted in 1664.2299811840057 seconds.
K=23 fitted in 1723.266358613968 seconds.
K=24 fitted in 1786.8603992462158 seconds.
# Bokeh setup for rendering the interactive clustergram inline.
# (ugg re-imported so this section can run standalone.)
import urbangrammar_graphics as ugg
from bokeh.io import output_notebook
from bokeh.plotting import show

output_notebook()
Loading BokehJS ...
# Interactive clustergram showing how observations flow between k values.
fig = cgram.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
)
show(fig)
# Save the interactive clustergram as a standalone HTML file.
from bokeh.plotting import figure, output_file, save

output_file(filename="../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9.html", title="clustergram_cl9")
save(fig)
'/home/jovyan/work/urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9.html'
# Static (matplotlib) rendering of the same clustergram.
cgram.plot(
    figsize=(12, 8),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
)
<AxesSubplot:xlabel='Number of clusters (k)', ylabel='PCA weighted mean of the clusters'>
../_images/level2_ss_11_1.png
# Silhouette score per k (higher is better).
cgram.silhouette_score().plot()
<AxesSubplot:>
../_images/level2_ss_12_1.png
# Calinski-Harabasz score per k (higher is better).
cgram.calinski_harabasz_score().plot()
<AxesSubplot:>
../_images/level2_ss_13_1.png
# Davies-Bouldin score per k (lower is better).
cgram.davies_bouldin_score().plot()
<AxesSubplot:>
../_images/level2_ss_14_1.png
# Persist every clustering solution; parquet requires string column names,
# so stringify them on a throwaway copy.
to_save = cgram.labels.copy()
to_save.columns = to_save.columns.astype("str")
to_save.to_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq")

# Working copy keeps the integer column names (one column per k) but aligns
# the index with the clustered observations.
labels = cgram.labels.copy()
labels.index = data9.index
labels
1 2 3 4 5 6 7 8 9 10 ... 15 16 17 18 19 20 21 22 23 24
hindex
c000e107423t0006 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c000e107423t0007 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c000e107423t0011 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c000e107423t0012 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c000e107423t0014 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e622606t0000 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c102e622610t0001 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c102e622621t0000 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c102e622622t0000 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13
c102e622628t0001 0 0 0 0 0 0 0 0 0 0 ... 14 4 10 2 11 6 6 18 22 13

113644 rows × 24 columns

# Size of each sub-cluster in the chosen k = 9 solution.
labels[9].value_counts()
0    86380
2    21760
4     3739
1     1390
5      264
7       98
8        8
3        3
6        2
Name: 9, dtype: int64
# Attach the sub-cluster labels to tessellation geometry chunk by chunk,
# keeping only cells that received a label (i.e. belong to cluster 9).
centres = []
for chunk_id in range(103):
    tess = gpd.read_parquet(
        f"../../urbangrammar_samba/spatial_signatures/tessellation/tess_{chunk_id}.pq",
        columns=["tessellation", "hindex"],
    ).set_index("hindex")
    tess = tess.merge(labels, how="left", left_index=True, right_index=True)
    centres.append(tess[tess[2].notna()])
    print(f"Chunk {chunk_id} done.")
Chunk 0 done.
Chunk 1 done.
Chunk 2 done.
Chunk 3 done.
Chunk 4 done.
Chunk 5 done.
Chunk 6 done.
Chunk 7 done.
Chunk 8 done.
Chunk 9 done.
Chunk 10 done.
Chunk 11 done.
Chunk 12 done.
Chunk 13 done.
Chunk 14 done.
Chunk 15 done.
Chunk 16 done.
Chunk 17 done.
Chunk 18 done.
Chunk 19 done.
Chunk 20 done.
Chunk 21 done.
Chunk 22 done.
Chunk 23 done.
Chunk 24 done.
Chunk 25 done.
Chunk 26 done.
Chunk 27 done.
Chunk 28 done.
Chunk 29 done.
Chunk 30 done.
Chunk 31 done.
Chunk 32 done.
Chunk 33 done.
Chunk 34 done.
Chunk 35 done.
Chunk 36 done.
Chunk 37 done.
Chunk 38 done.
Chunk 39 done.
Chunk 40 done.
Chunk 41 done.
Chunk 42 done.
Chunk 43 done.
Chunk 44 done.
Chunk 45 done.
Chunk 46 done.
Chunk 47 done.
Chunk 48 done.
Chunk 49 done.
Chunk 50 done.
Chunk 51 done.
Chunk 52 done.
Chunk 53 done.
Chunk 54 done.
Chunk 55 done.
Chunk 56 done.
Chunk 57 done.
Chunk 58 done.
Chunk 59 done.
Chunk 60 done.
Chunk 61 done.
Chunk 62 done.
Chunk 63 done.
Chunk 64 done.
Chunk 65 done.
Chunk 66 done.
Chunk 67 done.
Chunk 68 done.
Chunk 69 done.
Chunk 70 done.
Chunk 71 done.
Chunk 72 done.
Chunk 73 done.
Chunk 74 done.
Chunk 75 done.
Chunk 76 done.
Chunk 77 done.
Chunk 78 done.
Chunk 79 done.
Chunk 80 done.
Chunk 81 done.
Chunk 82 done.
Chunk 83 done.
Chunk 84 done.
Chunk 85 done.
Chunk 86 done.
Chunk 87 done.
Chunk 88 done.
Chunk 89 done.
Chunk 90 done.
Chunk 91 done.
Chunk 92 done.
Chunk 93 done.
Chunk 94 done.
Chunk 95 done.
Chunk 96 done.
Chunk 97 done.
Chunk 98 done.
Chunk 99 done.
Chunk 100 done.
Chunk 101 done.
Chunk 102 done.
# Merge chunk results and simplify geometries in parallel via dask-geopandas.
centres = pd.concat(centres)
ddf = dask_geopandas.from_geopandas(centres.reset_index().rename_geometry("geometry"), npartitions=64)
# Simplify, buffer by a tiny distance (presumably to heal invalid geometry —
# confirm), then simplify again.
ddf.geometry = ddf.simplify(2).buffer(.001).simplify(2)
%time spsig = ddf.compute()
CPU times: user 28 s, sys: 0 ns, total: 28 s
Wall time: 1.97 s
# Parquet requires string column names, so stringify before writing.
spsig.columns = spsig.columns.astype("str")
# Plain string literal: the original used an f-string with no placeholders.
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster9_all_GB_simplified.pq")

Dissolve signatures

# Keep only the chosen solution (k = 9) as a named one-column frame.
cl_labels = labels[9].rename("subclustering_cluster9_k9").to_frame()
# Re-imports so the dissolve section can run standalone (duplicates the
# top-of-notebook imports).
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import urbangrammar_graphics as ugg
import dask_geopandas
from utils.dask_geopandas import dask_dissolve
import warnings 

# Silence geopandas' experimental-parquet warning.
warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
# Inspect the merged geometry + labels table.
centres
tessellation 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
hindex
c000e108672t0004 POLYGON ((353974.785 429026.424, 353976.600 42... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c000e108672t0001 POLYGON ((354010.365 429050.372, 354009.350 42... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c000e108639t0012 POLYGON ((353959.502 429043.658, 353955.858 42... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c000e108639t0001 POLYGON ((353977.093 429056.686, 353975.961 42... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c000e108639t0002 POLYGON ((353956.738 429081.451, 353954.996 42... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e176044t0000 POLYGON ((389299.110 390268.240, 389296.750 39... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c102e176046t0000 POLYGON ((389324.250 390317.000, 389329.880 39... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c102e176047t0000 POLYGON ((389335.500 390318.500, 389335.260 39... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c102e176008t0000 POLYGON ((389628.340 390455.480, 389631.670 39... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0
c102e176032t0000 POLYGON ((389513.950 390446.240, 389511.600 39... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 4.0 10.0 2.0 11.0 6.0 6.0 18.0 22.0 13.0

113644 rows × 25 columns

# Keep only the k = 9 labels plus geometry, dissolve cells sharing a label
# into signature polygons, and explode multi-part results into single parts.
centres = centres.sort_values(9).rename_geometry("geometry")[[9, "geometry"]]
centres.columns = ["clusters", "geometry"]
ddf = dask_geopandas.from_geopandas(centres, npartitions=64)
spsig = dask_dissolve(ddf, by="clusters").compute().reset_index(drop=True).explode()
spsig
clusters geometry
0 0 0.0 POLYGON Z ((90227.273 17620.071 0.000, 90187.3...
1 0.0 POLYGON Z ((151996.490 40926.350 0.000, 151998...
2 0.0 POLYGON Z ((180573.320 62127.626 0.000, 180573...
3 0.0 POLYGON Z ((180593.580 61881.970 0.000, 180582...
4 0.0 POLYGON Z ((180757.583 61748.508 0.000, 180759...
... ... ... ...
8 3 8.0 POLYGON ((323321.005 463795.416, 323319.842 46...
4 8.0 POLYGON ((325929.840 1008792.061, 325927.377 1...
5 8.0 POLYGON ((337804.770 1013422.583, 337800.122 1...
6 8.0 POLYGON ((422304.270 1147826.990, 422296.000 1...
7 8.0 POLYGON ((525396.260 439215.480, 525360.920 43...

2081 rows × 2 columns

# Dissolved sub-signature geometries for cluster 9 (k = 9 solution).
# Plain string literals: the originals used f-strings with no placeholders.
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster9_k9_GB.pq")
# Simplify to shrink the shareable exports.
spsig.geometry = spsig.simplify(2).buffer(.001).simplify(2)
spsig.to_file("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster9_k9_GB_simplified.geojson", driver="GeoJSON")
# NOTE(review): the duplicated "_GB_GB" in this filename looks like a typo
# (cf. the geojson name above); kept as-is since downstream code may read it.
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster9_k9_GB_GB_simplified.pq")

Sub-cluster cluster 2 - neighbourhoods

# Repeat the workflow for top-level cluster 2 (residential neighbourhoods).
labels = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq")
data2 = data.loc[labels.kmeans10gb == 2]
# NOTE(review): n_init=100 here vs 1000 for cluster 9 — presumably a cost
# trade-off for the larger subset; confirm it is intentional.
cgram2 = Clustergram(range(1, 25), method='kmeans', n_init=100, random_state=42)
cgram2.fit(data2)
K=1 fitted in 74.71555924415588 seconds.
K=2 fitted in 214.04965329170227 seconds.
K=3 fitted in 448.26729440689087 seconds.
K=4 fitted in 566.3618776798248 seconds.
K=5 fitted in 620.1726832389832 seconds.
K=6 fitted in 743.0392332077026 seconds.
K=7 fitted in 914.3954825401306 seconds.
K=8 fitted in 1232.1980152130127 seconds.
K=9 fitted in 1477.7050552368164 seconds.
K=10 fitted in 1512.5710220336914 seconds.
K=11 fitted in 1557.0559606552124 seconds.
K=12 fitted in 1562.9378054141998 seconds.
K=13 fitted in 1698.0964839458466 seconds.
K=14 fitted in 1756.1707797050476 seconds.
K=15 fitted in 1831.6864550113678 seconds.
K=16 fitted in 2088.715784549713 seconds.
K=17 fitted in 2229.236268043518 seconds.
K=18 fitted in 2437.7369673252106 seconds.
K=19 fitted in 2663.076995372772 seconds.
K=20 fitted in 2612.7708687782288 seconds.
K=21 fitted in 2755.6792554855347 seconds.
K=22 fitted in 2960.3728127479553 seconds.
K=23 fitted in 3114.0820450782776 seconds.
K=24 fitted in 3199.2140488624573 seconds.
# Interactive clustergram for the cluster-2 sub-clustering.
fig2 = cgram2.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
)
show(fig2)