Second-level form signatures

Some signature types are further subdivided to get a more fine-grained classification. This notebook generates second-level signatures for cluster 4 (city centres) and cluster 2 (residential neighbourhoods).

The method mirrors top-level clustering.

import dask.dataframe
import pandas as pd
import numpy as np

from clustergram import Clustergram
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import urbangrammar_graphics as ugg
import dask_geopandas
from utils.dask_geopandas import dask_dissolve
data = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index('hindex')
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)
%time data = data.compute()
data
CPU times: user 1min 7s, sys: 42.8 s, total: 1min 49s
Wall time: 1min 50s
sdbAre_q1 sdbAre_q2 sdbAre_q3 sdbPer_q1 sdbPer_q2 sdbPer_q3 sdbCoA_q1 sdbCoA_q2 sdbCoA_q3 ssbCCo_q1 ... lseCWA_q3 lteOri_q1 lteOri_q2 lteOri_q3 lteWNB_q1 lteWNB_q2 lteWNB_q3 lieWCe_q1 lieWCe_q2 lieWCe_q3
hindex
c000e094707t0000 -0.947406 -0.371977 0.020285 -0.901199 -0.237045 -0.023143 -0.000419 -0.001515 -0.010221 -0.046170 ... 0.073064 0.031571 0.196520 0.424415 -0.611681 -0.502087 -0.311160 -0.022562 -0.002733 -0.004738
c000e094763t0000 -0.913567 -0.420861 -0.271703 -0.903627 -0.428003 -0.336729 -0.000419 -0.001515 -0.010221 -0.035325 ... -0.254395 -1.016655 -0.161371 0.011093 0.148709 0.214520 -0.125943 -0.019044 -0.002654 -0.004738
c000e094763t0001 -0.878137 -0.411587 -0.284021 -0.900393 -0.416250 -0.350010 -0.000419 -0.001515 -0.010221 -0.034917 ... -0.285074 -0.897625 -0.161371 0.103698 0.148709 0.250036 -0.200268 -0.019323 -0.002654 -0.004738
c000e094763t0002 -0.952475 -0.421566 -0.283919 -0.968400 -0.429947 -0.343165 -0.000419 -0.001515 -0.010221 -0.065649 ... -0.280682 -0.927382 -0.161371 0.177408 0.302689 0.250036 -0.120718 -0.018825 -0.002654 -0.004738
c000e094764t0000 -0.964878 -0.420861 -0.271703 -0.972440 -0.420006 -0.315861 -0.000419 -0.001515 -0.010221 -0.066832 ... -0.280682 -1.016655 -0.104108 0.185348 0.302689 0.250036 -0.105044 -0.018825 -0.002659 -0.004738
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e644989t0111 -0.311466 -0.431706 -0.373463 -0.082269 -0.459270 -0.389532 -0.000419 -0.001515 -0.010221 0.132837 ... 0.199797 1.288365 0.440493 0.197816 -0.351250 -0.580453 -0.704637 -0.000200 -0.002667 -0.004738
c102e644989t0112 -0.326671 -0.461825 -0.371855 -0.149873 -0.528701 -0.386678 -0.000419 -0.001515 -0.010221 0.136559 ... 0.195101 1.288365 0.440493 0.290197 -0.351250 -0.580453 -0.693005 -0.000669 -0.002667 -0.004738
c102e644989t0113 -0.094236 -0.364761 -0.304254 0.024972 -0.347371 -0.283669 -0.000419 -0.001515 -0.010221 0.021411 ... 0.199797 1.288365 0.440493 0.197816 -0.351250 -0.580453 -0.704637 -0.000843 -0.002667 -0.004738
c102e644989t0114 -0.477667 -0.568464 -0.390033 -0.600170 -0.646516 -0.472676 -0.000419 -0.001515 -0.010221 0.424887 ... 0.008079 -0.855901 0.189420 -0.560233 -0.341815 -0.580453 -0.704637 -0.000898 -0.002667 -0.004738
c102e644989t0115 -0.413094 -0.545952 -0.382834 -0.400108 -0.610332 -0.440413 -0.000419 -0.001515 -0.010221 0.160613 ... 0.008079 -0.203827 0.440493 0.231812 -0.351250 -0.580453 -0.638809 -0.000200 -0.002543 -0.004738

14539578 rows × 177 columns

# Top-level clustering result: one `k8` label per `hindex`
# (see the displayed frame below).
labels = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/k8_form_labels.pq")
labels
k8
hindex
c000e094707t0000 0
c000e094763t0000 5
c000e094763t0001 5
c000e094763t0002 5
c000e094764t0000 5
... ...
c102e644989t0111 5
c102e644989t0112 5
c102e644989t0113 5
c102e644989t0114 5
c102e644989t0115 5

14539578 rows × 1 columns

Sub-cluster cluster 4 - city centres

# Keep only the rows whose top-level label is 4; the boolean mask is aligned
# with `data` through the shared `hindex` index.
data4 = data.loc[labels.k8 == 4]
data4
sdbAre_q1 sdbAre_q2 sdbAre_q3 sdbPer_q1 sdbPer_q2 sdbPer_q3 sdbCoA_q1 sdbCoA_q2 sdbCoA_q3 ssbCCo_q1 ... lseCWA_q3 lteOri_q1 lteOri_q2 lteOri_q3 lteWNB_q1 lteWNB_q2 lteWNB_q3 lieWCe_q1 lieWCe_q2 lieWCe_q3
hindex
c000e107372t0002 1.997072 3.173554 3.431442 2.381685 3.646422 3.669080 -0.000419 -0.001515 -0.010221 -1.996404 ... -0.522584 0.983963 0.745832 0.837519 4.797356 5.578383 4.357864 0.014891 -0.002173 -0.004738
c000e107372t0010 2.164604 3.308433 3.511512 2.585743 3.829118 3.713482 -0.000419 -0.001515 -0.010221 -2.109033 ... -0.520422 0.798750 0.731379 0.837519 4.717725 5.578383 4.357864 0.015162 -0.002176 -0.004738
c000e107372t0012 2.729738 3.553431 4.137457 3.176768 4.265299 3.745213 -0.000419 -0.001515 -0.010221 -2.170382 ... -0.524478 0.798750 0.693735 0.747528 4.797356 5.567115 4.155360 0.015399 -0.002143 -0.004738
c000e107372t0013 1.967667 3.173554 3.443331 2.354584 3.651031 3.680568 -0.000419 -0.001515 -0.010221 -2.083765 ... -0.523554 0.983963 0.715065 0.837519 4.797356 5.578383 4.357864 0.014891 -0.002170 -0.004738
c000e107422t0000 2.462707 3.207867 3.417761 2.863785 3.795187 3.534562 -0.000419 -0.001515 -0.010221 -2.121805 ... -0.522584 0.754407 0.693735 0.687751 4.797356 5.578383 4.155360 0.011516 -0.002191 -0.004738
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e622622t0000 3.318605 4.274405 3.738178 3.395862 4.643282 3.753382 -0.000419 -0.001515 -0.010221 -2.073967 ... -0.424137 -0.139501 -0.173680 0.013847 1.364289 1.089578 1.749723 -0.019645 -0.002505 -0.004738
c102e622625t0000 2.522595 3.932338 3.591517 2.831547 4.471648 3.728486 -0.000419 -0.001515 -0.010221 -1.985255 ... -0.409997 -0.139501 -0.177055 0.125136 1.401938 0.946820 1.582395 -0.019810 -0.002618 -0.004738
c102e622626t0000 2.526626 3.827721 3.464678 2.825567 4.322319 3.620417 -0.000419 -0.001515 -0.010221 -1.998828 ... -0.409997 -0.139501 -0.182284 0.017117 1.401938 0.972979 1.588049 -0.019810 -0.002549 -0.004738
c102e622628t0000 2.534710 3.934916 3.464678 2.911231 4.408819 3.611973 -0.000419 -0.001515 -0.010221 -2.073967 ... -0.409997 -0.139501 -0.186138 -0.008321 1.401938 1.000942 1.704424 -0.019810 -0.002612 -0.004738
c102e622628t0001 3.476486 4.774477 4.109019 3.536899 5.282891 4.110794 -0.000419 -0.001515 -0.010221 -2.078704 ... -0.409997 -0.139501 -0.173680 0.020030 1.401938 1.000942 1.739099 -0.019810 -0.002612 -0.004738

151529 rows × 177 columns

# Fit k-means solutions for k = 1..24 on the cluster-4 subset with a fixed seed.
# NOTE(review): n_init=1000 is presumably forwarded to the k-means backend and
# dominates the runtime (see the per-k timings printed below) - confirm.
cgram = Clustergram(range(1, 25), method='kmeans', n_init=1000, random_state=42)
cgram.fit(data4)
K=1 skipped. Mean computed from data directly.
K=2 fitted in 223.70374631881714 seconds.
K=3 fitted in 333.39103651046753 seconds.
K=4 fitted in 437.369651556015 seconds.
K=5 fitted in 458.2662000656128 seconds.
K=6 fitted in 452.7728068828583 seconds.
K=7 fitted in 518.857358455658 seconds.
K=8 fitted in 587.6459379196167 seconds.
K=9 fitted in 653.4005665779114 seconds.
K=10 fitted in 678.1721639633179 seconds.
K=11 fitted in 711.1682510375977 seconds.
K=12 fitted in 751.6350464820862 seconds.
K=13 fitted in 807.3923723697662 seconds.
K=14 fitted in 853.3511779308319 seconds.
K=15 fitted in 932.3701865673065 seconds.
K=16 fitted in 995.8812532424927 seconds.
K=17 fitted in 1108.2942202091217 seconds.
K=18 fitted in 1234.5389964580536 seconds.
K=19 fitted in 1337.2205052375793 seconds.
K=20 fitted in 1443.92027926445 seconds.
K=21 fitted in 1547.2303636074066 seconds.
K=22 fitted in 1629.5858244895935 seconds.
K=23 fitted in 1661.619954109192 seconds.
K=24 fitted in 1744.3388550281525 seconds.
import urbangrammar_graphics as ugg
from bokeh.io import output_notebook
from bokeh.plotting import show

output_notebook()
Loading BokehJS ...
# Interactive Bokeh rendering of the cluster-4 clustergram.
fig = cgram.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
)
show(fig)
# Static rendering of the same clustergram with matching colours.
cgram.plot(
    figsize=(12, 8),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
)
<AxesSubplot:xlabel='Number of clusters (k)', ylabel='PCA weighted mean of the clusters'>
../_images/leve2_form_12_1.png
# Silhouette score of the cluster-4 solutions, plotted as a line.
cgram.silhouette_score().plot()
<AxesSubplot:>
../_images/leve2_form_13_1.png
# Calinski-Harabasz score of the cluster-4 solutions, plotted as a line.
cgram.calinski_harabasz_score().plot()
<AxesSubplot:>
../_images/leve2_form_14_1.png
# Davies-Bouldin score of the cluster-4 solutions, plotted as a line.
cgram.davies_bouldin_score().plot()
<AxesSubplot:>
../_images/leve2_form_15_1.png
# Store the label table of the cluster-4 clustergram.
# NOTE: this rebinds `labels`, which until here held the top-level k8 labels.
labels = cgram.labels.copy()
labels.columns = labels.columns.astype("str")  # parquet requires string column names
labels.to_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_c4_form_labels.pq")
import pickle

# Pickle the cluster centres of the cluster-4 clustergram next to its labels.
with open("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_c4_form_centers.pickle",'wb') as f:
    pickle.dump(cgram.cluster_centers, f)
labels = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_c4_form_labels.pq")
# Re-index the stored labels with the cluster-4 `hindex` values so that they
# can be joined to the tessellation cells.
labels.index = data4.index
centres = []
for chunk_id in range(103):
    tess = gpd.read_parquet(
        f"../../urbangrammar_samba/spatial_signatures/tessellation/tess_{chunk_id}.pq",
        columns=["tessellation", "hindex"],
    )
    tess = tess.set_index("hindex")
    # The inner join keeps only the cells of this chunk that carry a label.
    centres.append(tess.merge(labels, how="inner", left_index=True, right_index=True))
    print(f"Chunk {chunk_id} done.")
Chunk 0 done.
Chunk 1 done.
Chunk 2 done.
Chunk 3 done.
Chunk 4 done.
Chunk 5 done.
Chunk 6 done.
Chunk 7 done.
Chunk 8 done.
Chunk 9 done.
Chunk 10 done.
Chunk 11 done.
Chunk 12 done.
Chunk 13 done.
Chunk 14 done.
Chunk 15 done.
Chunk 16 done.
Chunk 17 done.
Chunk 18 done.
Chunk 19 done.
Chunk 20 done.
Chunk 21 done.
Chunk 22 done.
Chunk 23 done.
Chunk 24 done.
Chunk 25 done.
Chunk 26 done.
Chunk 27 done.
Chunk 28 done.
Chunk 29 done.
Chunk 30 done.
Chunk 31 done.
Chunk 32 done.
Chunk 33 done.
Chunk 34 done.
Chunk 35 done.
Chunk 36 done.
Chunk 37 done.
Chunk 38 done.
Chunk 39 done.
Chunk 40 done.
Chunk 41 done.
Chunk 42 done.
Chunk 43 done.
Chunk 44 done.
Chunk 45 done.
Chunk 46 done.
Chunk 47 done.
Chunk 48 done.
Chunk 49 done.
Chunk 50 done.
Chunk 51 done.
Chunk 52 done.
Chunk 53 done.
Chunk 54 done.
Chunk 55 done.
Chunk 56 done.
Chunk 57 done.
Chunk 58 done.
Chunk 59 done.
Chunk 60 done.
Chunk 61 done.
Chunk 62 done.
Chunk 63 done.
Chunk 64 done.
Chunk 65 done.
Chunk 66 done.
Chunk 67 done.
Chunk 68 done.
Chunk 69 done.
Chunk 70 done.
Chunk 71 done.
Chunk 72 done.
Chunk 73 done.
Chunk 74 done.
Chunk 75 done.
Chunk 76 done.
Chunk 77 done.
Chunk 78 done.
Chunk 79 done.
Chunk 80 done.
Chunk 81 done.
Chunk 82 done.
Chunk 83 done.
Chunk 84 done.
Chunk 85 done.
Chunk 86 done.
Chunk 87 done.
Chunk 88 done.
Chunk 89 done.
Chunk 90 done.
Chunk 91 done.
Chunk 92 done.
Chunk 93 done.
Chunk 94 done.
Chunk 95 done.
Chunk 96 done.
Chunk 97 done.
Chunk 98 done.
Chunk 99 done.
Chunk 100 done.
Chunk 101 done.
Chunk 102 done.
# Stack the per-chunk joins into a single table (the name `centres` is reused).
centres = pd.concat(centres)
centres
tessellation 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
hindex
c000e603107t0010 POLYGON Z ((353104.312 428953.500 0.000, 35310... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 10
c000e603107t0005 POLYGON Z ((353161.950 428861.163 0.000, 35316... 0 0 1 0 0 0 1 0 0 ... 6 14 16 2 8 14 17 0 20 10
c000e603107t0002 POLYGON Z ((353164.871 428863.388 0.000, 35316... 0 0 1 0 0 0 1 0 0 ... 6 14 16 2 8 14 17 0 20 10
c000e603107t0001 POLYGON Z ((353228.443 428887.142 0.000, 35322... 0 0 1 0 0 0 1 0 0 ... 6 14 16 2 8 14 17 0 20 10
c000e603107t0046 POLYGON Z ((353199.359 428905.815 0.000, 35319... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 10
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e176047t0000 POLYGON ((389335.500 390318.500, 389335.260 39... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 13
c102e622611t0000 POLYGON Z ((389310.715 390333.480 0.000, 38930... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 13
c102e176008t0000 POLYGON ((389628.340 390455.480, 389631.670 39... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 13
c102e176032t0000 POLYGON ((389513.950 390446.240, 389511.600 39... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 13
c102e176009t0000 POLYGON ((389697.000 390453.000, 389684.500 39... 0 0 1 0 0 0 1 0 0 ... 0 14 16 2 8 14 17 0 20 13

151529 rows × 25 columns

import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import urbangrammar_graphics as ugg
import dask_geopandas
from utils.dask_geopandas import dask_dissolve
import warnings 

warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
# Keep the labels of column "9" and the cell polygons, ordered by label.
centres = centres.sort_values("9")
centres = centres.rename_geometry("geometry")
centres = centres[["9", "geometry"]]
centres.columns = ["clusters", "geometry"]
ddf = dask_geopandas.from_geopandas(centres, npartitions=64)
# Dissolve by cluster label, then break the result apart into single-part rows.
spsig = (
    dask_dissolve(ddf, by="clusters")
    .compute()
    .reset_index(drop=True)
    .explode()
)
spsig
clusters geometry
0 0 0 POLYGON Z ((561163.004 98706.648 0.000, 561162...
1 0 POLYGON Z ((561058.798 98901.493 0.000, 561058...
2 0 POLYGON Z ((561094.424 98999.714 0.000, 561094...
3 0 POLYGON Z ((561375.250 98668.646 0.000, 561373...
4 0 POLYGON Z ((581839.270 109316.360 0.000, 58183...
... ... ... ...
8 4 8 POLYGON ((525360.920 439284.060, 525426.290 43...
5 8 POLYGON ((325927.377 1008792.072, 325925.002 1...
6 8 POLYGON ((337778.576 1013435.780, 337547.856 1...
7 8 POLYGON ((422300.000 1147850.000, 422360.000 1...
8 8 POLYGON ((175943.494 878538.992, 175944.881 87...

2954 rows × 2 columns

# Full-resolution geometries first ...
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster4_k9_form_GB.pq")
# ... then overwrite the geometry with a simplified version (simplify, tiny
# buffer, simplify again) and export that in three formats.
spsig.geometry = spsig.simplify(2).buffer(0.001).simplify(2)
spsig.to_file(
    "../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster4_k9_form_GB_simplified.geojson",
    driver="GeoJSON",
)
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster4_k9_form_GB_simplified.pq")
spsig.to_file("sub_signatures_cluster4_k9_form_GB_simplified.gpkg", driver="GPKG")

Sub-cluster cluster 2 - residential neighbourhoods

# `labels` no longer holds the top-level result at this point: it was rebound
# to the cluster-4 clustergram labels, which have no `k8` column. Reload the
# top-level labels so the mask is built from the right table.
k8_labels = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/k8_form_labels.pq")
# Keep only the rows whose top-level label is 2; the boolean mask is aligned
# with `data` through the shared `hindex` index.
data2 = data.loc[k8_labels.k8 == 2]
data2
sdbAre_q1 sdbAre_q2 sdbAre_q3 sdbPer_q1 sdbPer_q2 sdbPer_q3 sdbCoA_q1 sdbCoA_q2 sdbCoA_q3 ssbCCo_q1 ... lseCWA_q3 lteOri_q1 lteOri_q2 lteOri_q3 lteWNB_q1 lteWNB_q2 lteWNB_q3 lieWCe_q1 lieWCe_q2 lieWCe_q3
hindex
c000e097916t0000 0.985252 0.392311 0.239016 1.017116 0.263269 0.190228 -0.000419 -0.001515 -0.010221 -0.007430 ... -0.491403 -0.736080 -1.117002 -0.704494 5.453053 4.136105 3.871857 0.011462 -0.002189 -0.004738
c000e097916t0001 0.986698 0.383377 0.202023 1.022926 0.259483 0.151448 -0.000419 -0.001515 -0.010221 0.004062 ... -0.491403 -0.736080 -1.117002 -0.704494 5.453053 4.136105 3.790504 0.011462 -0.002189 -0.004738
c000e097916t0002 0.877057 0.295106 0.065275 0.930232 0.181706 0.080218 -0.000419 -0.001515 -0.010221 -0.168096 ... -0.484267 -0.692283 -1.129979 -1.485165 5.602077 4.044032 3.396760 0.011051 -0.002180 -0.004738
c000e097916t0003 0.875002 0.293485 0.048611 0.922951 0.175053 0.069451 -0.000419 -0.001515 -0.010221 -0.158833 ... -0.484267 -0.692283 -1.129979 -1.485165 5.602077 4.044032 3.396760 0.010804 -0.002180 -0.004738
c000e097916t0004 0.870884 0.273456 0.077266 0.906541 0.175053 0.079372 -0.000419 -0.001515 -0.010221 -0.007396 ... -0.484267 -0.692283 -1.117002 -1.064872 5.453053 3.911090 3.396760 0.011462 -0.002180 -0.004738
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e639768t0175 1.536862 0.925914 0.440663 1.422308 0.940510 0.395441 -0.000419 -0.001515 -0.010221 0.564866 ... -0.361822 -1.536735 -0.651994 -0.313730 1.583308 0.495236 -0.008730 -0.010412 -0.002268 -0.004738
c102e639768t0181 1.488185 0.863399 0.424774 1.405489 0.771097 0.383649 -0.000419 -0.001515 -0.010221 0.474682 ... -0.378234 -1.401278 -0.347396 -0.313730 1.491510 0.495236 0.038071 -0.010412 -0.002375 -0.004738
c102e639768t0187 1.482975 0.897458 0.532788 1.394022 0.894187 0.456136 -0.000419 -0.001515 -0.010221 0.428325 ... -0.378234 -1.401278 -0.465302 -0.313730 1.507224 0.495236 0.038071 -0.010412 -0.002375 -0.004738
c102e639768t0188 1.428308 0.873805 0.472334 1.355474 0.821399 0.394042 -0.000419 -0.001515 -0.010221 0.426544 ... -0.378234 -1.354317 -0.145154 -0.258014 0.933151 0.495236 0.038071 -0.010287 -0.002387 -0.004738
c102e639768t0191 1.488643 0.887393 0.453780 1.413642 0.797336 0.386862 -0.000419 -0.001515 -0.010221 0.432959 ... -0.378234 -1.373797 -0.145154 -0.258014 0.933151 0.495236 0.038071 -0.010287 -0.002387 -0.004738

3451451 rows × 177 columns

# Same k range and seed as for cluster 4, but with the mini-batch k-means
# method and a batch size of 250,000 rows.
# NOTE(review): presumably chosen because this subset is much larger than
# cluster 4 (see the row counts displayed above) - confirm.
cgram2 = Clustergram(range(1, 25), method='minibatchkmeans', n_init=1000, batch_size=250_000, random_state=42)
cgram2.fit(data2)
K=1 skipped. Mean computed from data directly.
K=2 fitted in 1253.84738779068 seconds.
K=4 fitted in 1489.357971906662 seconds.
K=5 fitted in 1604.8212933540344 seconds.
K=6 fitted in 2255.956408262253 seconds.
K=7 fitted in 2376.1415071487427 seconds.
K=8 fitted in 2564.8916664123535 seconds.
K=9 fitted in 2696.464795589447 seconds.
K=10 fitted in 2815.596691131592 seconds.
K=11 fitted in 2937.35027718544 seconds.
K=12 fitted in 3052.864670753479 seconds.
K=13 fitted in 3176.3278625011444 seconds.
K=14 fitted in 3267.506118297577 seconds.
K=15 fitted in 3378.229498386383 seconds.
K=16 fitted in 3595.2608511447906 seconds.
K=17 fitted in 3743.807673215866 seconds.
K=18 fitted in 3869.714512348175 seconds.
K=19 fitted in 3994.5937445163727 seconds.
K=20 fitted in 4112.138377189636 seconds.
K=21 fitted in 4360.403252840042 seconds.
K=22 fitted in 4497.992453098297 seconds.
K=23 fitted in 4609.6327838897705 seconds.
K=24 fitted in 4723.576930522919 seconds.
import urbangrammar_graphics as ugg
from bokeh.io import output_notebook
from bokeh.plotting import show

output_notebook()
Loading BokehJS ...
# Interactive Bokeh rendering of the cluster-2 clustergram.
fig = cgram2.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
)
show(fig)
# Same interactive plot with PCA weighting switched off.
fig = cgram2.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
    pca_weighted=False,
)
show(fig)
# Static rendering with the same colours.
cgram2.plot(
    figsize=(12, 8),
    line_style={"color": ugg.HEX[1]},
    cluster_style=dict(color=ugg.HEX[2]),
    size=0.2,
)
<AxesSubplot:xlabel='Number of clusters (k)', ylabel='PCA weighted mean of the clusters'>
../_images/leve2_form_34_1.png
# Silhouette score of the cluster-2 solutions, plotted as a line.
# NOTE(review): sample_size=100_000 presumably restricts the computation to a
# subsample of the rows - confirm against the Clustergram docs.
cgram2.silhouette_score(sample_size=100_000).plot()
<AxesSubplot:>
../_images/leve2_form_35_1.png
# Calinski-Harabasz score of the cluster-2 solutions, plotted as a line.
cgram2.calinski_harabasz_score().plot()
<AxesSubplot:>
../_images/leve2_form_36_1.png
# Davies-Bouldin score of the cluster-2 solutions, plotted as a line.
cgram2.davies_bouldin_score().plot()
<AxesSubplot:>
../_images/leve2_form_37_1.png
# Store the label table of the cluster-2 clustergram.
labels2 = cgram2.labels.copy()
labels2.columns = labels2.columns.astype("str")  # parquet requires string column names
labels2.to_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_c2_form_labels.pq")
import pickle

# Pickle the cluster centres of the cluster-2 clustergram next to its labels.
# The source must be `cgram2`: `cgram` is the cluster-4 fit, and dumping it
# here would silently store the wrong centres under the c2 file name.
with open("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_c2_form_centers.pickle", 'wb') as f:
    pickle.dump(cgram2.cluster_centers, f)
labels = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_c2_form_labels.pq")
# Re-index the stored labels with the cluster-2 `hindex` values so that they
# can be joined to the tessellation cells.
labels.index = data2.index
# Inner-join every tessellation chunk with the labels and stack the results.
centres = pd.concat(
    [
        gpd.read_parquet(
            f"../../urbangrammar_samba/spatial_signatures/tessellation/tess_{chunk_id}.pq",
            columns=["tessellation", "hindex"],
        )
        .set_index("hindex")
        .merge(labels, how="inner", left_index=True, right_index=True)
        for chunk_id in range(103)
    ]
)
centres
tessellation 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
hindex
c000e109777t0000 POLYGON ((336287.376 427055.178, 336286.745 42... 0 0 2 0 3 0 0 0 7 ... 5 14 6 12 2 6 10 14 10 17
c000e109777t0001 POLYGON ((336192.633 427056.666, 336192.607 42... 0 0 2 0 3 0 0 0 7 ... 5 14 6 12 5 6 10 14 10 17
c000e109777t0002 POLYGON ((336202.056 427028.254, 336199.483 42... 0 0 2 0 3 0 0 0 7 ... 5 14 6 12 2 6 10 14 10 17
c000e109777t0003 POLYGON ((336194.678 427020.851, 336196.952 42... 0 0 2 0 3 0 0 0 7 ... 5 14 6 12 2 6 10 14 10 17
c000e109777t0004 POLYGON ((336220.377 427052.195, 336218.829 42... 0 0 2 0 3 0 0 0 7 ... 5 14 6 12 2 6 10 14 10 17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e175168t0000 POLYGON ((389544.610 395201.290, 389537.980 39... 0 1 2 0 2 0 5 7 1 ... 5 14 3 14 15 6 19 21 10 10
c102e174147t0000 POLYGON ((389612.230 395260.630, 389612.620 39... 0 1 2 0 2 0 5 7 7 ... 5 14 3 14 15 6 19 21 10 10
c102e174145t0000 POLYGON ((389596.600 395295.390, 389604.800 39... 0 1 2 0 2 0 5 7 1 ... 5 14 3 14 15 6 19 21 10 10
c102e174143t0000 POLYGON ((389579.000 395340.000, 389587.800 39... 0 1 2 0 2 0 5 7 7 ... 5 14 3 14 15 6 19 21 10 10
c102e220091t0000 POLYGON ((402112.440 396148.780, 402126.770 39... 0 1 1 1 1 3 3 3 5 ... 4 7 15 13 1 5 8 17 2 1

3451451 rows × 25 columns

import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import urbangrammar_graphics as ugg
import dask_geopandas
from utils.dask_geopandas import dask_dissolve
import warnings 

warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
# Keep the labels of column "8" and the cell polygons, ordered by label.
centres = centres.sort_values("8")
centres = centres.rename_geometry("geometry")
centres = centres[["8", "geometry"]]
centres.columns = ["clusters", "geometry"]
ddf = dask_geopandas.from_geopandas(centres, npartitions=64)
# Dissolve by cluster label, then break the result apart into single-part rows.
spsig = (
    dask_dissolve(ddf, by="clusters")
    .compute()
    .reset_index(drop=True)
    .explode()
)
spsig
clusters geometry
0 0 0 POLYGON Z ((402994.820 79047.418 0.000, 402995...
1 0 POLYGON Z ((402767.248 79523.698 0.000, 402767...
2 0 POLYGON Z ((408792.283 93254.679 0.000, 408792...
3 0 POLYGON Z ((408729.083 93264.282 0.000, 408729...
4 0 POLYGON Z ((408780.799 93346.550 0.000, 408780...
... ... ... ...
7 3752 7 POLYGON Z ((266588.875 846189.606 0.000, 26658...
3753 7 POLYGON Z ((309213.496 724405.657 0.000, 30921...
3754 7 POLYGON Z ((309344.531 724481.375 0.000, 30934...
3755 7 POLYGON Z ((310889.805 968260.309 0.000, 31088...
3756 7 POLYGON Z ((311991.353 968558.291 0.000, 31199...

49753 rows × 2 columns

# Full-resolution geometries first ...
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster2_k8_form_GB.pq")
# ... then overwrite the geometry with a simplified version (simplify, tiny
# buffer, simplify again) and export that in three formats.
spsig.geometry = spsig.simplify(2).buffer(0.001).simplify(2)
spsig.to_file(
    "../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster2_k8_form_GB_simplified.geojson",
    driver="GeoJSON",
)
spsig.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster2_k8_form_GB_simplified.pq")
spsig.to_file("sub_signatures_cluster2_k8_form_GB_simplified.gpkg", driver="GPKG")

Combine levels

import pandas as pd
import geopandas as gpd

# Top-level signatures and the two full-resolution second-level results.
spsig = gpd.read_parquet("../../urbangrammar_samba/spatial_signatures/signatures/signatures_form_K8_GB.pq")
level2_2 = gpd.read_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster2_k8_form_GB.pq")
level2_4 = gpd.read_parquet("../../urbangrammar_samba/spatial_signatures/signatures/sub_signatures_cluster4_k9_form_GB.pq")
# Top-level rows of every cluster except the two that were subdivided.
level1 = spsig.loc[~spsig.k8.isin([2, 4])]
# Record on each second-level table which top-level cluster it refines.
level2_2["k8"] = 2
level2_4["k8"] = 4
# Show the three column sets side by side.
level2_2.columns, level2_4.columns, level1.columns
(Index(['clusters', 'geometry', 'k8'], dtype='object'),
 Index(['clusters', 'geometry', 'k8'], dtype='object'),
 Index(['k8', 'tessellation'], dtype='object'))
# Give both second-level tables the same schema ('clusters' becomes 'level2').
level2_2.columns = level2_4.columns = ['level2', 'geometry', 'k8']
# Top-level rows: match the geometry column name and use 0 for the second level.
level1 = level1.rename_geometry("geometry")
level1['level2'] = 0
# Stack the three tables on a fresh 0..n-1 index.
combined = pd.concat([level1, level2_2, level2_4], axis=0, ignore_index=True)
# Signature identifier of the form "<k8>_<level2>".
combined["signature_type"] = (
    combined.k8.astype(str) + '_' + combined.level2.astype(int).astype(str)
)
combined.signature_type.nunique()
23
# Save the combined table with its original (unsimplified) geometries.
combined.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/signatures_form_combined_levels_orig.pq")
<ipython-input-18-dea8f4caae45>:1: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  combined.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/signatures_form_combined_levels_orig.pq")
import dask_geopandas

# Spread the combined table over 512 partitions and apply the same
# simplify -> tiny buffer -> simplify sequence used for the sub-signatures.
# NOTE(review): the 0.001 buffer presumably repairs geometries invalidated by
# the first simplification - confirm.
ddf = dask_geopandas.from_geopandas(combined, npartitions=512)
ddf.geometry = ddf.simplify(2).buffer(.001).simplify(2)
%time simplified = ddf.compute()
CPU times: user 28min 57s, sys: 4min 43s, total: 33min 41s
Wall time: 25min 55s
# Save the simplified combined signatures to the shared signatures folder.
simplified.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/signatures_form_combined_levels_simplified.pq")
<ipython-input-20-dadb8c830952>:1: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  simplified.to_parquet("../../urbangrammar_samba/spatial_signatures/signatures/signatures_form_combined_levels_simplified.pq")
# Also write a GeoPackage copy into the current working directory.
simplified.to_file("signatures_form_combined_levels_simplified.gpkg", driver="GPKG")