Function-based signatures across GB
This notebook explores creation of signatures based on function data only.
The method mirrors creation of standard spatial signatures.
import dask.dataframe
import numpy as np
import pandas as pd
from clustergram import Clustergram
data = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")
data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])
%time data = data.compute()
data
CPU times: user 24.5 s, sys: 27.6 s, total: 52.1 s
Wall time: 44.4 s
population_q1 | population_q2 | population_q3 | night_lights_q1 | night_lights_q2 | night_lights_q3 | A, B, D, E. Agriculture, energy and water_q1 | A, B, D, E. Agriculture, energy and water_q2 | A, B, D, E. Agriculture, energy and water_q3 | C. Manufacturing_q1 | ... | Code_18_521_q2 | Code_18_334_q3 | Code_18_244_q1 | Code_18_244_q2 | Code_18_331_q3 | Code_18_132_q2 | Code_18_132_q3 | Code_18_521_q1 | Code_18_222_q2 | Code_18_521_q3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
hindex | |||||||||||||||||||||
c000e109777t0000 | -0.206314 | 0.365497 | 0.291477 | -0.409151 | -0.446148 | -0.456031 | -0.843645 | -0.680719 | -0.464860 | -0.379286 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c000e109777t0001 | -0.208114 | 0.364186 | 0.274707 | -0.385136 | -0.411700 | -0.456031 | -0.971278 | -0.729707 | -0.463592 | -0.372725 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c000e109777t0002 | -0.142215 | 0.414750 | 0.298747 | -0.404727 | -0.431615 | -0.450650 | -0.891030 | -0.694121 | -0.466906 | -0.353996 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c000e109777t0003 | -0.080063 | 0.383735 | 0.286717 | -0.420736 | -0.446148 | -0.456031 | -0.854549 | -0.740079 | -0.491802 | -0.336752 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c000e109777t0004 | -0.114962 | 0.413013 | 0.298597 | -0.409151 | -0.446148 | -0.456031 | -0.887856 | -0.694495 | -0.466758 | -0.355091 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
c102e634855t0000 | -0.772446 | -0.547355 | -0.368144 | -0.715754 | -0.701275 | -0.645591 | 0.011376 | 0.305189 | 0.508793 | 1.032406 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c102e634854t0000 | -0.865883 | -0.694259 | -0.458359 | -0.730629 | -0.743258 | -0.696025 | 0.219933 | 0.540991 | 0.675152 | 1.003821 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c102e634847t0000 | -0.772446 | -0.547355 | -0.368144 | -0.715754 | -0.701275 | -0.645591 | 0.011376 | 0.305189 | 0.508793 | 1.032406 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c102e634850t0000 | -0.850124 | -0.665683 | -0.441754 | -0.726603 | -0.735184 | -0.690808 | 0.194017 | 0.493856 | 0.670424 | 1.125250 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
c102e614920t0000 | -0.791901 | -0.572008 | -0.399926 | -0.715938 | -0.699391 | -0.645591 | 0.103731 | 0.359254 | 0.589599 | 1.034812 | ... | NaN | NaN | NaN | NaN | -0.008758 | NaN | -0.000679 | NaN | -0.009142 | NaN |
14539578 rows × 151 columns
# Show every column name in full; the table preview elides most of them.
data.columns.values
array(['population_q1', 'population_q2', 'population_q3',
'night_lights_q1', 'night_lights_q2', 'night_lights_q3',
'A, B, D, E. Agriculture, energy and water_q1',
'A, B, D, E. Agriculture, energy and water_q2',
'A, B, D, E. Agriculture, energy and water_q3',
'C. Manufacturing_q1', 'C. Manufacturing_q2',
'C. Manufacturing_q3', 'F. Construction_q1', 'F. Construction_q2',
'F. Construction_q3',
'G, I. Distribution, hotels and restaurants_q1',
'G, I. Distribution, hotels and restaurants_q2',
'G, I. Distribution, hotels and restaurants_q3',
'H, J. Transport and communication_q1',
'H, J. Transport and communication_q2',
'H, J. Transport and communication_q3',
'K, L, M, N. Financial, real estate, professional and administrative activities_q1',
'K, L, M, N. Financial, real estate, professional and administrative activities_q2',
'K, L, M, N. Financial, real estate, professional and administrative activities_q3',
'O,P,Q. Public administration, education and health_q1',
'O,P,Q. Public administration, education and health_q2',
'O,P,Q. Public administration, education and health_q3',
'R, S, T, U. Other_q1', 'R, S, T, U. Other_q2',
'R, S, T, U. Other_q3', 'Code_18_124_q1', 'Code_18_124_q2',
'Code_18_124_q3', 'Code_18_211_q1', 'Code_18_211_q2',
'Code_18_211_q3', 'Code_18_121_q1', 'Code_18_121_q2',
'Code_18_121_q3', 'Code_18_421_q1', 'Code_18_421_q2',
'Code_18_421_q3', 'Code_18_522_q1', 'Code_18_522_q2',
'Code_18_522_q3', 'Code_18_142_q1', 'Code_18_142_q2',
'Code_18_142_q3', 'Code_18_141_q1', 'Code_18_141_q2',
'Code_18_141_q3', 'Code_18_112_q1', 'Code_18_112_q2',
'Code_18_112_q3', 'Code_18_231_q1', 'Code_18_231_q2',
'Code_18_231_q3', 'Code_18_311_q1', 'Code_18_311_q2',
'Code_18_311_q3', 'Code_18_131_q1', 'Code_18_131_q2',
'Code_18_131_q3', 'Code_18_123_q1', 'Code_18_123_q2',
'Code_18_123_q3', 'Code_18_122_q1', 'Code_18_122_q2',
'Code_18_122_q3', 'Code_18_512_q1', 'Code_18_512_q2',
'Code_18_512_q3', 'Code_18_243_q1', 'Code_18_243_q2',
'Code_18_243_q3', 'Code_18_313_q1', 'Code_18_313_q2',
'Code_18_313_q3', 'Code_18_412_q1', 'Code_18_412_q2',
'Code_18_412_q3', 'Code_18_321_q1', 'Code_18_321_q2',
'Code_18_321_q3', 'Code_18_322_q1', 'Code_18_322_q2',
'Code_18_322_q3', 'Code_18_324_q1', 'Code_18_324_q2',
'Code_18_324_q3', 'Code_18_111_q1', 'Code_18_111_q2',
'Code_18_111_q3', 'Code_18_423_q1', 'Code_18_423_q2',
'Code_18_423_q3', 'Code_18_523_q1', 'Code_18_523_q2',
'Code_18_523_q3', 'mean_q1', 'mean_q2', 'mean_q3',
'Code_18_312_q1', 'Code_18_312_q2', 'Code_18_312_q3',
'Code_18_133_q1', 'Code_18_133_q2', 'Code_18_133_q3',
'Code_18_333_q1', 'Code_18_333_q2', 'Code_18_333_q3',
'Code_18_332_q1', 'Code_18_332_q2', 'Code_18_332_q3',
'Code_18_411_q1', 'Code_18_411_q2', 'Code_18_411_q3',
'supermarkets_nearest', 'supermarkets_counts', 'listed_nearest',
'listed_counts', 'fhrs_nearest', 'fhrs_counts', 'culture_nearest',
'culture_counts', 'nearest_water', 'nearest_retail_centre',
'Code_18_132_q1', 'Code_18_331_q2', 'Code_18_222_q1',
'Code_18_511_q3', 'Code_18_242_q1', 'Code_18_511_q2',
'Code_18_242_q3', 'Code_18_331_q1', 'Code_18_334_q2',
'Code_18_511_q1', 'Code_18_334_q1', 'Code_18_222_q3',
'Code_18_242_q2', 'Code_18_244_q3', 'Code_18_521_q2',
'Code_18_334_q3', 'Code_18_244_q1', 'Code_18_244_q2',
'Code_18_331_q3', 'Code_18_132_q2', 'Code_18_132_q3',
'Code_18_521_q1', 'Code_18_222_q2', 'Code_18_521_q3'], dtype=object)
# Treat +/- infinity as missing values first ...
data = data.replace([np.inf, -np.inf], np.nan)
# ... then set every missing value to 0.
# NOTE(review): the input is read from the "standardized" directory, so 0 is
# presumably the column mean -- confirm this is the intended imputation.
data = data.fillna(0)
# Clustergram over k = 1..24 using the mini-batch k-means method.
# NOTE(review): batch_size, n_init and random_state are presumably forwarded
# to the underlying estimator -- confirm against the clustergram version in use.
cgram = Clustergram(
    range(1, 25),
    method="minibatchkmeans",
    batch_size=1_000_000,
    n_init=100,
    random_state=42,
)
# Fit every k on the cleaned table; progress is reported per k.
cgram.fit(data)
K=1 skipped. Mean computed from data directly.
K=2 fitted in 438.57887077331543 seconds.
K=3 fitted in 481.95261907577515 seconds.
K=4 fitted in 521.4547460079193 seconds.
K=5 fitted in 561.0414683818817 seconds.
K=6 fitted in 600.828937292099 seconds.
K=7 fitted in 764.945318698883 seconds.
K=8 fitted in 829.6346256732941 seconds.
K=9 fitted in 874.9183557033539 seconds.
K=10 fitted in 915.4329822063446 seconds.
K=11 fitted in 963.5221800804138 seconds.
K=12 fitted in 1007.6710164546967 seconds.
K=13 fitted in 1039.0629951953888 seconds.
K=14 fitted in 1076.3578605651855 seconds.
K=15 fitted in 1117.3909075260162 seconds.
K=16 fitted in 1192.9363079071045 seconds.
K=17 fitted in 1239.1707978248596 seconds.
K=18 fitted in 1289.4472596645355 seconds.
K=19 fitted in 1316.8120160102844 seconds.
K=20 fitted in 1358.3061792850494 seconds.
K=21 fitted in 1438.8004968166351 seconds.
K=22 fitted in 1493.9968583583832 seconds.
K=23 fitted in 1537.079713344574 seconds.
K=24 fitted in 1568.364884853363 seconds.
import urbangrammar_graphics as ugg
import seaborn as sns
# Apply seaborn's "whitegrid" style to the matplotlib figures drawn afterwards.
sns.set(style='whitegrid')
%%time
ax = cgram.plot(
figsize=(20, 20),
line_style=dict(color=ugg.COLORS[1]),
cluster_style={"color": ugg.COLORS[2]},
)
ax.yaxis.grid(False)
sns.despine(offset=10)
ax.set_ylim(-20, 30)
CPU times: user 6min 58s, sys: 1min 20s, total: 8min 18s
Wall time: 1min 52s
(-20.0, 30.0)

from bokeh.io import output_notebook
from bokeh.plotting import show
# Route bokeh output to the notebook so the figure renders inline.
output_notebook()
# Interactive (bokeh) clustergram, coloured with entries from the ugg HEX palette.
fig = cgram.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style={"color": ugg.HEX[2]},
)
show(fig)