Function-based signatures across GB

This notebook explores the creation of signatures based on function data only.

The method mirrors the creation of the standard spatial signatures.

import dask.dataframe
import numpy as np
import pandas as pd

from clustergram import Clustergram
# Open the standardized function data (a Parquet dataset) as a dask dataframe.
data = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")
# Remove the three "keep_q*" columns so they are not part of the table that is
# later passed to the clustering.
data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])
# Materialize the dask dataframe in memory; %time reports how long this takes.
%time data = data.compute()
# Last expression of the cell: display the resulting table.
data
CPU times: user 24.5 s, sys: 27.6 s, total: 52.1 s
Wall time: 44.4 s
population_q1 population_q2 population_q3 night_lights_q1 night_lights_q2 night_lights_q3 A, B, D, E. Agriculture, energy and water_q1 A, B, D, E. Agriculture, energy and water_q2 A, B, D, E. Agriculture, energy and water_q3 C. Manufacturing_q1 ... Code_18_521_q2 Code_18_334_q3 Code_18_244_q1 Code_18_244_q2 Code_18_331_q3 Code_18_132_q2 Code_18_132_q3 Code_18_521_q1 Code_18_222_q2 Code_18_521_q3
hindex
c000e109777t0000 -0.206314 0.365497 0.291477 -0.409151 -0.446148 -0.456031 -0.843645 -0.680719 -0.464860 -0.379286 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c000e109777t0001 -0.208114 0.364186 0.274707 -0.385136 -0.411700 -0.456031 -0.971278 -0.729707 -0.463592 -0.372725 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c000e109777t0002 -0.142215 0.414750 0.298747 -0.404727 -0.431615 -0.450650 -0.891030 -0.694121 -0.466906 -0.353996 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c000e109777t0003 -0.080063 0.383735 0.286717 -0.420736 -0.446148 -0.456031 -0.854549 -0.740079 -0.491802 -0.336752 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c000e109777t0004 -0.114962 0.413013 0.298597 -0.409151 -0.446148 -0.456031 -0.887856 -0.694495 -0.466758 -0.355091 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
c102e634855t0000 -0.772446 -0.547355 -0.368144 -0.715754 -0.701275 -0.645591 0.011376 0.305189 0.508793 1.032406 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c102e634854t0000 -0.865883 -0.694259 -0.458359 -0.730629 -0.743258 -0.696025 0.219933 0.540991 0.675152 1.003821 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c102e634847t0000 -0.772446 -0.547355 -0.368144 -0.715754 -0.701275 -0.645591 0.011376 0.305189 0.508793 1.032406 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c102e634850t0000 -0.850124 -0.665683 -0.441754 -0.726603 -0.735184 -0.690808 0.194017 0.493856 0.670424 1.125250 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN
c102e614920t0000 -0.791901 -0.572008 -0.399926 -0.715938 -0.699391 -0.645591 0.103731 0.359254 0.589599 1.034812 ... NaN NaN NaN NaN -0.008758 NaN -0.000679 NaN -0.009142 NaN

14539578 rows × 151 columns

data.columns.values
array(['population_q1', 'population_q2', 'population_q3',
       'night_lights_q1', 'night_lights_q2', 'night_lights_q3',
       'A, B, D, E. Agriculture, energy and water_q1',
       'A, B, D, E. Agriculture, energy and water_q2',
       'A, B, D, E. Agriculture, energy and water_q3',
       'C. Manufacturing_q1', 'C. Manufacturing_q2',
       'C. Manufacturing_q3', 'F. Construction_q1', 'F. Construction_q2',
       'F. Construction_q3',
       'G, I. Distribution, hotels and restaurants_q1',
       'G, I. Distribution, hotels and restaurants_q2',
       'G, I. Distribution, hotels and restaurants_q3',
       'H, J. Transport and communication_q1',
       'H, J. Transport and communication_q2',
       'H, J. Transport and communication_q3',
       'K, L, M, N. Financial, real estate, professional and administrative activities_q1',
       'K, L, M, N. Financial, real estate, professional and administrative activities_q2',
       'K, L, M, N. Financial, real estate, professional and administrative activities_q3',
       'O,P,Q. Public administration, education and health_q1',
       'O,P,Q. Public administration, education and health_q2',
       'O,P,Q. Public administration, education and health_q3',
       'R, S, T, U. Other_q1', 'R, S, T, U. Other_q2',
       'R, S, T, U. Other_q3', 'Code_18_124_q1', 'Code_18_124_q2',
       'Code_18_124_q3', 'Code_18_211_q1', 'Code_18_211_q2',
       'Code_18_211_q3', 'Code_18_121_q1', 'Code_18_121_q2',
       'Code_18_121_q3', 'Code_18_421_q1', 'Code_18_421_q2',
       'Code_18_421_q3', 'Code_18_522_q1', 'Code_18_522_q2',
       'Code_18_522_q3', 'Code_18_142_q1', 'Code_18_142_q2',
       'Code_18_142_q3', 'Code_18_141_q1', 'Code_18_141_q2',
       'Code_18_141_q3', 'Code_18_112_q1', 'Code_18_112_q2',
       'Code_18_112_q3', 'Code_18_231_q1', 'Code_18_231_q2',
       'Code_18_231_q3', 'Code_18_311_q1', 'Code_18_311_q2',
       'Code_18_311_q3', 'Code_18_131_q1', 'Code_18_131_q2',
       'Code_18_131_q3', 'Code_18_123_q1', 'Code_18_123_q2',
       'Code_18_123_q3', 'Code_18_122_q1', 'Code_18_122_q2',
       'Code_18_122_q3', 'Code_18_512_q1', 'Code_18_512_q2',
       'Code_18_512_q3', 'Code_18_243_q1', 'Code_18_243_q2',
       'Code_18_243_q3', 'Code_18_313_q1', 'Code_18_313_q2',
       'Code_18_313_q3', 'Code_18_412_q1', 'Code_18_412_q2',
       'Code_18_412_q3', 'Code_18_321_q1', 'Code_18_321_q2',
       'Code_18_321_q3', 'Code_18_322_q1', 'Code_18_322_q2',
       'Code_18_322_q3', 'Code_18_324_q1', 'Code_18_324_q2',
       'Code_18_324_q3', 'Code_18_111_q1', 'Code_18_111_q2',
       'Code_18_111_q3', 'Code_18_423_q1', 'Code_18_423_q2',
       'Code_18_423_q3', 'Code_18_523_q1', 'Code_18_523_q2',
       'Code_18_523_q3', 'mean_q1', 'mean_q2', 'mean_q3',
       'Code_18_312_q1', 'Code_18_312_q2', 'Code_18_312_q3',
       'Code_18_133_q1', 'Code_18_133_q2', 'Code_18_133_q3',
       'Code_18_333_q1', 'Code_18_333_q2', 'Code_18_333_q3',
       'Code_18_332_q1', 'Code_18_332_q2', 'Code_18_332_q3',
       'Code_18_411_q1', 'Code_18_411_q2', 'Code_18_411_q3',
       'supermarkets_nearest', 'supermarkets_counts', 'listed_nearest',
       'listed_counts', 'fhrs_nearest', 'fhrs_counts', 'culture_nearest',
       'culture_counts', 'nearest_water', 'nearest_retail_centre',
       'Code_18_132_q1', 'Code_18_331_q2', 'Code_18_222_q1',
       'Code_18_511_q3', 'Code_18_242_q1', 'Code_18_511_q2',
       'Code_18_242_q3', 'Code_18_331_q1', 'Code_18_334_q2',
       'Code_18_511_q1', 'Code_18_334_q1', 'Code_18_222_q3',
       'Code_18_242_q2', 'Code_18_244_q3', 'Code_18_521_q2',
       'Code_18_334_q3', 'Code_18_244_q1', 'Code_18_244_q2',
       'Code_18_331_q3', 'Code_18_132_q2', 'Code_18_132_q3',
       'Code_18_521_q1', 'Code_18_222_q2', 'Code_18_521_q3'], dtype=object)
# Treat infinite values as missing, then replace every missing value with 0 so
# that the table handed to the clustering contains finite numbers only.
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(0)

# Build a clustergram over K = 1..24 using the mini-batch K-Means method.
# NOTE(review): batch_size, n_init and random_state are presumably forwarded to
# the underlying clustering estimator - confirm against the clustergram docs.
cgram = Clustergram(
    range(1, 25),
    method="minibatchkmeans",
    batch_size=1_000_000,
    n_init=100,
    random_state=42,
)
cgram.fit(data)
K=1 skipped. Mean computed from data directly.
K=2 fitted in 438.57887077331543 seconds.
K=3 fitted in 481.95261907577515 seconds.
K=4 fitted in 521.4547460079193 seconds.
K=5 fitted in 561.0414683818817 seconds.
K=6 fitted in 600.828937292099 seconds.
K=7 fitted in 764.945318698883 seconds.
K=8 fitted in 829.6346256732941 seconds.
K=9 fitted in 874.9183557033539 seconds.
K=10 fitted in 915.4329822063446 seconds.
K=11 fitted in 963.5221800804138 seconds.
K=12 fitted in 1007.6710164546967 seconds.
K=13 fitted in 1039.0629951953888 seconds.
K=14 fitted in 1076.3578605651855 seconds.
K=15 fitted in 1117.3909075260162 seconds.
K=16 fitted in 1192.9363079071045 seconds.
K=17 fitted in 1239.1707978248596 seconds.
K=18 fitted in 1289.4472596645355 seconds.
K=19 fitted in 1316.8120160102844 seconds.
K=20 fitted in 1358.3061792850494 seconds.
K=21 fitted in 1438.8004968166351 seconds.
K=22 fitted in 1493.9968583583832 seconds.
K=23 fitted in 1537.079713344574 seconds.
K=24 fitted in 1568.364884853363 seconds.
import urbangrammar_graphics as ugg
import seaborn as sns

# Set seaborn's plotting defaults with the "whitegrid" style for later figures.
sns.set(style='whitegrid')
%%time

# Draw the static clustergram from the fitted model, colouring the connecting
# lines and the cluster markers with entries of the urbangrammar_graphics
# (ugg) COLORS palette.
ax = cgram.plot(
    figsize=(20, 20),
    line_style=dict(color=ugg.COLORS[1]),
    cluster_style={"color": ugg.COLORS[2]},
)
# Turn off the grid lines attached to the y axis.
ax.yaxis.grid(False)
# Remove the top/right spines and move the remaining ones outward by 10.
sns.despine(offset=10)
# Fix the vertical range of the plot; the returned limits are echoed as the
# cell output.
ax.set_ylim(-20, 30)
CPU times: user 6min 58s, sys: 1min 20s, total: 8min 18s
Wall time: 1min 52s
(-20.0, 30.0)
../_images/function_signatures_GB_7_2.png
from bokeh.io import output_notebook
from bokeh.plotting import show

# Configure Bokeh to render its output inline in the notebook.
output_notebook()
Loading BokehJS ...
# Interactive Bokeh rendering of the same clustergram, styled with the hex
# colour codes from urbangrammar_graphics (ugg.HEX).
fig = cgram.bokeh(
    figsize=(800, 600),
    line_style={"color": ugg.HEX[1]},
    cluster_style={"color": ugg.HEX[2]},
)

# Display the figure in the notebook.
show(fig)