Feature importance per signature type¶
This notebook analyses which characters are most important for each individual signature type. In other words, it identifies what makes each cluster unique compared to all the others.
import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe
import matplotlib.pyplot as plt
import urbangrammar_graphics as ugg
import seaborn as sns
from matplotlib.lines import Line2D
from sklearn.ensemble import RandomForestClassifier
%time standardized_form = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index('hindex')
%time stand_fn = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")
%time data = dask.dataframe.multi.concat([standardized_form, stand_fn], axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)
%time data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])
%time data = data.compute()
CPU times: user 19.2 s, sys: 3.26 s, total: 22.5 s
Wall time: 21.4 s
CPU times: user 72.3 ms, sys: 4.09 ms, total: 76.4 ms
Wall time: 116 ms
CPU times: user 39.6 ms, sys: 7.9 ms, total: 47.5 ms
Wall time: 41.4 ms
CPU times: user 18.9 ms, sys: 0 ns, total: 18.9 ms
Wall time: 18.7 ms
CPU times: user 2min 41s, sys: 1min 27s, total: 4min 8s
Wall time: 2min 45s
# First-level cluster labels plus the two second-level sub-clusterings.
labels_l1 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq")
labels_l2_9 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq")
labels_l2_2 = pd.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/subclustering_cluster2_k3.pq")

# Replace parent clusters 9 and 2 by their sub-cluster ids, shifted by 90 and
# 20 respectively so the combined labels stay unique.
# NOTE(review): `.values` makes the assignment positional, so each
# sub-clustering table is assumed to list its rows in the same order as the
# matching rows of `labels` - confirm.
labels = labels_l1.copy()
for parent, sub_labels, offset in (
    (9, labels_l2_9['9'], 90),
    (2, labels_l2_2['subclustering_cluster2_k3'], 20),
):
    labels.loc[labels.kmeans10gb == parent, 'kmeans10gb'] = sub_labels.values + offset

# Sub-clusters treated as outliers; `mask` is True for every row to keep.
outliers = [98, 93, 96, 97]
mask = ~labels['kmeans10gb'].isin(outliers)
Feature importance per cluster¶
# Show every distinct combined cluster label, in order of first appearance.
labels['kmeans10gb'].unique()
array([ 4,  0,  6,  1, 21,  7,  3,  5, 90, 20,  8, 22, 98, 92, 94, 91, 95,
       96, 93, 97], dtype=int32)
# For every non-outlier cluster, fit a one-vs-rest random forest (this cluster
# against all other kept rows) and record its 50 most important features.
imps = pd.DataFrame()

# Loop-invariant inputs: slice the feature matrix and the kept labels once
# instead of copying them again for every cluster.
features = data.loc[mask].values
kept_labels = labels.loc[mask, 'kmeans10gb']

# `kept_labels` no longer contains outliers, so its unique values are exactly
# the clusters to model, in the same order of first appearance as before.
for cluster in kept_labels.unique():
    # Binary target: 1 for rows in this cluster, 0 for everything else.
    cluster_bool = (kept_labels == cluster).astype(int)
    clf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42, verbose=1)
    clf = clf.fit(features, cluster_bool.values)
    importances = pd.Series(clf.feature_importances_, index=data.columns).sort_values(ascending=False)
    # Two columns per cluster: feature names and their importance scores.
    imps[f'cluster_{cluster}'] = importances.head(50).index.values
    imps[f'cluster_{cluster}_vals'] = importances.head(50).values

# Release the (potentially very large) dense copy once all models are fitted.
del features
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 10.4min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.8min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.0min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.9min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.5min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.4min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.3min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.9min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.5min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.3min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.9min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.0min finished
# Keep only the feature-name columns (those without 'vals' in their label)
# and display them with the column labels sorted as strings.
chars = list(filter(lambda label: 'vals' not in label, imps.columns))
imps[sorted(chars)]
| cluster_0 | cluster_1 | cluster_20 | cluster_21 | cluster_22 | cluster_3 | cluster_4 | cluster_5 | cluster_6 | cluster_7 | cluster_8 | cluster_90 | cluster_91 | cluster_92 | cluster_94 | cluster_95 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sicCAR_q1 | ltcWRE_q3 | ssbCCM_q2 | stbCeA_q1 | sdbPer_q2 | ldsAre_q1 | sdcLAL_q1 | ssbElo_q1 | lcnClo_q3 | mdcAre_q2 | linPDE_q2 | sdbPer_q2 | sdbAre_q1 | ssbCCD_q2 | ssbERI_q2 | sicCAR_q2 | 
| 1 | sicCAR_q2 | ltcRea_q3 | ssbCCM_q3 | linP4W_q2 | ssbCCM_q2 | mdsAre_q1 | sicCAR_q2 | ssbCCM_q3 | lcnClo_q2 | sicCAR_q2 | lcdMes_q3 | ssbERI_q1 | G, I. Distribution, hotels and restaurants_q2 | ssbCCM_q2 | ssbCCM_q2 | C. Manufacturing_q2 | 
| 2 | ltbIBD_q2 | ltcAre_q2 | sdbAre_q3 | stbCeA_q2 | sdbPer_q3 | sddAre_q2 | sicCAR_q1 | ssbElo_q2 | ldePer_q1 | mtdMDi_q2 | lcdMes_q2 | ssbCCM_q2 | K, L, M, N. Financial, real estate, profession... | ssbSqu_q3 | ssbCCD_q2 | R, S, T, U. Other_q2 | 
| 3 | sdcAre_q2 | sdcAre_q2 | population_q3 | ldeAre_q2 | sdbAre_q2 | sicCAR_q2 | sdcAre_q2 | ssbCCo_q1 | ldeAre_q2 | sicCAR_q1 | ssbERI_q1 | ssbSqu_q3 | R, S, T, U. Other_q2 | K, L, M, N. Financial, real estate, profession... | ssbCor_q2 | G, I. Distribution, hotels and restaurants_q2 | 
| 4 | sddAre_q2 | misCel_q1 | sdbPer_q2 | stcOri_q2 | population_q3 | lddNDe_q3 | mtcWNe_q3 | ssbCCD_q2 | lcnClo_q1 | mdsAre_q1 | ssbCCo_q1 | sdbAre_q2 | G, I. Distribution, hotels and restaurants_q3 | sdbPer_q2 | K, L, M, N. Financial, real estate, profession... | sicCAR_q1 | 
| 5 | mtdMDi_q2 | misCel_q2 | sdbAre_q2 | ssbERI_q1 | ssbSqu_q3 | mdsAre_q2 | mdsAre_q1 | sdbPer_q3 | ltcWRE_q3 | sscCCo_q2 | population_q1 | ssbCCD_q3 | K, L, M, N. Financial, real estate, profession... | sdbPer_q3 | G, I. Distribution, hotels and restaurants_q2 | C. Manufacturing_q3 | 
| 6 | mdsAre_q1 | lisCel_q2 | ldePer_q1 | linP4W_q1 | ssbCCD_q3 | sicCAR_q1 | ltcAre_q1 | sdsSPW_q2 | linP4W_q2 | mdcAre_q1 | ssbElo_q2 | K, L, M, N. Financial, real estate, profession... | C. Manufacturing_q2 | sdbAre_q2 | sdbPer_q2 | ssbCCM_q2 | 
| 7 | Code_18_112_q2 | mtcWNe_q1 | lteOri_q2 | ldePer_q1 | K, L, M, N. Financial, real estate, profession... | ldeAre_q2 | sdcAre_q3 | ssbCCo_q2 | sddAre_q1 | sisBpM_q2 | mdsAre_q2 | G, I. Distribution, hotels and restaurants_q2 | sdbPer_q2 | G, I. Distribution, hotels and restaurants_q2 | ssbSqu_q3 | sdbPer_q2 | 
| 8 | mtcWNe_q2 | ltcAre_q1 | sdbPer_q3 | linPDE_q1 | ssbERI_q1 | lseCWA_q3 | mtbNDi_q2 | ltcAre_q1 | sddAre_q2 | ltcAre_q1 | sdsAre_q3 | sdbPer_q3 | ssbCCM_q2 | ssbCor_q3 | K, L, M, N. Financial, real estate, profession... | sdsSPO_q2 | 
| 9 | sdcLAL_q2 | lisCel_q1 | ldeAre_q1 | lteOri_q1 | R, S, T, U. Other_q2 | sdcAre_q2 | ltbIBD_q2 | sdbPer_q2 | ltcWRE_q2 | sddAre_q3 | ssbCCo_q2 | sdbAre_q1 | Code_18_211_q1 | ssbCCD_q3 | ssbCCM_q1 | mean_q3 | 
| 10 | mtbNDi_q2 | sicCAR_q1 | stcOri_q3 | sdbPer_q3 | sdbPer_q1 | lisCel_q1 | ltcWRE_q1 | mtbNDi_q3 | lcdMes_q3 | mdsAre_q3 | linPDE_q1 | R, S, T, U. Other_q3 | ssbCCD_q1 | ssbERI_q1 | ssbCCD_q3 | ssbSqu_q1 | 
| 11 | ltcAre_q1 | ssbERI_q1 | stbCeA_q2 | stbOri_q2 | sdbAre_q3 | ssbElo_q2 | mtcWNe_q2 | ltbIBD_q3 | ldeAre_q1 | sdsAre_q3 | stcOri_q2 | ssbCCD_q2 | G, I. Distribution, hotels and restaurants_q1 | ssbSqu_q2 | F. Construction_q3 | sdbAre_q1 | 
| 12 | mdcAre_q2 | sdcLAL_q3 | ssbCCo_q1 | ssbCCo_q3 | O,P,Q. Public administration, education and he... | ltcWRE_q1 | sdcLAL_q2 | ltcWRE_q2 | linP4W_q1 | mdsAre_q2 | stbOri_q2 | H, J. Transport and communication_q1 | sscCCo_q3 | O,P,Q. Public administration, education and he... | F. Construction_q2 | H, J. Transport and communication_q3 | 
| 13 | sdsAre_q2 | ltcRea_q1 | stcOri_q2 | ssbCCD_q2 | ssbCCM_q3 | stbSAl_q2 | mtbNDi_q3 | population_q3 | sdsAre_q2 | mtbNDi_q3 | ssbCCD_q2 | ssbERI_q2 | F. Construction_q3 | F. Construction_q3 | K, L, M, N. Financial, real estate, profession... | R, S, T, U. Other_q1 | 
| 14 | mtcWNe_q3 | sisBpM_q3 | stbOri_q2 | sdbAre_q2 | mean_q1 | stbCeA_q1 | mdcAre_q2 | ltcWRE_q1 | stbCeA_q2 | sdcAre_q2 | ssbElo_q1 | ssbCor_q3 | H, J. Transport and communication_q2 | ssbCCM_q3 | sicCAR_q3 | ssbCCD_q1 | 
| 15 | sisBpM_q3 | ldsMSL_q1 | stbOri_q1 | population_q2 | ssbCCM_q1 | sddAre_q3 | sisBpM_q3 | sdsSPW_q3 | ldsCDL_q3 | sscERI_q2 | mdsAre_q3 | ssbSqu_q2 | sicCAR_q3 | ssbERI_q2 | mtbAli_q2 | sisBpM_q3 | 
| 16 | sicCAR_q3 | sdsSWD_q3 | lcdMes_q2 | ssbCCM_q3 | ssbERI_q2 | ltcWRE_q2 | sisBpM_q1 | ltbIBD_q1 | mtdMDi_q1 | sscERI_q1 | lteOri_q1 | G, I. Distribution, hotels and restaurants_q3 | ssbERI_q2 | ssbCor_q2 | R, S, T, U. Other_q3 | sdcAre_q1 | 
| 17 | sdcLAL_q1 | ssbCCo_q1 | sicCAR_q3 | lteWNB_q2 | ssbCCD_q2 | lisCel_q2 | ltcWRE_q2 | sicCAR_q2 | mtcWNe_q3 | linWID_q1 | mtcWNe_q1 | misCel_q3 | F. Construction_q2 | R, S, T, U. Other_q3 | sdcAre_q1 | O,P,Q. Public administration, education and he... | 
| 18 | mtdMDi_q1 | ssbCCM_q2 | stcOri_q1 | stbOri_q1 | K, L, M, N. Financial, real estate, profession... | lddNDe_q2 | ltcAre_q2 | night_lights_q1 | lcdMes_q2 | population_q1 | sscCCo_q2 | sdbAre_q3 | A, B, D, E. Agriculture, energy and water_q3 | sdbAre_q3 | mean_q1 | H, J. Transport and communication_q2 | 
| 19 | ldsAre_q2 | sdcLAL_q1 | stbCeA_q1 | ssbElo_q3 | sdbAre_q1 | stbCeA_q2 | sscERI_q2 | mtcWNe_q2 | sdcLAL_q1 | sdcAre_q3 | stcOri_q1 | O,P,Q. Public administration, education and he... | lseCWA_q1 | mean_q1 | ssbSqu_q1 | A, B, D, E. Agriculture, energy and water_q2 | 
| 20 | sdsSPO_q1 | linPDE_q3 | mean_q1 | ssbCCM_q2 | sicCAR_q3 | linWID_q1 | mtdMDi_q3 | linP3W_q3 | mtdMDi_q2 | population_q3 | stcOri_q3 | misCel_q2 | linP4W_q3 | K, L, M, N. Financial, real estate, profession... | H, J. Transport and communication_q2 | ltcAre_q3 | 
| 21 | ltbIBD_q1 | misCel_q3 | stbOri_q3 | lcdMes_q3 | R, S, T, U. Other_q3 | sssLin_q2 | sdsSPO_q2 | sdsSWD_q3 | ltcWRE_q1 | mtdMDi_q3 | population_q2 | mean_q1 | R, S, T, U. Other_q1 | G, I. Distribution, hotels and restaurants_q3 | ssbSqu_q2 | ldsCDL_q2 | 
| 22 | sdsLen_q2 | sdcAre_q3 | ldeAre_q2 | lcdMes_q2 | G, I. Distribution, hotels and restaurants_q2 | ssbCCo_q1 | sdcAre_q1 | ssbERI_q1 | ldsAre_q2 | sdsSPO_q2 | lcdMes_q1 | ldePer_q2 | K, L, M, N. Financial, real estate, profession... | sicCAR_q3 | R, S, T, U. Other_q1 | mdcAre_q2 | 
| 23 | mtbNDi_q3 | lisCel_q3 | sddAre_q3 | ltcWRE_q3 | night_lights_q3 | mtdMDi_q3 | sisBpM_q2 | ssbCCo_q3 | sdcAre_q1 | stbCeA_q1 | stbOri_q3 | K, L, M, N. Financial, real estate, profession... | A, B, D, E. Agriculture, energy and water_q2 | ssbCCD_q1 | C. Manufacturing_q2 | H, J. Transport and communication_q1 | 
| 24 | lddNDe_q2 | sdsSWD_q2 | lteOri_q1 | lteOri_q2 | ssbCor_q3 | ldePer_q1 | Code_18_231_q1 | linP3W_q2 | population_q2 | sdcLAL_q2 | ssbCCo_q3 | ssbCCM_q1 | linP4W_q1 | misCel_q3 | ssbCCM_q3 | G, I. Distribution, hotels and restaurants_q1 | 
| 25 | sisBpM_q2 | stbSAl_q2 | K, L, M, N. Financial, real estate, profession... | G, I. Distribution, hotels and restaurants_q3 | G, I. Distribution, hotels and restaurants_q3 | stbSAl_q3 | ltcAre_q3 | sicCAR_q3 | lseCWA_q1 | linWID_q2 | ldsCDL_q1 | ssbCCM_q3 | H, J. Transport and communication_q3 | sscERI_q3 | sdbAre_q1 | ldsAre_q1 | 
| 26 | ldsAre_q1 | ssbElo_q1 | ssbCCD_q3 | linPDE_q3 | population_q2 | misCel_q2 | ldsCDL_q1 | mtcWNe_q3 | linWID_q2 | ltcWRE_q2 | ssbCCM_q3 | H, J. Transport and communication_q3 | linP3W_q3 | R, S, T, U. Other_q2 | lseCWA_q2 | ldeAre_q1 | 
| 27 | sdsAre_q1 | ssbCCo_q2 | ltcAre_q1 | population_q3 | night_lights_q2 | ssbERI_q1 | mdsAre_q3 | stbCeA_q1 | ssbCCM_q1 | sscCCo_q1 | linPDE_q3 | K, L, M, N. Financial, real estate, profession... | R, S, T, U. Other_q3 | H, J. Transport and communication_q3 | Code_18_111_q2 | listed_counts | 
| 28 | sdcAre_q3 | stbCeA_q1 | linP4W_q2 | stcOri_q3 | O,P,Q. Public administration, education and he... | ldePer_q2 | mean_q3 | linP4W_q2 | lteWNB_q1 | sddAre_q2 | sscCCo_q1 | ssbElo_q1 | sicCAR_q2 | sdcAre_q3 | sdbAre_q2 | K, L, M, N. Financial, real estate, profession... | 
| 29 | ldsMSL_q2 | mdcAre_q2 | linP4W_q3 | lcdMes_q1 | ltcWRE_q2 | ldsMSL_q2 | Code_18_231_q2 | ltcWRE_q3 | linP4W_q3 | ltcWRE_q1 | stbOri_q1 | Code_18_111_q3 | ssbCCM_q1 | Code_18_111_q2 | G, I. Distribution, hotels and restaurants_q3 | mtbNDi_q3 | 
| 30 | mdsAre_q2 | ldeAre_q2 | ssbSqu_q1 | misCel_q2 | sicCAR_q2 | linPDE_q3 | ldsCDL_q2 | linPDE_q2 | linPDE_q2 | ssbCCM_q3 | linP4W_q2 | listed_counts | sdsSWD_q1 | ssbSqu_q1 | ssbCCD_q1 | mtcWNe_q3 | 
| 31 | ltcAre_q3 | linPDE_q2 | lcnClo_q2 | stbOri_q3 | lcnClo_q3 | linWID_q2 | Code_18_211_q3 | ldePer_q1 | ldePer_q2 | sdsSPO_q1 | ssbElo_q3 | H, J. Transport and communication_q2 | ssbSqu_q1 | H, J. Transport and communication_q2 | R, S, T, U. Other_q2 | stcOri_q2 | 
| 32 | sdsSPO_q2 | ssbCCo_q3 | R, S, T, U. Other_q3 | stcOri_q1 | ssbCCo_q3 | ssbCCo_q2 | sicCAR_q3 | stbCeA_q3 | sdsSPW_q3 | night_lights_q1 | lteOri_q2 | R, S, T, U. Other_q2 | mdcAre_q3 | Code_18_111_q3 | lisCel_q2 | sisBpM_q2 | 
| 33 | mdcAre_q1 | sisBpM_q2 | ssbERI_q1 | linP4W_q3 | ssbCCo_q2 | lseERI_q2 | night_lights_q2 | ldsCDL_q1 | linPDE_q3 | ssbCCo_q1 | ssbSqu_q3 | O,P,Q. Public administration, education and he... | ltcAre_q1 | lseCWA_q1 | H, J. Transport and communication_q3 | ssbCCD_q3 | 
| 34 | lddNDe_q3 | stbCeA_q3 | sdcAre_q2 | ssbSqu_q1 | linP4W_q2 | sdsSPO_q3 | Code_18_211_q2 | mdcAre_q2 | ssbERI_q1 | sdsLen_q1 | lddNDe_q2 | population_q3 | O,P,Q. Public administration, education and he... | ssbCCM_q1 | mtbNDi_q3 | sscCCo_q3 | 
| 35 | ltcAre_q2 | lcdMes_q2 | linPDE_q3 | ldeAre_q1 | ldePer_q1 | stbCeA_q3 | mtbNDi_q1 | listed_counts | sdsSWD_q3 | ltbIBD_q2 | linWID_q1 | night_lights_q3 | sdcLAL_q3 | ltcAre_q1 | sicCAR_q1 | sssLin_q3 | 
| 36 | mean_q1 | stcOri_q3 | O,P,Q. Public administration, education and he... | lcnClo_q3 | population_q1 | ssbCCD_q2 | ldsCDL_q3 | night_lights_q2 | ssbCCo_q2 | night_lights_q2 | sisBpM_q2 | population_q2 | ldsCDL_q1 | G, I. Distribution, hotels and restaurants_q1 | A, B, D, E. Agriculture, energy and water_q2 | sdsSWD_q1 | 
| 37 | sscERI_q1 | ssbCCM_q3 | sicCAR_q2 | sdbAre_q3 | ldeAre_q1 | sdsAre_q2 | mean_q2 | ldeAre_q1 | sdbAre_q1 | ltbIBD_q3 | ssbSqu_q2 | R, S, T, U. Other_q1 | sdsSPW_q3 | ssbElo_q2 | O,P,Q. Public administration, education and he... | K, L, M, N. Financial, real estate, profession... | 
| 38 | night_lights_q2 | sisBpM_q1 | sdsSPW_q2 | ssbCCo_q2 | night_lights_q1 | stbSAl_q1 | sdsAre_q3 | sdsSPW_q1 | mtdMDi_q3 | stbCeA_q2 | mtdMDi_q2 | Code_18_111_q2 | ssbERI_q1 | K, L, M, N. Financial, real estate, profession... | H, J. Transport and communication_q1 | lseERI_q3 | 
| 39 | sscCCo_q3 | ssbElo_q3 | stbCeA_q3 | sdbPer_q2 | ssbSqu_q2 | ssbElo_q1 | mdcAre_q1 | sdsAre_q3 | sdcLAL_q2 | sdsAre_q2 | F. Construction_q1 | ssbSqu_q1 | lisCel_q2 | sdbAre_q1 | sdbAre_q3 | G, I. Distribution, hotels and restaurants_q3 | 
| 40 | ltbIBD_q3 | ltcRea_q2 | linP4W_q1 | lisCel_q2 | G, I. Distribution, hotels and restaurants_q1 | mdcAre_q3 | sdsSPO_q1 | stbOri_q2 | linWID_q1 | mtbNDi_q2 | ssbERI_q2 | ssbCCo_q1 | ssbSqu_q3 | ssbERI_q3 | sdbPer_q1 | R, S, T, U. Other_q3 | 
| 41 | stbCeA_q1 | ssbElo_q2 | night_lights_q1 | ssbCCo_q1 | lcnClo_q2 | lseCWA_q2 | ltbIBD_q1 | stcOri_q3 | misCel_q2 | mean_q2 | lddNDe_q3 | sscERI_q2 | mtcWNe_q2 | listed_counts | lcnClo_q3 | ssbElo_q2 | 
| 42 | mtbNDi_q1 | ltcWRE_q1 | stbSAl_q2 | ssbElo_q2 | ssbCCo_q1 | ssbSqu_q2 | sdcLAL_q3 | sddAre_q3 | stcOri_q3 | sscCCo_q3 | lddNDe_q1 | mean_q3 | sdbAre_q3 | lcnClo_q1 | sicCAR_q2 | F. Construction_q2 | 
| 43 | sisBpM_q1 | stbSAl_q1 | G, I. Distribution, hotels and restaurants_q2 | mdcAre_q2 | F. Construction_q2 | ssbElo_q3 | Code_18_231_q3 | ssbElo_q3 | sssLin_q2 | sdbPer_q3 | mdsAre_q1 | night_lights_q1 | lseCCo_q3 | sdbPer_q1 | mtcWNe_q3 | lieWCe_q1 | 
| 44 | mean_q2 | mtdMDi_q1 | night_lights_q2 | night_lights_q1 | K, L, M, N. Financial, real estate, profession... | mdsAre_q3 | K, L, M, N. Financial, real estate, profession... | sdsSWD_q2 | misCel_q1 | lddNDe_q1 | sdbPer_q3 | ssbCCo_q3 | misCel_q1 | sisBpM_q3 | ssbERI_q1 | O,P,Q. Public administration, education and he... | 
| 45 | linWID_q2 | stbOri_q3 | ssbCCo_q2 | ssbElo_q1 | R, S, T, U. Other_q1 | population_q3 | lcnClo_q1 | linWID_q1 | stcOri_q2 | sdbPer_q2 | sscCCo_q3 | culture_nearest | nearest_water | lcdMes_q1 | sssLin_q3 | ldsAre_q3 | 
| 46 | Code_18_211_q2 | ltcWRE_q2 | ldePer_q2 | lteOri_q3 | H, J. Transport and communication_q2 | ssbCCo_q3 | mdcAre_q3 | linP4W_q3 | sdbPer_q1 | ldsAre_q3 | population_q3 | sdsSWD_q2 | ldsCDL_q3 | night_lights_q2 | ldsAre_q1 | F. Construction_q3 | 
| 47 | ltcWRE_q2 | sssLin_q2 | lcnClo_q3 | lcnClo_q2 | C. Manufacturing_q2 | ldsMSL_q1 | ldeAre_q3 | stcOri_q2 | lisCel_q1 | ssbElo_q1 | ssbSqu_q1 | F. Construction_q2 | stcOri_q2 | sicCAR_q2 | mtbNDi_q2 | ltbIBD_q3 | 
| 48 | mtcWNe_q1 | ldeAre_q1 | sdcLAL_q3 | sisBpM_q1 | lcnClo_q1 | lddNDe_q1 | ldePer_q3 | sdsSWD_q1 | sisBpM_q2 | mtcWNe_q1 | sscERI_q3 | G, I. Distribution, hotels and restaurants_q1 | ssbCCD_q2 | O,P,Q. Public administration, education and he... | misCel_q2 | sicCAR_q3 | 
| 49 | ldsAre_q3 | lcdMes_q1 | ltcWRE_q2 | stbSAl_q2 | ldeAre_q2 | misCel_q1 | ldeAre_q2 | linPDE_q3 | stbOri_q2 | sdsAre_q1 | lteOri_q3 | mtbAli_q2 | C. Manufacturing_q3 | population_q2 | sdbCoA_q3 | sdbCoA_q3 | 
# Persist the importance table and read it straight back from disk, so the
# cells below can work from the stored file.
importance_path = "../../urbangrammar_samba/spatial_signatures/clustering_data/per_cluster_importance.pq"
imps.to_parquet(importance_path)
ims = pd.read_parquet(importance_path)
# List the feature-name columns (everything except the "*_vals" columns).
# Built from `ims` directly because `names` is only defined in a later cell,
# so referring to it here fails when the notebook is run top to bottom.
pd.Index([c for c in ims.columns if "_vals" not in c])
Index(['cluster_4', 'cluster_0', 'cluster_6', 'cluster_1', 'cluster_21',
       'cluster_7', 'cluster_3', 'cluster_5', 'cluster_90', 'cluster_20',
       'cluster_8', 'cluster_22', 'cluster_92', 'cluster_94', 'cluster_91',
       'cluster_95'],
      dtype='object')
# Number of top-ranked characters to show per cluster.
n_chars = 10

# Split the stored table into feature names and importance scores, keeping
# only the first `n_chars` rows of each.
name_columns = [label for label in ims.columns if "_vals" not in label]
value_columns = [label for label in ims.columns if "_vals" in label]
names = ims[name_columns].iloc[:n_chars]
values = ims[value_columns].iloc[:n_chars]
# Lookup from raw character name (without any "_q*" suffix) to a coded name
# whose prefix ("func_" or "form_") says which group the character belongs to.
# Regular key families are generated; irregular ones are spelled out.
# Insertion order matches the hand-written table this replaces.
coded = {
    'population': 'func_population',
    'night_lights': 'func_night_lights',
    # Workplace categories.
    'A, B, D, E. Agriculture, energy and water': 'func_workplace_abde',
    'C. Manufacturing': 'func_workplace_c',
    'F. Construction': 'func_workplace_f',
    'G, I. Distribution, hotels and restaurants': 'func_workplace_gi',
    'H, J. Transport and communication': 'func_workplace_hj',
    'K, L, M, N. Financial, real estate, professional and administrative activities': 'func_workplace_klmn',
    'O,P,Q. Public administration, education and health': 'func_workplace_opq',
    'R, S, T, U. Other': 'func_workplace_rstu',
    # "Code_18_<code>" columns map to "func_corine_<code>".
    **{
        f'Code_18_{code}': f'func_corine_{code}'
        for code in (
            124, 211, 121, 421, 522, 142, 141, 112, 231,
            311, 131, 123, 122, 512, 243, 313, 412, 321,
            322, 324, 111, 423, 523, 312, 133, 333, 332,
            411, 132, 222, 242, 331, 511, 334, 244, 521,
        )
    },
    'mean': 'func_ndvi',
    # Amenity characters keep their own name behind the "func_" prefix.
    **{
        amenity: f'func_{amenity}'
        for amenity in (
            'supermarkets_nearest', 'supermarkets_counts',
            'listed_nearest', 'listed_counts',
            'fhrs_nearest', 'fhrs_counts',
            'culture_nearest', 'culture_counts',
        )
    },
    'nearest_water': 'func_water_nearest',
    'nearest_retail_centre': 'func_retail_centrenearest',
    # Form characters keep their six-letter code behind the "form_" prefix.
    **{
        char: f'form_{char}'
        for char in (
            'sdbAre', 'sdbPer', 'sdbCoA', 'ssbCCo', 'ssbCor', 'ssbSqu',
            'ssbERI', 'ssbElo', 'ssbCCM', 'ssbCCD', 'stbOri', 'sdcLAL',
            'sdcAre', 'sscCCo', 'sscERI', 'stcOri', 'sicCAR', 'stbCeA',
            'mtbAli', 'mtbNDi', 'mtcWNe', 'mdcAre', 'ltcWRE', 'ltbIBD',
            'sdsSPW', 'sdsSWD', 'sdsSPO', 'sdsLen', 'sssLin', 'ldsMSL',
            'mtdDeg', 'lcdMes', 'linP3W', 'linP4W', 'linPDE', 'lcnClo',
            'ldsCDL', 'xcnSCl', 'mtdMDi', 'lddNDe', 'linWID', 'stbSAl',
            'sddAre', 'sdsAre', 'sisBpM', 'misCel', 'mdsAre', 'lisCel',
            'ldsAre', 'ltcRea', 'ltcAre', 'ldeAre', 'ldePer', 'lseCCo',
            'lseERI', 'lseCWA', 'lteOri', 'lteWNB', 'lieWCe',
        )
    },
}
# Display names of the signature types, keyed by combined cluster id.
# The last four ids carry the generic "outlier" label.
types = {
    0: "Countryside agriculture",
    1: "Accessible suburbia",
    3: "Open sprawl",
    4: "Wild countryside",
    5: "Warehouse/Park land",
    6: "Gridded residential quarters",
    7: "Urban buffer",
    8: "Disconnected suburbia",
    20: "Dense residential neighbourhoods",
    21: "Connected residential neighbourhoods",
    22: "Dense urban neighbourhoods",
    90: "Local urbanity",
    91: "Concentrated urbanity",
    92: "Regional urbanity",
    94: "Metropolitan urbanity",
    95: "Hyper concentrated urbanity",
    **dict.fromkeys((93, 96, 97, 98), "outlier"),
}
def cmap(name):
    """Return the plot colour for a character, based on its group.

    The name is looked up in ``coded`` after dropping a trailing ``_q?``
    suffix (for example ``sdbAre_q1`` is looked up as ``sdbAre``). The first
    four letters of the coded name select the colour.

    Parameters
    ----------
    name : str
        Character name, with or without a trailing ``_q?`` suffix.

    Returns
    -------
    ``ugg.COLORS[1]`` for "form" characters, ``ugg.COLORS[4]`` for "func"
    characters.

    Raises
    ------
    KeyError
        If the (suffix-less) name is not a key of ``coded``.
    ValueError
        If the coded name starts with neither "form" nor "func".
    """
    # Strip the suffix only when it really is the last three characters; a
    # plain substring test would also truncate names that merely contain "_q".
    if name[-3:-1] == "_q":
        name = name[:-3]
    group = coded[name][:4]
    if group == "form":
        return ugg.COLORS[1]
    if group == "func":
        return ugg.COLORS[4]
    raise ValueError(
        f"cannot pick a colour for {name!r}: coded name {coded[name]!r} "
        "starts with neither 'form' nor 'func'"
    )
# Grid of squares: one column per cluster, one row per importance rank.
# Colour encodes the character group (via `cmap`), opacity the importance
# relative to the largest value shown.

# Take the grid size from the data instead of hard-coding 16 clusters and
# 10 ranks, so the figure stays consistent if `n_chars` or the number of
# clusters changes.
n_ranks, n_clusters = names.shape

x = np.repeat(np.arange(0, n_clusters), n_chars)
# Ranks run downwards: the top predictor sits at y=0.
y = np.tile(np.arange(0, n_chars), n_clusters) * - 1

# `DataFrame.applymap` is deprecated in recent pandas in favour of
# `DataFrame.map`; use whichever the installed version provides.
elementwise = names.map if hasattr(names, "map") else names.applymap
# Transpose before flattening so values are ordered cluster by cluster,
# matching the layout of `x` and `y`.
colors = elementwise(cmap).values.T.flatten()
importance = values.values.T.flatten()
alpha = importance / importance.max()

# Column labels: "cluster_<id>" -> human-readable signature type name.
ticks = [types[int(c[len("cluster_"):])] for c in names.columns]

# English ordinal suffix for the label of the last rank.
if 10 <= n_ranks % 100 <= 20:
    suffix = "th"
else:
    suffix = {1: "st", 2: "nd", 3: "rd"}.get(n_ranks % 10, "th")

fig, ax = plt.subplots(figsize=(n_clusters, n_chars))
ax.scatter(x, y, alpha=alpha, color=colors, marker="s", s=2500)
plt.tight_layout()
# ax.set_axis_off()
plt.xticks(np.arange(0, n_clusters), ticks, rotation='vertical')
plt.yticks([0, -(n_ranks - 1)], ["top predictor", f"{n_ranks}{suffix} predictor"])
sns.despine(left=True, bottom=True)
# plt.savefig("figs/feature_imp_10.pdf")
