Parameter sensitivity
Are k-MSTs less sensitive to k than k-NNs? Intuitively, the answer is yes: a k-MST produces a single connected component for every value of k, which is not always the case for a k-NN graph.
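The following minimal sketch (not part of this notebook's analysis; it assumes scikit-learn and SciPy are available) illustrates the connectivity difference: a 2-NN graph of two well-separated blobs falls apart into two components, while a plain minimum spanning tree (the k = 1 case of a k-MST) over the same points is connected by construction.
[ ]:
import numpy as np
from sklearn.neighbors import kneighbors_graph
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
# Two well-separated blobs: every point's nearest neighbors lie in its own blob.
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(20, 1, (50, 2))])

knn = kneighbors_graph(X, n_neighbors=2)           # sparse 2-NN graph
mst = minimum_spanning_tree(squareform(pdist(X)))  # MST over the full distance matrix

print(connected_components(knn, directed=False)[0])  # 2 components here
print(connected_components(mst, directed=False)[0])  # always 1 component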
A better question is how \(k\) influences dimensionality reduction performance. This notebook performs a parameter sensitivity analysis, measuring how the sortedness metric changes for small changes \(\Delta k\) in \(k\).
[ ]:
import numba
import warnings
import numpy as np
import pandas as pd
import lensed_umap as lu
from tqdm import tqdm
from itertools import product
from collections import defaultdict
from scipy.stats import weightedtau
from scipy.spatial.distance import cdist
from umap import UMAP
from multi_mst import KMSTDescent
from lib.data import get_datasets
from lib.plotting import *
_ = configure_matplotlib()
Datasets
We use a set of smaller real-world datasets to evaluate the algorithms because the sortedness measure has quadratic computational complexity. These datasets have previously been used to evaluate clustering algorithms by Campello et al., 2015 and Castro et al., 2019.
This repository includes code and/or instructions for downloading the datasets; see the respective notebooks/data/[dataset]/README.md files. Only the datasets that have been downloaded and pre-processed following those instructions are used here.
[2]:
small_data_config = {
key: loader
for key, loader in get_datasets().items()
if loader().shape[0] < 10000
}
print(small_data_config.keys())
dict_keys(['iris', 'diabetes', 'wine', 'articles_1442_5', 'articles_1442_80', 'authorship', 'cardiotocography', 'cell_cycle_237', 'ecoli', 'elegans', 'horse', 'mfeat_factors', 'mfeat_karhunen', 'semeion', 'yeast_galactose'])
Parameter sweep
This section implements the parameter sensitivity analysis. First, we need a function to compute k-NN and k-MST graphs and their UMAP embeddings given a dataset and k.
[4]:
def compute_embedding(data, k, alg):
    """Compute the k-MST (alg="kmst") or k-NN (alg="umap") graph of `data`
    at the given `k` and embed it. Returns the 2D embedding and the graph."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        warnings.simplefilter("ignore", category=FutureWarning)
        if alg == "kmst":
            # Build the k-MST graph; skip the layout step for now.
            p = (
                KMSTDescent(num_neighbors=k, metric="cosine")
                .fit(data)
                .umap(transform_mode="graph")
            )
        elif alg == "umap":
            # Standard UMAP k-NN graph, also without computing a layout yet.
            p = UMAP(
                n_neighbors=k,
                transform_mode="graph",
                force_approximation_algorithm=True,
                metric="cosine",
            ).fit(data)
        # Compute the 2D layout from the graph with lensed_umap.
        p = lu.embed_graph(p, repulsion_strengths=[0.001, 0.01, 0.1, 1])
    return p.embedding_, p.graph_
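As a quick usage sketch (assuming the cells above have run), the function returns the 2D coordinates and the sparse graph they were computed from:
[ ]:
X = small_data_config["iris"]()                # load one of the small datasets
E, g = compute_embedding(X, k=10, alg="kmst")
print(E.shape, g.shape)                        # (n_samples, 2) and (n_samples, n_samples)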
Second, we need a function that computes the sortedness measure, comparing distance ranks between the raw data and the embedding:
[5]:
@numba.njit(parallel=True)
def rank_along_col(orders):
"""Convert per-row argsort order into ranks."""
out = np.empty(orders.shape, dtype=np.intp)
for row in numba.prange(orders.shape[0]):
for i, o in enumerate(orders[row, :]):
out[row, o] = i
return out
def score_ranks(a, b):
"""Computes Sortedness from data ranks (a) and embedding ranks (b)."""
sortedness = np.empty(a.shape[0], dtype=np.float32)
for idx in range(a.shape[0]):
sortedness[idx] = weightedtau(a[idx, :], b[idx, :], rank=a[idx, :]).statistic
return np.mean(sortedness)
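As a quick sanity check (again assuming the cells above have run), an embedding that preserves all distance ranks should score a sortedness of exactly 1:
[ ]:
X_toy = np.random.default_rng(0).normal(size=(50, 4))
toy_ranks = rank_along_col(np.argsort(cdist(X_toy, X_toy, metric="cosine"), axis=1))
print(score_ranks(toy_ranks, toy_ranks))  # identical rankings give 1.0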
Finally, we need to evaluate both algorithms on the datasets at multiple values of \(k\). Here, we sample one random value from each of 10 uniform segments of \(k \in [2, 100]\). Each sampled value is also perturbed by \(\Delta k \in \{2, 5, 10\}\), with the direction (positive or negative) chosen at random. The resulting unique values of \(k\) are evaluated, and the process is repeated 5 times. The embedding, graph, and resulting sortedness are stored.
Running this cell took 16 hours.
[8]:
n_repeats = 5
num_segments = 10
min_sample_range = (2, 100)
deltas = np.array([[2], [5], [10]])
algorithms = ["kmst", "umap"]
# sampled k values
segments = np.linspace(*min_sample_range, num_segments + 1, dtype=int)
samples = np.random.uniform(segments[:-1], segments[1:], size=(n_repeats, num_segments))
min_sample_sizes = np.round(samples).astype(int).T
# perturbed k values
directions = np.random.choice((-1, 1), size=(deltas.shape[0], num_segments, n_repeats))
perturbations = directions * deltas[:, :, np.newaxis]
new_values = min_sample_sizes[np.newaxis, :, :] + perturbations
# parameter sweep
records = []
pbar = tqdm(total=len(small_data_config) * len(algorithms) * n_repeats)
for data_name, loader in small_data_config.items():
X = loader()
data_ranks = rank_along_col(np.argsort(cdist(X, X, metric="cosine"), axis=1))
for r in range(n_repeats):
all_ks = np.unique(
np.concatenate((min_sample_sizes[:, r], new_values[:, :, r].flatten()))
)
all_ks = all_ks[all_ks >= 2]
for alg in algorithms:
for k in all_ks:
E, g = compute_embedding(X, k, alg)
embedding_ranks = rank_along_col(
np.argsort(cdist(E, E, metric="sqeuclidean"), axis=1)
)
sortedness = score_ranks(data_ranks, embedding_ranks)
records.append(
{
"repeat": r,
"data_set": data_name,
"algorithm": alg,
"k": k,
"embedding": E,
"graph": g,
"sortedness": sortedness,
}
)
pbar.update(1)
df = pd.DataFrame.from_records(records)
np.save("./data/generated/sensitivity/deltas.npy", deltas)
np.save("./data/generated/sensitivity/min_sizes.npy", min_sample_sizes)
np.save("./data/generated/sensitivity/perturbed_values.npy", new_values)
df.to_pickle("./data/generated/sensitivity/results.pkl")
100%|██████████| 150/150 [16:13:20<00:00, 24.61s/it]
Results
Now we can load the results and create plots to see how the k-NN and k-MST graphs differ over \(k\).
[3]:
import numpy as np
import pandas as pd
from lib.plotting import *
from lib.drawing import regplot_lowess_ci
tab10 = configure_matplotlib()
deltas = np.load("./data/generated/sensitivity/deltas.npy")
min_sample_sizes = np.load("./data/generated/sensitivity/min_sizes.npy")
new_values = np.load("./data/generated/sensitivity/perturbed_values.npy")
df = pd.read_pickle("./data/generated/sensitivity/results.pkl")
ks = sorted(df.k.unique())
algorithms = df.algorithm.unique()
datasets = sorted(df.data_set.unique())
[4]:
print(datasets)
['articles_1442_5', 'articles_1442_80', 'authorship', 'cardiotocography', 'cell_cycle_237', 'diabetes', 'ecoli', 'elegans', 'horse', 'iris', 'mfeat_factors', 'mfeat_karhunen', 'semeion', 'wine', 'yeast_galactose']
We use nicely formatted names in the plots:
[6]:
def to_display_name(name):
    if name.startswith("k"):
        return f"$k$-{name[1:].upper()}"
    if name == "umap":
        return "$k$-NN"
    return name.upper()
def dataset_name(name):
names = dict(
articles_1442_5="Articles-1442-5",
articles_1442_80="Articles-1442-80",
audioset="AudioSet (music)",
authorship="Authorship",
cardiotocography="CTG",
cell_cycle_237="CellCycle-237",
cifar_10="CIFAR-10",
diabetes="Diabetes",
ecoli="E.~Coli",
elegans="C.~Elegans",
fashion_mnist="Fashion-MNIST",
horse="Horse",
iris="Iris",
mfeat_factors="Mfeat-Factors",
mfeat_karhunen="Mfeat-Karhunen",
mnist="MNIST",
newsgroups="20 Newsgroups",
semeion="Semeion Digits",
wine="Wine",
yeast_galactose="YeastGalactose",
)
    return names.get(name, name)
Sortedness over \(k\) per dataset
This section draws the average sortedness over \(k\) for each dataset. Generally, we see stable performance for both algorithms. The k-NN requires a slightly larger \(k\) to reach its performance peak, whereas the k-MST already performs well at very low \(k\)-values but drops in performance at higher \(k\)-values.
[16]:
for i, data_name in enumerate(datasets):
sized_fig(0.5, 0.618 / 2)
ddf = df.query(f'data_set == "{data_name}"')
max_x = ddf.k.max()
max_y = ddf.sortedness.max()
sns.lineplot(
data=ddf,
x="k",
y="sortedness",
hue="algorithm",
errorbar=None,
palette="tab10",
legend=False,
)
plt.title(dataset_name(data_name), fontsize=fontsize['small'])
plt.ylabel(r"$\lambda_{\tau_w}$")
plt.xlabel("$k$", labelpad=0)
plt.ylim(0, 1.05)
plt.yticks([0, 0.5, 1.0])
plt.subplots_adjust(bottom=0.38, right=1.01, left=0.16, top=0.78)
plt.savefig(f"images/sortedness_curves_{data_name}_umap_layout.pdf", pad_inches=0)
plt.show()
Interpolated curve shape
To summarize these figures, we fit a LOWESS curve of sortedness over \(k\), pooling all datasets. First, we scale each dataset's sortedness values by the maximum value observed for that dataset, so the result captures the curves' shapes while ignoring absolute performance differences.
[19]:
def scale_quality(group):
max_q = group.sortedness.max()
group["scaled_quality"] = group.sortedness / max_q
return group
df = df.groupby(["data_set"]).apply(scale_quality, include_groups=False).reset_index()
The resulting figure confirms the patterns described above: the k-MST performs better than the k-NN at low \(k\)-values (\(k < 10\)).
[99]:
sized_fig(1, 0.618 / 2)
max_y = df.scaled_quality.max()
plt.title("Interpolated curves (scaled)")
plt.axhline(y=1, xmin=0, xmax=100, color="k", linestyle=":", linewidth=1)
for j, alg in enumerate(algorithms):
regplot_lowess_ci(
df.query(f'algorithm == "{alg}"'),
x="k",
y="scaled_quality",
ci_level=95,
n_boot=100,
lowess_frac=0.05,
color=tab10[j],
scatter=False,
)
plt.legend(
loc="lower right",
bbox_to_anchor=(0.965, 0),
handles=[
plt.Line2D([0], [0], color=tab10[i], lw=1, label=to_display_name(alg))
for i, alg in enumerate(algorithms)
],
)
plt.ylabel(r"Scaled $\lambda_{\tau_w}$")
plt.xlabel("$k$", labelpad=0)
plt.ylim(0, 1.05)
plt.yticks([])
plt.subplots_adjust(bottom=0.2, right=1.02, left=0.09, top=0.88)
plt.savefig(f"images/sortedness_curves_scaled_umap_layout.pdf", pad_inches=0)
plt.show()
Second, we perform the same interpolation without re-scaling the sortedness. These curves show the (local) average sortedness over all datasets. Here, the performance difference at higher \(k\)-values (\(k > 50\)) is also visible.
[100]:
sized_fig(1, 0.618 / 2)
max_y = df.sortedness.max()
plt.title("Interpolated curves")
for j, alg in enumerate(algorithms):
regplot_lowess_ci(
df.query(f'algorithm == "{alg}"'),
x="k",
y="sortedness",
ci_level=95,
n_boot=100,
lowess_frac=0.05,
color=tab10[j],
scatter=False,
)
plt.legend(
loc="lower right",
bbox_to_anchor=(0.965, 0),
handles=[
plt.Line2D([0], [0], color=tab10[i], lw=1, label=to_display_name(alg))
for i, alg in enumerate(algorithms)
],
)
plt.ylabel(r"$\lambda_{\tau_w}$")
plt.xlabel("$k$", labelpad=0)
plt.ylim(0, 1.05)
plt.yticks([0, 0.5, 1.0])
plt.subplots_adjust(bottom=0.2, right=1.02, left=0.09, top=0.88)
plt.savefig(f"images/sortedness_curves_interpolated_umap_layout.pdf", pad_inches=0)
plt.show()
Sensitivity boxplots
The sensitivity analysis computes an average absolute ratio comparing the sortedness at the original and perturbed \(k\)-values:
\[
S_{\Delta k} = \frac{1}{|V|} \sum_{k \in V} \left| \frac{\lambda_{\tau_w}(k \pm \Delta k) - \lambda_{\tau_w}(k)}{\lambda_{\tau_w}(k \pm \Delta k) + \lambda_{\tau_w}(k)} \right|,
\]
where \(V\) is the set of sampled \(k\)-values and \(\lambda_{\tau_w}\) denotes the sortedness.
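For example, if \(\lambda_{\tau_w}(k) = 0.80\) and \(\lambda_{\tau_w}(k + \Delta k) = 0.78\), that pair contributes \(\left|0.78 - 0.80\right| / (0.78 + 0.80) \approx 0.013\) to the average.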
This cell computes the sensitivity scores for each dataset.
[ ]:
sensitivity_records = []
for data_set, alg_id in product(datasets, algorithms):
    sub_df = df.query(f"data_set == '{data_set}' & algorithm == '{alg_id}'")
vals = defaultdict(
lambda: np.nan,
{(r, k): v for r, k, v in zip(sub_df.repeat, sub_df.k, sub_df.sortedness)},
)
lookup_fun = np.vectorize(lambda r, x: vals[(r, x)])
initial_val = np.hstack(
[lookup_fun(i, min_sample_sizes[:, i])[:, None] for i in range(n_repeats)]
)
perturbed_val = np.concatenate(
[lookup_fun(i, new_values[:, :, i])[:, :, None] for i in range(n_repeats)],
axis=2,
)
with np.errstate(divide="ignore", invalid="ignore"):
diff = np.abs((perturbed_val - initial_val) / (perturbed_val + initial_val))
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
sensitivity = np.nanmean(diff, axis=(1, 2))
    for delta, sens in zip(deltas[:, 0], sensitivity):
        sensitivity_records.append(
            {
                "data_set": data_set,
                "alg_id": alg_id,
                "perturbation": delta,
                "sensitivity": sens,
            }
        )
# Convert to pandas
df_sens = pd.DataFrame.from_records(sensitivity_records)
df_sens.head()
|   | data_set | alg_id | perturbation | sensitivity |
|---|----------|--------|--------------|-------------|
| 0 | articles_1442_5 | kmst | 2 | 0.006424 |
| 1 | articles_1442_5 | kmst | 5 | 0.007088 |
| 2 | articles_1442_5 | kmst | 10 | 0.011807 |
| 3 | articles_1442_5 | umap | 2 | 0.006424 |
| 4 | articles_1442_5 | umap | 5 | 0.007088 |
There is no visible difference between the sensitivity distributions, because the sortedness curves are very flat except at \(k < 10\).
[80]:
sized_fig(1, 0.8 / 3)
ax = sns.boxplot(
data=df_sens,
y=df_sens.sensitivity,
x=pd.Categorical(df_sens.perturbation),
hue=df_sens.alg_id.apply(to_display_name),
palette=tab10[:2],
fliersize=0.5,
linewidth=1,
legend=True,
)
max_y = 0.03
u_perts = df_sens.perturbation.unique().tolist()
for (alg_id, perturbation), _ in (
df_sens[df_sens.sensitivity > max_y]
.groupby(["alg_id", "perturbation"])
.groups.items()
):
pert_order = u_perts.index(perturbation)
alg_order = algorithms.tolist().index(alg_id)
w = 0.8/2
o = pert_order - w + (alg_order) * w / 1 + w / 2
plt.plot(o, max_y - 0.0007, 'k^', markersize=1.5)
# plt.yticks([0, 0.05, 0.1])
plt.ylim(-0.005, max_y)
plt.legend(ncol=4, title="", loc="upper left")
plt.xlabel("Perturbation of $k$")
plt.ylabel("Avg.~sensitivity")
plt.title("Parameter sensitivity distributions")
plt.subplots_adjust(left=0.075, right=1, bottom=0.21, top=0.9)
plt.show()
Sortedness distributions
This cell provides an alternative visualization to the interpolated curves: the sortedness is shown as boxplots over segments of \(k\) spanning 5 values each. The boxplots convey more information than the mean with its confidence interval.
[96]:
sized_fig(1, 0.8 / 3)
ax = sns.boxplot(
data=df,
y=df.sortedness,
x=pd.Categorical(df.k // 5),
hue=df.algorithm.apply(to_display_name),
palette=tab10[:2],
fliersize=0.5,
linewidth=1,
legend=True,
)
plt.ylim(-0.005, 1.005)
plt.yticks([0, 0.5, 1])
xticks = plt.xticks()[0]
plt.xticks(xticks)
plt.gca().set_xticklabels([f"{(tick + 1) * 5}" for tick in xticks])
plt.legend(ncol=4, title="", loc="lower right")
plt.xlabel("$k$")
plt.ylabel("$\\lambda_{\\tau_w}$")
plt.title("Sortedness distributions over $k$")
plt.subplots_adjust(left=0.08, right=1, bottom=0.27, top=0.87)
plt.savefig(f"images/sortedness_boxplots_umap_layout.pdf", pad_inches=0)
plt.show()
Individual layouts
Finally, we draw the edges of the best layout per dataset for select \(k\) values.
[8]:
from sklearn.decomposition import PCA
from matplotlib.collections import LineCollection
plot_ks = [2, 3, 4, 5, 7, 10]
for data_set in datasets:
sized_fig(1, 0.618 / 6 * 2)
for i, (alg, k) in enumerate(product(algorithms, plot_ks)):
plt.subplot(len(algorithms), len(plot_ks), i + 1)
rows = df.query("data_set == @data_set & k == @k & algorithm == @alg")
idx = np.argmax(rows.sortedness)
embedding = rows.embedding.iloc[idx]
if not np.any(np.isnan(embedding)):
embedding = PCA().fit_transform(embedding)
embedding -= embedding.min(axis=0)
embedding /= embedding.max(axis=0)
embedding *= np.array([[1.618, 1]])
graph = rows.graph.iloc[idx].tocoo()
row, col = graph.row, graph.col
plt.gca().add_collection(
LineCollection(
np.hstack((embedding[row], embedding[col])).reshape(-1, 2, 2),
alpha=0.1,
linewidths=0.1,
color="k",
)
)
plt.xlim(0, 1.668)
plt.ylim(0, 1.05)
plt.xticks([])
plt.yticks([])
frame_off()
if alg == algorithms[0]:
plt.title(f"$k$={k}", fontsize=fontsize["small"])
if k == plot_ks[0]:
plt.ylabel(to_display_name(alg), fontsize=fontsize["small"])
plt.suptitle(dataset_name(data_set), fontsize=fontsize["normal"], y=1, va="top")
plt.subplots_adjust(0.05, 0, 1, 0.7, 0, 0)
plt.savefig(f"images/umap_layouts_{data_set}.png", pad_inches=0)
plt.show()