Runtime Benchmark (MNIST)
This notebook compares the compute costs of UMAP (i.e., \(k\)-NN), the exact \(k\)-MST (KDTree-based Borůvka), and the approximate \(k\)-MST (NNDescent-based Borůvka) algorithms. The dataset samples and generated graphs are stored for re-analysis and visualization. On MNIST, the approximate \(k\)-MST is roughly two orders of magnitude faster than the exact \(k\)-MST algorithm!
[1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import save_npz
from sklearn.datasets import fetch_openml
from sklearn.utils.random import sample_without_replacement
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from umap import UMAP
from multi_mst import KMST, KMSTDescent
[2]:
# Trigger numba compilation
_ = KMSTDescent().fit(np.random.rand(100, 2))
_ = KMST().fit(np.random.rand(100, 2))
_ = UMAP(force_approximation_algorithm=True, transform_mode="graph").fit_transform(
np.random.rand(100, 2)
)
Timed algorithms
The following cells implement the parameter sweep, output logging, and timing code.
[3]:
def time_task(task, *args, **kwargs):
"""Outputs compute time in seconds."""
start_time = time.perf_counter()
result = task(*args, **kwargs)
end_time = time.perf_counter()
return end_time - start_time, result
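For example, `time_task` can wrap any callable (a hypothetical sanity check, not part of the benchmark):

[ ]:
elapsed, _ = time_task(np.sort, np.random.rand(100_000))
print(f"np.sort took {elapsed:.4f} seconds")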
[4]:
def run_dmst(data, n_neighbors):
mst = KMSTDescent(num_neighbors=n_neighbors)
compute_time, umap = time_task(lambda: mst.fit(data).umap(transform_mode="graph"))
return compute_time, umap.graph_
def run_kmst(data, n_neighbors):
mst = KMST(num_neighbors=n_neighbors)
compute_time, umap = time_task(lambda: mst.fit(data).umap(transform_mode="graph"))
return compute_time, umap.graph_
def run_umap(data, n_neighbors):
    model = UMAP(n_neighbors=n_neighbors, transform_mode="graph")
    compute_time, model = time_task(lambda: model.fit(data))
    return compute_time, model.graph_

runners = {"dmst": run_dmst, "kmst": run_kmst, "umap": run_umap}

def run(data, algorithm, n_neighbors):
    return runners[algorithm](data, n_neighbors)
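As a quick illustration, the dispatcher can be exercised on a small random dataset (a hypothetical call, not part of the benchmark):

[ ]:
seconds, graph = run(np.random.rand(500, 10), "kmst", 3)
print(f"kmst took {seconds:.2f}s; graph has {graph.nnz} edges")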
[5]:
def compute_and_evaluate_setting(
data,
algorithm="dmst",
repeat=0,
frac=1.0,
n_neighbors=5,
):
compute_time, graph = run(data, algorithm, n_neighbors)
save_npz(
f"./data/generated/mnist/graph_{algorithm}_{n_neighbors}_{frac}_{repeat}.npz",
graph.tocoo(),
)
return (
algorithm,
frac,
n_neighbors,
repeat,
graph.nnz,
compute_time,
)
[6]:
def init_file(path):
handle = open(path, "w", buffering=1)
handle.write(
"algorithm,sample_fraction,n_neighbors,repeat,num_edges,compute_time\n"
)
return handle
def write_line(handle, *args):
handle.write(",".join([str(v) for v in args]) + "\n")
[7]:
repeats = 5
algorithms = ['dmst', 'umap', 'kmst']
fraction = np.exp(np.linspace(np.log(0.1), np.log(1), 5)).round(2)
n_neighbors = [2, 3, 6]
total = len(algorithms) * len(fraction) * len(n_neighbors)
[8]:
fraction * 70000
[8]:
array([ 7000., 12600., 22400., 39200., 70000.])
[5]:
df, target = fetch_openml("mnist_784", version=1, return_X_y=True)
df.shape
[5]:
(70000, 784)
[10]:
output = init_file("./data/generated/mnist/metrics.csv")
for repeat in tqdm(range(repeats)):
pbar = tqdm(desc="Compute", total=total, leave=False)
for frac in fraction:
sample_idx = sample_without_replacement(df.shape[0], int(df.shape[0] * frac))
np.save(f"./data/generated/mnist/sampled_indices_{frac}_{repeat}.npy", sample_idx)
X = df.iloc[sample_idx, :]
for algorithm in algorithms:
for k in n_neighbors:
result = compute_and_evaluate_setting(X, algorithm, repeat, frac, k)
write_line(output, *result)
pbar.update()
pbar.close()
output.close()
100%|██████████| 5/5 [5:04:38<00:00, 3655.75s/it]
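The stored sample indices and graphs can be reloaded for re-analysis. A minimal sketch for one parameter combination from the sweep above (approximate \(k\)-MST, \(k=6\), full dataset, first repeat):

[ ]:
from scipy.sparse import load_npz

graph = load_npz("./data/generated/mnist/graph_dmst_6_1.0_0.npz")
sample_idx = np.load("./data/generated/mnist/sampled_indices_1.0_0.npy")
graph.shape, graph.nnz, sample_idx.shape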
Results
The NNDescent-based \(k\)-MST is more expensive than the NNDescent-based \(k\)-NN. Its scaling appears a bit steeper but remains usable, and it is definitely a lot quicker than the KDTree-based \(k\)-MST!
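The scaling difference can be quantified by fitting a line in log-log space, where the slope estimates the exponent \(\alpha\) of a power law \(t \propto n^\alpha\). A minimal sketch using an ordinary least-squares fit (the plot below uses a robust fit with confidence intervals instead):

[ ]:
metrics = pd.read_csv("./data/generated/mnist/metrics.csv")
for alg in ["kmst", "dmst", "umap"]:
    sub = metrics[metrics["algorithm"] == alg]
    slope, _ = np.polyfit(
        np.log10(sub["sample_fraction"] * 70000),
        np.log10(sub["compute_time"]),
        1,
    )
    print(f"{alg}: run time scales roughly as n^{slope:.2f}")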
[2]:
import seaborn as sns
import matplotlib.lines as ml
from lib.plotting import *
configure_matplotlib()
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
[3]:
values = pd.read_csv("./data/generated/mnist/metrics.csv")
[6]:
from matplotlib.ticker import FixedLocator
# Fit robust linear regression in log-log space
fig = sized_fig(1/2)
ax = plt.gca()
for i, alg in enumerate(["kmst", "dmst", "umap"]):
alg_values = values[values["algorithm"] == alg]
sns.regplot(
x=np.log10(alg_values["sample_fraction"] * df.shape[0]),
y=np.log10(alg_values["compute_time"]),
ci=95,
order=1,
robust=True,
color=f"C{i}",
units=alg_values["repeat"],
scatter_kws={"edgecolor": "none", "linewidths": 0, "s": 2},
line_kws={"linewidth": 1},
ax=ax,
)
ax.set_xlabel("Num. MNIST points")
ax.set_ylabel("Run time (s)")
# Draw log y-ticks
y_ticks = np.array([0.0, 1.0, 2.0, 3.0])
plt.ylim(-1, plt.ylim()[1])
ax.set_yticks(y_ticks)
ax.get_yaxis().set_major_formatter(lambda x, pos: f"$10^{{{int(x)}}}$")
ax.get_yaxis().set_minor_locator(
    FixedLocator(locs=np.concatenate((
        np.log10(np.arange(0.2, 1, 0.1) * 10.0 ** y_ticks[0]),
        np.log10(np.arange(2, 10) * 10.0 ** y_ticks[None].T).ravel(),
    )))
)
# Draw log x-ticks
x_ticks = np.array([4.0])
ax.set_xticks(x_ticks)
ax.get_xaxis().set_major_formatter(lambda x, pos: f"$10^{{{int(x)}}}$")
ax.get_xaxis().set_minor_locator(
FixedLocator(locs=np.log10(np.array(
[0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, 70, 80]
) * 10.0 ** x_ticks[None].T).ravel())
)
plt.xlim(np.log10([6000, 80000]))
# Legend
adjust_legend_subtitles(
plt.legend(
loc="upper left",
handles=[
ml.Line2D([], [], color=f"C{j}", label=f"{v}")
for j, v in enumerate(['$k$-MST (kd-tree)', '$k$-MST (descent)', '$k$-NN'])
]
)
)
plt.subplots_adjust(left=0.17, right=0.9, top=0.95, bottom=0.24)
plt.savefig("./images/mnist_scaling.pdf", pad_inches=0)
plt.show()
![MNIST runtime scaling plot](_images/Benchmark_MNIST_15_0.png)