Skip to content

[BUG] UMAP transform accuracy. #3864

Open
@trivialfis

Description

@trivialfis

I compared the results of CPU UMAP and GPU UMAP on the Fashion-MNIST dataset, and judging from the visualizations, the CPU implementation appears to be more accurate. The comparison is between branch-0.20 of cuML and version 0.5.1 of CPU UMAP; both are run with the same seed:

CPU GPU
transform-cpu transform-gpu

Sample code:

import os
import gzip
import numpy as np
import cuml
from bokeh.plotting import figure, output_file, show
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Category10
import umap

def load_mnist(path, kind='train'):
    """Read a gzip-compressed image/label file pair for one split.

    Looks inside `path` for ``<kind>-labels-idx1-ubyte.gz`` and
    ``<kind>-images-idx3-ubyte.gz``. The first 8 bytes of the label file
    and the first 16 bytes of the image file are skipped; everything after
    is interpreted as ``uint8``.

    Returns a tuple ``(images, labels)`` where ``images`` is reshaped to
    ``(len(labels), 784)`` and ``labels`` is a flat array.
    """
    def _read_payload(filename, skip):
        # Decompress the whole file and view the bytes past the header.
        with gzip.open(os.path.join(path, filename), 'rb') as stream:
            return np.frombuffer(stream.read(), dtype=np.uint8, offset=skip)

    # Labels are read first: their count determines the image row count.
    labels = _read_payload('%s-labels-idx1-ubyte.gz' % kind, 8)
    pixels = _read_payload('%s-images-idx3-ubyte.gz' % kind, 16)

    images = pixels.reshape(len(labels), 784)
    return images, labels


# Load the default ('train') split as (images, labels) from the local data dir.
X, y = load_mnist("fashion-mnist/data/fashion")

# NOTE(review): n_epochs is assigned but never used in the visible script.
n_epochs = None
# GPU implementation under test, with a fixed random_state.
model = cuml.manifold.UMAP(random_state=1994)
# To run the CPU implementation instead, uncomment the line below (same seed):
# model = umap.UMAP(random_state=1994)

# Fit on the full dataset; `embedding` is read later by plot_fit().
embedding = model.fit_transform(X)
# Send subsequent bokeh output to this HTML file.
output_file("fashion.html")


def plot_fit():
    """Show a scatter plot of the fitted embedding, coloured by class label.

    Reads the module-level ``embedding`` (its first two columns become the
    x/y coordinates) and ``y`` (each entry is used as an index into the ten
    string targets "0".."9"), then opens the figure with ``show``.
    """
    targets = [str(d) for d in range(10)]

    source = ColumnDataSource(
        dict(
            x=[e[0] for e in embedding],
            y=[e[1] for e in embedding],
            label=[targets[d] for d in y],
        )
    )

    cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

    p = figure(title="test umap")
    p.circle(
        x="x",
        y="y",
        source=source,
        color={"field": "label", "transform": cmap},
        # The bare `legend=` keyword was deprecated in Bokeh 1.4 and later
        # removed; `legend_field` is the spelling for grouping legend
        # entries by a data-source column.
        legend_field="label",
    )

    show(p)


def plot_transform(model):
    """Show a scatter plot of ``model.transform`` applied to half the data.

    The first ``X.shape[0] // 2`` rows of the module-level ``X`` are passed
    to ``model.transform``; the first two columns of the result become the
    x/y coordinates and the matching slice of the module-level ``y``
    supplies the colour/legend label for each point.

    Parameters
    ----------
    model
        A fitted estimator exposing ``transform``.
    """
    n = X.shape[0] // 2
    transformed = model.transform(X[:n])
    targets = [str(d) for d in range(10)]
    labels = y[:n]

    source = ColumnDataSource(
        dict(
            x=[e[0] for e in transformed],
            y=[e[1] for e in transformed],
            label=[targets[d] for d in labels],
        )
    )

    cmap = CategoricalColorMapper(factors=targets, palette=Category10[10])

    p = figure(title="test umap")
    p.circle(
        x="x",
        y="y",
        source=source,
        color={"field": "label", "transform": cmap},
        # The bare `legend=` keyword was deprecated in Bokeh 1.4 and later
        # removed; `legend_field` is the spelling for grouping legend
        # entries by a data-source column.
        legend_field="label",
    )
    print("Show transformed")
    show(p)


# Plot the fit_transform embedding first, then the output of transform() on
# the first half of the same data, so the two can be compared.
plot_fit()
plot_transform(model)

Activity

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions