fix: setuptools configuration

2023-06-20 19:25:35 +02:00
parent 1b5093627e
commit cbbbbeda98
12 changed files with 0 additions and 6 deletions
--- a/src/prototorch/models/__init__.py
+++ b/src/prototorch/models/__init__.py
@@ -0,0 +1,39 @@
+"""`models` plugin for the `prototorch` package."""
+
+from .callbacks import PrototypeConvergence, PruneLoserPrototypes
+from .cbc import CBC, ImageCBC
+from .glvq import (
+    GLVQ,
+    GLVQ1,
+    GLVQ21,
+    GMLVQ,
+    GRLVQ,
+    GTLVQ,
+    LGMLVQ,
+    LVQMLN,
+    ImageGLVQ,
+    ImageGMLVQ,
+    ImageGTLVQ,
+    SiameseGLVQ,
+    SiameseGMLVQ,
+    SiameseGTLVQ,
+)
+from .knn import KNN
+from .lvq import (
+    LVQ1,
+    LVQ21,
+    MedianLVQ,
+)
+from .probabilistic import (
+    CELVQ,
+    RSLVQ,
+    SLVQ,
+)
+from .unsupervised import (
+    GrowingNeuralGas,
+    KohonenSOM,
+    NeuralGas,
+)
+from .vis import *
+
+__version__ = "0.6.0"
--- a/src/prototorch/models/abstract.py
+++ b/src/prototorch/models/abstract.py
@@ -0,0 +1,249 @@
+"""Abstract classes to be inherited by prototorch models."""
+
+import logging
+
+import prototorch
+import pytorch_lightning as pl
+import torch
+import torch.nn.functional as F
+import torchmetrics
+from prototorch.core.competitions import WTAC
+from prototorch.core.components import (
+    AbstractComponents,
+    Components,
+    LabeledComponents,
+)
+from prototorch.core.distances import euclidean_distance
+from prototorch.core.initializers import (
+    LabelsInitializer,
+    ZerosCompInitializer,
+)
+from prototorch.core.pooling import stratified_min_pooling
+from prototorch.nn.wrappers import LambdaLayer
+
+
+class ProtoTorchBolt(pl.LightningModule):
+    """Common Lightning base class of all ProtoTorch models.
+
+    Stores the hyperparameters, guarantees an ``lr`` entry and keeps the
+    optimizer / learning-rate-scheduler configuration read from ``kwargs``.
+    """
+
+    def __init__(self, hparams, **kwargs):
+        """Save ``hparams`` and read the optimizer settings from ``kwargs``.
+
+        Recognized keyword arguments: ``optimizer`` (default
+        ``torch.optim.Adam``), ``lr_scheduler`` (default ``None``) and
+        ``lr_scheduler_kwargs`` (default: empty dict).
+        """
+        super().__init__()
+
+        # Persist the hyperparameters, then make sure a learning rate exists.
+        self.save_hyperparameters(hparams)
+        self.hparams.setdefault("lr", 0.01)
+
+        # Optimizer configuration
+        self.optimizer = kwargs.get("optimizer", torch.optim.Adam)
+        self.lr_scheduler = kwargs.get("lr_scheduler", None)
+        self.lr_scheduler_kwargs = kwargs.get("lr_scheduler_kwargs", {})
+
+    def configure_optimizers(self):
+        """Return the optimizer, plus a per-step scheduler when configured."""
+        opt = self.optimizer(self.parameters(), lr=self.hparams["lr"])
+        if self.lr_scheduler is None:
+            return opt
+
+        scheduler_config = {
+            "scheduler": self.lr_scheduler(opt, **self.lr_scheduler_kwargs),
+            "interval": "step",  # called after each training step
+        }
+        return [opt], [scheduler_config]
+
+    def reconfigure_optimizers(self):
+        """Ask the trainer's strategy to set up the optimizers again."""
+        if not self.trainer:
+            logging.warning("No trainer to reconfigure optimizers!")
+            return
+        self.trainer.strategy.setup_optimizers(self.trainer)
+
+    def __repr__(self):
+        """Wrap the default module repr, tab-indented, in a Bolt header."""
+        base_lines = super().__repr__().splitlines()
+        body = "".join(f"\t{line}\n" for line in base_lines)
+        return f"ProtoTorch Bolt(\n{body})"
+
+
+class PrototypeModel(ProtoTorchBolt):
+    """Base class for models that keep their prototypes in ``proto_layer``."""
+    proto_layer: AbstractComponents
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+
+        # The ``distance_fn`` keyword (Euclidean distance when absent) is
+        # wrapped in a layer so it shows up as a named submodule.
+        self.distance_layer = LambdaLayer(
+            kwargs.get("distance_fn", euclidean_distance),
+            name="distance_fn",
+        )
+
+    @property
+    def num_prototypes(self):
+        """Number of components currently held by the prototype layer."""
+        current = self.proto_layer.components
+        return len(current)
+
+    @property
+    def prototypes(self):
+        """Detached CPU copy of the prototype layer's components."""
+        current = self.proto_layer.components
+        return current.detach().cpu()
+
+    @property
+    def components(self):
+        """Only an alias for the prototypes."""
+        return self.prototypes
+
+    def add_prototypes(self, *args, **kwargs):
+        """Add components, then refresh the stored distribution and optimizers."""
+        layer = self.proto_layer
+        layer.add_components(*args, **kwargs)
+        self.hparams["distribution"] = layer.distribution
+        self.reconfigure_optimizers()
+
+    def remove_prototypes(self, indices):
+        """Remove components, then refresh the stored distribution and optimizers."""
+        layer = self.proto_layer
+        layer.remove_components(indices)
+        self.hparams["distribution"] = layer.distribution
+        self.reconfigure_optimizers()
+
+
+class UnsupervisedPrototypeModel(PrototypeModel):
+    """Prototype model without labels; ``forward`` returns the distances."""
+    proto_layer: Components
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+
+        # The prototype layer is only built when an initializer is supplied.
+        initializer = kwargs.get("prototypes_initializer", None)
+        if initializer is None:
+            return
+        self.proto_layer = Components(
+            self.hparams["num_prototypes"],
+            initializer=initializer,
+        )
+
+    def compute_distances(self, x):
+        """Return the distances between ``x`` and the prototypes.
+
+        The prototypes are cast with ``type_as(x)`` before the distance layer
+        is applied.
+        """
+        return self.distance_layer(x, self.proto_layer().type_as(x))
+
+    def forward(self, x):
+        """Same as :meth:`compute_distances`."""
+        return self.compute_distances(x)
+
+
+class SupervisedPrototypeModel(PrototypeModel):
+    """Prototype model whose prototypes carry class labels.
+
+    Builds a ``LabeledComponents`` prototype layer (unless
+    ``skip_proto_layer`` is set) and a ``WTAC`` competition layer.
+    """
+    proto_layer: LabeledComponents
+
+    def __init__(self, hparams, skip_proto_layer=False, **kwargs):
+        """Create the prototype and competition layers.
+
+        :param hparams: Hyperparameters; ``distribution`` is read from it.
+        :param skip_proto_layer: When true, no prototype layer is created
+            here, so the subclass has to assign ``proto_layer`` itself.
+        :param kwargs: ``prototypes_initializer`` and ``labels_initializer``
+            are read here; all keyword arguments also go to the base class.
+        """
+        super().__init__(hparams, **kwargs)
+
+        # Layers
+        distribution = hparams.get("distribution", None)
+        prototypes_initializer = kwargs.get("prototypes_initializer", None)
+        labels_initializer = kwargs.get("labels_initializer",
+                                        LabelsInitializer())
+        if not skip_proto_layer:
+            # when subclasses do not need a customized prototype layer
+            if prototypes_initializer is not None:
+                # when building a new model
+                self.proto_layer = LabeledComponents(
+                    distribution=distribution,
+                    components_initializer=prototypes_initializer,
+                    labels_initializer=labels_initializer,
+                )
+                # Remember the per-prototype shape (first dimension dropped)
+                # so the layer can be rebuilt without an initializer.
+                proto_shape = self.proto_layer.components.shape[1:]
+                self.hparams["initialized_proto_shape"] = proto_shape
+            else:
+                # when restoring a checkpointed model
+                # NOTE(review): raises KeyError if "initialized_proto_shape"
+                # is not present in the hyperparameters.
+                self.proto_layer = LabeledComponents(
+                    distribution=distribution,
+                    components_initializer=ZerosCompInitializer(
+                        self.hparams["initialized_proto_shape"]),
+                )
+        self.competition_layer = WTAC()
+
+    @property
+    def prototype_labels(self):
+        """Detached CPU copy of the prototype layer's labels."""
+        return self.proto_layer.labels.detach().cpu()
+
+    @property
+    def num_classes(self):
+        """Number of classes as reported by the prototype layer."""
+        return self.proto_layer.num_classes
+
+    def compute_distances(self, x):
+        """Return the distance-layer output for ``x`` and the prototypes."""
+        protos, _ = self.proto_layer()
+        distances = self.distance_layer(x, protos)
+        return distances
+
+    def forward(self, x):
+        """Pool the distances per label and apply softmin along dim 1."""
+        distances = self.compute_distances(x)
+        _, plabels = self.proto_layer()
+        winning = stratified_min_pooling(distances, plabels)
+        y_pred = F.softmin(winning, dim=1)
+        return y_pred
+
+    def predict_from_distances(self, distances):
+        """Run the competition layer on ``distances`` without tracking gradients."""
+        with torch.no_grad():
+            _, plabels = self.proto_layer()
+            y_pred = self.competition_layer(distances, plabels)
+        return y_pred
+
+    def predict(self, x):
+        """Compute distances for ``x`` and turn them into predictions."""
+        with torch.no_grad():
+            distances = self.compute_distances(x)
+        y_pred = self.predict_from_distances(distances)
+        return y_pred
+
+    def log_acc(self, distances, targets, tag):
+        """Log the multiclass accuracy derived from ``distances`` under ``tag``.
+
+        The value is logged per epoch (``on_epoch=True``, ``on_step=False``)
+        and shown in the progress bar.
+        """
+        preds = self.predict_from_distances(distances)
+        accuracy = torchmetrics.functional.accuracy(
+            preds.int(),
+            targets.int(),
+            "multiclass",
+            num_classes=self.num_classes,
+        )
+
+        self.log(
+            tag,
+            accuracy,
+            on_step=False,
+            on_epoch=True,
+            prog_bar=True,
+            logger=True,
+        )
+
+    def test_step(self, batch, batch_idx):
+        """Predict the batch inputs and log the accuracy as ``test_acc``."""
+        x, targets = batch
+
+        preds = self.predict(x)
+        accuracy = torchmetrics.functional.accuracy(
+            preds.int(),
+            targets.int(),
+            "multiclass",
+            num_classes=self.num_classes,
+        )
+
+        self.log("test_acc", accuracy)
+
+
+class ProtoTorchMixin:
+    """All mixins are ProtoTorchMixins.
+
+    Marker base class: it defines no attributes or methods of its own.
+    """
+
+
+class NonGradientMixin(ProtoTorchMixin):
+    """Mixin for custom non-gradient optimization.
+
+    Turns off ``automatic_optimization`` and leaves ``training_step`` to be
+    implemented by the concrete model.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Cooperative init: forward everything to the next class in the MRO.
+        super().__init__(*args, **kwargs)
+        self.automatic_optimization = False
+
+    def training_step(self, train_batch, batch_idx, optimizer_idx=None):
+        """Placeholder; always raises ``NotImplementedError``."""
+        raise NotImplementedError
+
+
+class ImagePrototypesMixin(ProtoTorchMixin):
+    """Mixin for models with image prototypes."""
+    proto_layer: Components
+    components: torch.Tensor
+
+    def on_train_batch_end(self, outputs, batch, batch_idx):
+        """Clamp the components in place to [0, 1] after every training batch."""
+        raw_components = self.proto_layer.components.data
+        raw_components.clamp_(0.0, 1.0)
+
+    def get_prototype_grid(self, num_columns=2, return_channels_last=True):
+        """Arrange ``self.components`` in an image grid and return it on the CPU.
+
+        :param num_columns: Passed to ``make_grid`` as ``nrow``.
+        :param return_channels_last: When true, the grid's axes are permuted
+            with ``(1, 2, 0)`` before it is returned.
+        """
+        # Imported lazily so torchvision is only needed when a grid is requested.
+        from torchvision.utils import make_grid
+
+        grid = make_grid(self.components, nrow=num_columns)
+        if not return_channels_last:
+            return grid.cpu()
+        return grid.permute((1, 2, 0)).cpu()
--- a/src/prototorch/models/callbacks.py
+++ b/src/prototorch/models/callbacks.py
@@ -0,0 +1,152 @@
+"""Lightning Callbacks."""
+
+import logging
+from typing import TYPE_CHECKING
+
+import pytorch_lightning as pl
+import torch
+from prototorch.core.initializers import LiteralCompInitializer
+
+from .extras import ConnectionTopology
+
+if TYPE_CHECKING:
+    from prototorch.models import GLVQ, GrowingNeuralGas
+
+
+class PruneLoserPrototypes(pl.Callback):
+    """Callback that prunes prototypes whose mean win ratio is too low.
+
+    At the end of a training epoch, prototypes whose win ratio (averaged over
+    the rows of ``pl_module.prototype_win_ratios``) is below ``threshold`` are
+    removed and, optionally, replaced by freshly initialized ones with the
+    same labels.
+
+    NOTE(review): ``verbose`` is stored but never read in this class.
+    """
+
+    def __init__(
+        self,
+        threshold=0.01,
+        idle_epochs=10,
+        prune_quota_per_epoch=-1,
+        frequency=1,
+        replace=False,
+        prototypes_initializer=None,
+        verbose=False,
+    ):
+        """Store the pruning configuration.
+
+        :param threshold: Minimum win ratio a prototype needs to survive.
+        :param idle_epochs: Epochs to wait before the first pruning.
+        :param prune_quota_per_epoch: Maximum number of prototypes pruned per
+            call; values ``<= 0`` disable the limit.
+        :param frequency: Prune only on epochs divisible by this value.
+        :param replace: Re-add as many prototypes per label as were pruned.
+        :param prototypes_initializer: Initializer for the re-added prototypes.
+        :param verbose: Unused.
+        """
+        self.threshold = threshold  # minimum win ratio
+        self.idle_epochs = idle_epochs  # epochs to wait before pruning
+        self.prune_quota_per_epoch = prune_quota_per_epoch
+        self.frequency = frequency
+        self.replace = replace
+        self.verbose = verbose
+        self.prototypes_initializer = prototypes_initializer
+
+    def on_train_epoch_end(self, trainer, pl_module: "GLVQ"):
+        """Prune (and optionally replace) losing prototypes.
+
+        Returns ``None`` when the epoch is skipped and ``True`` otherwise.
+        """
+        if (trainer.current_epoch + 1) < self.idle_epochs:
+            return None
+        if (trainer.current_epoch + 1) % self.frequency:
+            return None
+
+        ratios = pl_module.prototype_win_ratios.mean(dim=0)
+        # Read the indices straight off the mask. Indexing a CPU `arange`
+        # with a mask that lives on the model's device fails as soon as the
+        # model is trained on an accelerator.
+        to_prune = torch.nonzero(ratios < self.threshold).flatten().tolist()
+        prune_labels = pl_module.prototype_labels[to_prune]
+        if self.prune_quota_per_epoch > 0:
+            to_prune = to_prune[:self.prune_quota_per_epoch]
+            prune_labels = prune_labels[:self.prune_quota_per_epoch]
+
+        if len(to_prune) > 0:
+            logging.debug(f"\nPrototype win ratios: {ratios}")
+            logging.debug(f"Pruning prototypes at: {to_prune}")
+            logging.debug(f"Corresponding labels are: {prune_labels.tolist()}")
+
+            cur_num_protos = pl_module.num_prototypes
+            pl_module.remove_prototypes(indices=to_prune)
+
+            if self.replace:
+                # Re-add exactly as many prototypes per label as were removed.
+                labels, counts = torch.unique(prune_labels,
+                                              sorted=True,
+                                              return_counts=True)
+                distribution = dict(zip(labels.tolist(), counts.tolist()))
+
+                logging.info("Re-adding pruned prototypes...")
+                logging.debug(f"distribution={distribution}")
+
+                pl_module.add_prototypes(
+                    distribution=distribution,
+                    components_initializer=self.prototypes_initializer)
+            new_num_protos = pl_module.num_prototypes
+
+            logging.info(f"`num_prototypes` changed from {cur_num_protos} "
+                         f"to {new_num_protos}.")
+        return True
+
+
+class PrototypeConvergence(pl.Callback):
+    """Unfinished callback for reacting to prototype convergence.
+
+    NOTE(review): ``min_delta`` and ``verbose`` are stored but never read in
+    this class, and the hook only logs a message once the idle period is
+    over; see the TODO in ``on_train_epoch_end``.
+    """
+
+    def __init__(self, min_delta=0.01, idle_epochs=10, verbose=False):
+        self.min_delta = min_delta
+        self.idle_epochs = idle_epochs  # epochs to wait
+        self.verbose = verbose
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        """Log "Stopping..." once ``idle_epochs`` epochs have passed.
+
+        Returns ``None`` during the idle period and ``True`` afterwards.
+        """
+        if (trainer.current_epoch + 1) < self.idle_epochs:
+            return None
+
+        logging.info("Stopping...")
+        # TODO
+        return True
+
+
+class GNGCallback(pl.Callback):
+    """GNG Callback.
+
+    Applies growing algorithm based on accumulated error and topology.
+
+    Based on "A Growing Neural Gas Network Learns Topologies" by Bernd Fritzke.
+
+    """
+
+    def __init__(self, reduction=0.1, freq=10):
+        """Store the error reduction factor and the growth frequency in epochs."""
+        self.reduction = reduction
+        self.freq = freq
+
+    def on_train_epoch_end(
+        self,
+        trainer: pl.Trainer,
+        pl_module: "GrowingNeuralGas",
+    ):
+        """Insert one prototype between the worst unit and its worst neighbor."""
+        # Only grow on every `freq`-th epoch.
+        if (trainer.current_epoch + 1) % self.freq != 0:
+            return
+
+        # Snapshot of the current state
+        errors = pl_module.errors
+        topology: ConnectionTopology = pl_module.topology_layer
+        components = pl_module.proto_layer.components
+
+        # The unit with the largest accumulated error marks the insertion point.
+        worst = torch.argmax(errors)
+        neighbors = topology.get_neighbors(worst)[0]
+        if len(neighbors) == 0:
+            logging.info("No neighbor-pairs found!")
+            return
+
+        worst_neighbor = neighbors[torch.argmax(errors[neighbors])]
+
+        # The new prototype sits halfway between the two units.
+        midpoint = 0.5 * (components[worst] + components[worst_neighbor])
+        pl_module.proto_layer.add_components(
+            1,
+            initializer=LiteralCompInitializer(midpoint.unsqueeze(0)),
+        )
+
+        # Rewire: connect the new unit (index -1) to both, cut the old edge.
+        topology.add_prototype()
+        for endpoint in (worst, worst_neighbor):
+            topology.add_connection(endpoint, -1)
+        topology.remove_connection(worst, worst_neighbor)
+
+        # The new unit starts with the worst unit's error; the two old units
+        # get their previous error scaled by `reduction`.
+        pl_module.errors = torch.cat(
+            [pl_module.errors, errors[worst].unsqueeze(0)])
+        for idx in (worst, worst_neighbor):
+            pl_module.errors[idx] = errors[idx] * self.reduction
+
+        trainer.strategy.setup_optimizers(trainer)
--- a/src/prototorch/models/cbc.py
+++ b/src/prototorch/models/cbc.py
@@ -0,0 +1,84 @@
+import torch
+import torchmetrics
+from prototorch.core.competitions import CBCC
+from prototorch.core.components import ReasoningComponents
+from prototorch.core.initializers import RandomReasoningsInitializer
+from prototorch.core.losses import MarginLoss
+from prototorch.core.similarities import euclidean_similarity
+from prototorch.nn.wrappers import LambdaLayer
+
+from .abstract import ImagePrototypesMixin
+from .glvq import SiameseGLVQ
+
+
+class CBC(SiameseGLVQ):
+    """Classification-By-Components."""
+
+    def __init__(self, hparams, **kwargs):
+        """Build the reasoning components, similarity and competition layers.
+
+        Reads ``similarity_fn``, ``components_initializer`` and
+        ``reasonings_initializer`` from ``kwargs`` and ``distribution`` and
+        ``margin`` from the hyperparameters.
+        """
+        super().__init__(hparams, skip_proto_layer=True, **kwargs)
+
+        similarity_fn = kwargs.get("similarity_fn", euclidean_similarity)
+        components_initializer = kwargs.get("components_initializer", None)
+        reasonings_initializer = kwargs.get("reasonings_initializer",
+                                            RandomReasoningsInitializer())
+        self.components_layer = ReasoningComponents(
+            self.hparams.distribution,
+            components_initializer=components_initializer,
+            reasonings_initializer=reasonings_initializer,
+        )
+        self.similarity_layer = LambdaLayer(similarity_fn)
+        self.competition_layer = CBCC()
+
+        # Namespace hook
+        self.proto_layer = self.components_layer
+
+        self.loss = MarginLoss(self.hparams.margin)
+
+    def forward(self, x):
+        """Return the competition-layer output for the inputs ``x``.
+
+        Inputs and components both pass through the backbone. Gradients
+        through the component path are only enabled when
+        ``both_path_gradients`` is set.
+        """
+        components, reasonings = self.components_layer()
+        latent_x = self.backbone(x)
+
+        # Remember whether the backbone is trainable so that a deliberately
+        # frozen backbone is not silently unfrozen by this forward pass.
+        bb_grad = any(el.requires_grad for el in self.backbone.parameters())
+
+        self.backbone.requires_grad_(bb_grad and self.both_path_gradients)
+        latent_components = self.backbone(components)
+        self.backbone.requires_grad_(bb_grad)
+
+        detections = self.similarity_layer(latent_x, latent_components)
+        probs = self.competition_layer(detections, reasonings)
+        return probs
+
+    def shared_step(self, batch, batch_idx, optimizer_idx=None):
+        """Return the model output and the mean loss against one-hot targets."""
+        x, y = batch
+        y_pred = self(x)
+        num_classes = self.num_classes
+        y_true = torch.nn.functional.one_hot(y.long(), num_classes=num_classes)
+        loss = self.loss(y_pred, y_true).mean()
+        return y_pred, loss
+
+    def training_step(self, batch, batch_idx, optimizer_idx=None):
+        """Compute the training loss and log the epoch-level ``train_acc``."""
+        y_pred, train_loss = self.shared_step(batch, batch_idx, optimizer_idx)
+        preds = torch.argmax(y_pred, dim=1)
+        accuracy = torchmetrics.functional.accuracy(
+            preds.int(),
+            batch[1].int(),
+            "multiclass",
+            num_classes=self.num_classes,
+        )
+        self.log(
+            "train_acc",
+            accuracy,
+            on_step=False,
+            on_epoch=True,
+            prog_bar=True,
+            logger=True,
+        )
+        return train_loss
+
+    def predict(self, x):
+        """Return the arg-max over dim 1 of the model output, without gradients."""
+        with torch.no_grad():
+            y_pred = self(x)
+            y_pred = torch.argmax(y_pred, dim=1)
+        return y_pred
+
+
+class ImageCBC(ImagePrototypesMixin, CBC):
+    """CBC model that constrains the components to the range [0, 1] by
+    clamping after updates.
+
+    No body of its own: it only combines ``ImagePrototypesMixin`` with ``CBC``.
+    """
--- a/src/prototorch/models/extras.py
+++ b/src/prototorch/models/extras.py
@@ -0,0 +1,130 @@
+"""prototorch.models.extras
+
+Modules not yet available in prototorch go here temporarily.
+
+"""
+
+import torch
+from prototorch.core.similarities import gaussian
+
+
+def rank_scaled_gaussian(distances, lambd):
+    """Return ``exp(-exp(-rank / lambd) * distance)`` for every entry.
+
+    ``rank`` is the position of an entry within its row (along dim 1) when
+    that row is sorted in ascending order.
+    """
+    # The argsort of an argsort yields each entry's rank within its row.
+    ranks = distances.argsort(dim=1).argsort(dim=1)
+    rank_weights = torch.exp(-ranks / lambd)
+    return torch.exp(-rank_weights * distances)
+
+
+def orthogonalization(tensors):
+    """Orthogonalization via polar decomposition.
+
+    Each matrix in the trailing two dimensions of ``tensors`` is replaced by
+    the product of its left and right singular vectors (reduced SVD), i.e.
+    the singular values are dropped. Leading dimensions are treated as batch
+    dimensions and the output has the same shape as the input.
+    """
+    # `torch.svd` is deprecated in favor of `torch.linalg.svd`, which returns
+    # the right singular vectors already transposed (Vh) and batches over
+    # leading dimensions natively, so no manual reshaping is needed.
+    u, _, vh = torch.linalg.svd(tensors, full_matrices=False)
+    return u @ vh
+
+
+def ltangent_distance(x, y, omegas):
+    r"""Localized Tangent distance.
+
+    Compute Orthogonal Complement:
+    :math:`\bm P_k = \bm I - \Omega_k \Omega_k^T`
+
+    Compute Tangent Distance:
+    :math:`{\| \bm P_k \bm x - \bm P_k \bm y_k \|}_2`
+
+    :param `torch.tensor` x: Inputs; flattened to two dimensions.
+    :param `torch.tensor` y: Prototypes; flattened to two dimensions.
+    :param `torch.tensor` omegas: Three dimensional matrix
+    :rtype: `torch.tensor`
+    """
+    # Flatten everything after the first dimension.
+    x, y = (arr.view(arr.size(0), -1) for arr in (x, y))
+    # One projector per omega: I - Omega_k Omega_k^T.
+    p = torch.eye(omegas.shape[-2], device=omegas.device) - torch.bmm(
+        omegas, omegas.permute([0, 2, 1]))
+    # Every input is projected with every projector.
+    projected_x = x @ p
+    # `y @ p` pairs every prototype with every projector; the diagonal keeps
+    # only prototype k projected with its own projector k.
+    # assumes omegas has one matrix per row of y — TODO confirm
+    projected_y = torch.diagonal(y @ p).T
+    expanded_y = torch.unsqueeze(projected_y, dim=1)
+    batchwise_difference = expanded_y - projected_x
+    differences_squared = batchwise_difference**2
+    distances = torch.sqrt(torch.sum(differences_squared, dim=2))
+    # Swap the two axes so that the first one indexes the inputs.
+    distances = distances.permute(1, 0)
+    return distances
+
+
+class GaussianPrior(torch.nn.Module):
+    """Module that applies ``gaussian`` with a fixed ``variance`` to distances."""
+
+    def __init__(self, variance):
+        super().__init__()
+        # Plain attribute, not a registered parameter or buffer.
+        self.variance = variance
+
+    def forward(self, distances):
+        """Return ``gaussian(distances, self.variance)``."""
+        return gaussian(distances, self.variance)
+
+
+class RankScaledGaussianPrior(torch.nn.Module):
+    """Module that applies ``rank_scaled_gaussian`` with a fixed ``lambd``."""
+
+    def __init__(self, lambd):
+        super().__init__()
+        # Plain attribute, not a registered parameter or buffer.
+        self.lambd = lambd
+
+    def forward(self, distances):
+        """Return ``rank_scaled_gaussian(distances, self.lambd)``."""
+        return rank_scaled_gaussian(distances, self.lambd)
+
+
+class ConnectionTopology(torch.nn.Module):
+    """Connection graph between prototypes with an age for every edge.
+
+    ``cmat`` is the connection matrix (1 where two prototypes are connected)
+    and ``age`` holds the matching edge ages. Both are plain tensor
+    attributes, not registered buffers.
+    """
+
+    def __init__(self, agelimit, num_prototypes):
+        """Create an empty topology for ``num_prototypes`` prototypes.
+
+        :param agelimit: Connections whose age exceeds this value are removed
+            in ``forward``.
+        :param num_prototypes: Initial number of prototypes.
+        """
+        super().__init__()
+        self.agelimit = agelimit
+        self.num_prototypes = num_prototypes
+
+        self.cmat = torch.zeros((self.num_prototypes, self.num_prototypes))
+        self.age = torch.zeros_like(self.cmat)
+
+    def forward(self, d):
+        """Update connections and ages from a batch of distances ``d``.
+
+        For every row of ``d`` the two closest prototypes are connected and
+        their edge age is reset. Then the ages of all connections in the rows
+        of these two prototypes grow by one and over-aged connections in those
+        rows are removed. Returns ``None``.
+        """
+        order = torch.argsort(d, dim=1)
+
+        for element in order:
+            i0, i1 = element[0], element[1]
+
+            self.cmat[i0][i1] = 1
+            self.cmat[i1][i0] = 1
+
+            self.age[i0][i1] = 0
+            self.age[i1][i0] = 0
+
+            self.age[i0][self.cmat[i0] == 1] += 1
+            self.age[i1][self.cmat[i1] == 1] += 1
+
+            self.cmat[i0][self.age[i0] > self.agelimit] = 0
+            self.cmat[i1][self.age[i1] > self.agelimit] = 0
+
+    def get_neighbors(self, position):
+        """Return the ``torch.where`` result for row ``position`` of ``cmat``."""
+        return torch.where(self.cmat[position])
+
+    def add_prototype(self):
+        """Grow both matrices by one unconnected prototype."""
+        self.cmat = self._expanded_by_one(self.cmat)
+        self.age = self._expanded_by_one(self.age)
+        # Keep the advertised size in sync with the matrices.
+        self.num_prototypes += 1
+
+    @staticmethod
+    def _expanded_by_one(matrix):
+        """Return ``matrix`` padded with one zero row and one zero column."""
+        # `new_zeros` keeps the dtype and device of the existing matrix.
+        grown = matrix.new_zeros([dim + 1 for dim in matrix.shape])
+        grown[:-1, :-1] = matrix
+        return grown
+
+    def add_connection(self, a, b):
+        """Connect ``a`` and ``b`` in both directions and reset the edge age."""
+        self.cmat[a][b] = 1
+        self.cmat[b][a] = 1
+
+        self.age[a][b] = 0
+        self.age[b][a] = 0
+
+    def remove_connection(self, a, b):
+        """Disconnect ``a`` and ``b`` in both directions and reset the edge age."""
+        self.cmat[a][b] = 0
+        self.cmat[b][a] = 0
+
+        self.age[a][b] = 0
+        self.age[b][a] = 0
+
+    def extra_repr(self):
+        return f"(agelimit): ({self.agelimit})"
--- a/src/prototorch/models/glvq.py
+++ b/src/prototorch/models/glvq.py
@@ -0,0 +1,385 @@
+"""Models based on the GLVQ framework."""
+
+import torch
+from prototorch.core.competitions import wtac
+from prototorch.core.distances import (
+    lomega_distance,
+    omega_distance,
+    squared_euclidean_distance,
+)
+from prototorch.core.initializers import EyeLinearTransformInitializer
+from prototorch.core.losses import (
+    GLVQLoss,
+    lvq1_loss,
+    lvq21_loss,
+)
+from prototorch.core.transforms import LinearTransform
+from prototorch.nn.wrappers import LambdaLayer, LossLayer
+from torch.nn.parameter import Parameter
+
+from .abstract import ImagePrototypesMixin, SupervisedPrototypeModel
+from .extras import ltangent_distance, orthogonalization
+
+
+class GLVQ(SupervisedPrototypeModel):
+    """Generalized Learning Vector Quantization."""
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+
+        # Loss-related defaults; values supplied by the user are kept.
+        for key, default in (
+            ("margin", 0.0),
+            ("transfer_fn", "identity"),
+            ("transfer_beta", 10.0),
+        ):
+            self.hparams.setdefault(key, default)
+
+        # Loss
+        self.loss = GLVQLoss(
+            margin=self.hparams["margin"],
+            transfer_fn=self.hparams["transfer_fn"],
+            beta=self.hparams["transfer_beta"],
+        )
+
+    # def on_save_checkpoint(self, checkpoint):
+    #     if "prototype_win_ratios" in checkpoint["state_dict"]:
+    #         del checkpoint["state_dict"]["prototype_win_ratios"]
+
+    def initialize_prototype_win_ratios(self):
+        """(Re-)register the win-ratio buffer as one zero per prototype."""
+        zeros = torch.zeros(self.num_prototypes, device=self.device)
+        self.register_buffer("prototype_win_ratios", zeros)
+
+    def on_train_epoch_start(self):
+        """Reset the win ratios at the start of every training epoch."""
+        self.initialize_prototype_win_ratios()
+
+    def log_prototype_win_ratios(self, distances):
+        """Append this batch's per-prototype win ratios to the buffer.
+
+        A prototype "wins" a sample when it has the smallest distance to it.
+        """
+        winners = distances.min(dim=-1).indices
+        wi, wc = torch.unique(winners, sorted=True, return_counts=True)
+
+        win_counts = torch.zeros(self.num_prototypes,
+                                 dtype=torch.long,
+                                 device=self.device)
+        win_counts[wi] = wc
+
+        batch_ratios = win_counts / len(distances)
+        self.prototype_win_ratios = torch.vstack(
+            [self.prototype_win_ratios, batch_ratios])
+
+    def shared_step(self, batch, batch_idx, optimizer_idx=None):
+        """Return the distances for the batch and the resulting loss."""
+        x, y = batch
+        distances = self.compute_distances(x)
+        _, plabels = self.proto_layer()
+        return distances, self.loss(distances, y, plabels)
+
+    def training_step(self, batch, batch_idx, optimizer_idx=None):
+        """Compute the loss; log win ratios, ``train_loss`` and ``train_acc``."""
+        distances, train_loss = self.shared_step(batch, batch_idx,
+                                                 optimizer_idx)
+        self.log_prototype_win_ratios(distances)
+        self.log("train_loss", train_loss)
+        self.log_acc(distances, batch[-1], tag="train_acc")
+        return train_loss
+
+    def validation_step(self, batch, batch_idx):
+        """Compute the loss and log ``val_loss`` and ``val_acc``."""
+        # `model.eval()` and `torch.no_grad()` handled by pl
+        distances, val_loss = self.shared_step(batch, batch_idx)
+        self.log("val_loss", val_loss)
+        self.log_acc(distances, batch[-1], tag="val_acc")
+        return val_loss
+
+    def test_step(self, batch, batch_idx):
+        """Compute the loss and log ``test_acc``."""
+        # `model.eval()` and `torch.no_grad()` handled by pl
+        distances, test_loss = self.shared_step(batch, batch_idx)
+        self.log_acc(distances, batch[-1], tag="test_acc")
+        return test_loss
+
+    def test_epoch_end(self, outputs):
+        """Log the sum of all per-batch test losses as ``test_loss``."""
+        total = sum((batch_loss.item() for batch_loss in outputs), 0.0)
+        self.log("test_loss", total)
+
+    # TODO
+    # def predict_step(self, batch, batch_idx, dataloader_idx=None):
+    #     pass
+
+
+class SiameseGLVQ(GLVQ):
+    """GLVQ in a Siamese setting.
+
+    GLVQ model that applies an arbitrary transformation on the inputs and the
+    prototypes before computing the distances between them. The weights in the
+    transformation pipeline are only learned from the inputs.
+
+    """
+
+    def __init__(self,
+                 hparams,
+                 backbone=torch.nn.Identity(),
+                 both_path_gradients=False,
+                 **kwargs):
+        """Set up the GLVQ model and attach the ``backbone``.
+
+        ``distance_fn`` defaults to the squared Euclidean distance here.
+        ``both_path_gradients`` controls whether the backbone also receives
+        gradients through the prototype path.
+        """
+        distance_fn = kwargs.pop("distance_fn", squared_euclidean_distance)
+        super().__init__(hparams, distance_fn=distance_fn, **kwargs)
+        self.backbone = backbone
+        self.both_path_gradients = both_path_gradients
+
+    def compute_distances(self, x):
+        """Return the distances between embedded inputs and embedded prototypes."""
+        protos, _ = self.proto_layer()
+        flat_x = x.view(x.size(0), -1)
+        flat_protos = protos.view(protos.size(0), -1)
+        latent_x = self.backbone(flat_x)
+
+        # Remember whether the backbone is trainable at all, so that its
+        # state can be restored after the prototype pass.
+        trainable = any(p.requires_grad for p in self.backbone.parameters())
+
+        self.backbone.requires_grad_(trainable and self.both_path_gradients)
+        latent_protos = self.backbone(flat_protos)
+        self.backbone.requires_grad_(trainable)
+
+        return self.distance_layer(latent_x, latent_protos)
+
+    def predict_latent(self, x, map_protos=True):
+        """Predict `x` assuming it is already embedded in the latent space.
+
+        Only the prototypes are embedded in the latent space using the
+        backbone.
+
+        """
+        self.eval()
+        with torch.no_grad():
+            protos, plabels = self.proto_layer()
+            latent_protos = self.backbone(protos) if map_protos else protos
+            distances = self.distance_layer(x, latent_protos)
+            return wtac(distances, plabels)
+
+
+class LVQMLN(SiameseGLVQ):
+    """Learning Vector Quantization Multi-Layer Network.
+
+    GLVQ model that applies an arbitrary transformation on the inputs, BUT NOT
+    on the prototypes before computing the distances between them. This, of
+    course, means that the prototypes no longer live in the input space, but
+    rather in the embedding space.
+
+    """
+
+    def compute_distances(self, x):
+        """Return the distances between the embedded inputs and the raw prototypes."""
+        prototypes, _ = self.proto_layer()
+        return self.distance_layer(self.backbone(x), prototypes)
+
+
+class GRLVQ(SiameseGLVQ):
+    """Generalized Relevance Learning Vector Quantization.
+
+    Implemented as a Siamese network with a linear transformation backbone.
+
+    TODO Make a RelevanceLayer. `bb_lr` is ignored otherwise.
+
+    """
+    _relevances: torch.Tensor
+
+    def __init__(self, hparams, **kwargs):
+        """Register one relevance per input dimension and use it as backbone."""
+        super().__init__(hparams, **kwargs)
+
+        # Additional parameters
+        relevances = torch.ones(self.hparams["input_dim"], device=self.device)
+        self.register_parameter("_relevances", Parameter(relevances))
+
+        # Override the backbone
+        self.backbone = LambdaLayer(self._apply_relevances,
+                                    name="relevance scaling")
+
+    def _apply_relevances(self, x):
+        """Scale every feature of ``x`` by its relevance."""
+        # Element-wise scaling equals `x @ diag(relevances)` but avoids
+        # building a dense (input_dim x input_dim) matrix, and keeps a
+        # non-finite feature from spreading across its row via `0 * inf`.
+        return x * self._relevances
+
+    @property
+    def relevance_profile(self):
+        """Detached CPU copy of the relevance vector."""
+        return self._relevances.detach().cpu()
+
+    def extra_repr(self):
+        return f"(relevances): (shape: {tuple(self._relevances.shape)})"
+
+
+class SiameseGMLVQ(SiameseGLVQ):
+    """Generalized Matrix Learning Vector Quantization.
+
+    Implemented as a Siamese network with a linear transformation backbone.
+
+    """
+
+    def __init__(self, hparams, **kwargs):
+        """Replace the backbone by a ``LinearTransform`` (input_dim -> latent_dim)."""
+        super().__init__(hparams, **kwargs)
+
+        # Override the backbone
+        initializer = kwargs.get("omega_initializer",
+                                 EyeLinearTransformInitializer())
+        input_dim = self.hparams["input_dim"]
+        latent_dim = self.hparams["latent_dim"]
+        self.backbone = LinearTransform(input_dim,
+                                        latent_dim,
+                                        initializer=initializer)
+
+    @property
+    def omega_matrix(self):
+        """The backbone's weights (returned as-is, not detached)."""
+        return self.backbone.weights
+
+    @property
+    def lambda_matrix(self):
+        """Detached CPU copy of ``omega @ omega.T``."""
+        omega = self.backbone.weights  # (input_dim, latent_dim)
+        return (omega @ omega.T).detach().cpu()
+
+
+class GMLVQ(GLVQ):
+    """Generalized Matrix Learning Vector Quantization.
+
+    Implemented as a regular GLVQ network that simply uses a different distance
+    function. This makes it easier to implement a localized variant.
+
+    """
+
+    # Parameters
+    _omega: torch.Tensor
+
+    def __init__(self, hparams, **kwargs):
+        """Set up GLVQ with ``omega_distance`` and register the ``_omega`` matrix."""
+        distance_fn = kwargs.pop("distance_fn", omega_distance)
+        super().__init__(hparams, distance_fn=distance_fn, **kwargs)
+
+        # Additional parameters
+        initializer = kwargs.get("omega_initializer",
+                                 EyeLinearTransformInitializer())
+        input_dim = self.hparams["input_dim"]
+        latent_dim = self.hparams["latent_dim"]
+        initial_omega = initializer.generate(input_dim, latent_dim)
+        self.register_parameter("_omega", Parameter(initial_omega))
+
+    @property
+    def omega_matrix(self):
+        """Detached CPU copy of ``_omega``."""
+        return self._omega.detach().cpu()
+
+    @property
+    def lambda_matrix(self):
+        """Detached CPU copy of ``omega @ omega.T``."""
+        omega = self._omega.detach()  # (input_dim, latent_dim)
+        return (omega @ omega.T).detach().cpu()
+
+    def compute_distances(self, x):
+        """Return the distance-layer output for ``x``, the prototypes and ``_omega``."""
+        prototypes, _ = self.proto_layer()
+        return self.distance_layer(x, prototypes, self._omega)
+
+    def extra_repr(self):
+        return f"(omega): (shape: {tuple(self._omega.shape)})"
+
+
+class LGMLVQ(GMLVQ):
+    """Localized and Generalized Matrix Learning Vector Quantization."""
+
+    def __init__(self, hparams, **kwargs):
+        """Set up GMLVQ with ``lomega_distance`` and one omega per prototype."""
+        distance_fn = kwargs.pop("distance_fn", lomega_distance)
+        super().__init__(hparams, distance_fn=distance_fn, **kwargs)
+
+        # Re-register `_omega` to override the one from the super class:
+        # one randomly initialized matrix per prototype.
+        omega_shape = (
+            self.num_prototypes,
+            self.hparams["input_dim"],
+            self.hparams["latent_dim"],
+        )
+        local_omegas = torch.randn(*omega_shape, device=self.device)
+        self.register_parameter("_omega", Parameter(local_omegas))
+
+
+class GTLVQ(LGMLVQ):
+    """Localized and Generalized Tangent Learning Vector Quantization."""
+
+    def __init__(self, hparams, **kwargs):
+        """Set up LGMLVQ with ``ltangent_distance`` and initialize the subspaces.
+
+        With an ``omega_initializer`` every prototype starts from a copy of
+        the same generated subspace; without one the subspaces are drawn
+        with ``torch.rand``.
+        """
+        distance_fn = kwargs.pop("distance_fn", ltangent_distance)
+        super().__init__(hparams, distance_fn=distance_fn, **kwargs)
+
+        input_dim = self.hparams["input_dim"]
+        latent_dim = self.hparams["latent_dim"]
+        omega_initializer = kwargs.get("omega_initializer")
+
+        if omega_initializer is None:
+            omega = torch.rand(
+                self.num_prototypes,
+                input_dim,
+                latent_dim,
+                device=self.device,
+            )
+        else:
+            subspace = omega_initializer.generate(input_dim, latent_dim)
+            omega = torch.repeat_interleave(
+                subspace.unsqueeze(0),
+                self.num_prototypes,
+                dim=0,
+            )
+
+        # Re-register `_omega` to override the one from the super class.
+        self.register_parameter("_omega", Parameter(omega))
+
+    def on_train_batch_end(self, outputs, batch, batch_idx):
+        """Re-orthogonalize ``_omega`` in place after every training batch."""
+        with torch.no_grad():
+            orthogonal = orthogonalization(self._omega)
+            self._omega.copy_(orthogonal)
+
+
+class SiameseGTLVQ(SiameseGLVQ, GTLVQ):
+    """Generalized Tangent Learning Vector Quantization.
+
+    Implemented as a Siamese network with a linear transformation backbone.
+
+    No body of its own: it only combines ``SiameseGLVQ`` with ``GTLVQ``.
+
+    """
+
+
+class GLVQ1(GLVQ):
+    """Generalized Learning Vector Quantization 1."""
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+        # Replace the GLVQ loss set up by the base class.
+        self.loss = LossLayer(lvq1_loss)
+        # NOTE(review): assigned after the base init, so this overrides any
+        # `optimizer` passed via kwargs.
+        self.optimizer = torch.optim.SGD
+
+
+class GLVQ21(GLVQ):
+    """Generalized Learning Vector Quantization 2.1."""
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+        # Replace the GLVQ loss set up by the base class.
+        self.loss = LossLayer(lvq21_loss)
+        # NOTE(review): assigned after the base init, so this overrides any
+        # `optimizer` passed via kwargs.
+        self.optimizer = torch.optim.SGD
+
+
+class ImageGLVQ(ImagePrototypesMixin, GLVQ):
+    """GLVQ for training on image data.
+
+    GLVQ model that constrains the prototypes to the range [0, 1] by clamping
+    after updates.
+
+    No body of its own: it only combines ``ImagePrototypesMixin`` with ``GLVQ``.
+
+    """
+
+
+class ImageGMLVQ(ImagePrototypesMixin, GMLVQ):
+    """GMLVQ for training on image data.
+
+    GMLVQ model that constrains the prototypes to the range [0, 1] by clamping
+    after updates.
+
+    No body of its own: it only combines ``ImagePrototypesMixin`` with ``GMLVQ``.
+
+    """
+
+
+class ImageGTLVQ(ImagePrototypesMixin, GTLVQ):
+    """GTLVQ for training on image data.
+
+    GTLVQ model that constrains the prototypes to the range [0, 1] by clamping
+    after updates.
+
+    """
+
+    def on_train_batch_end(self, outputs, batch, batch_idx):
+        """Clamp the components to [0, 1], then re-orthogonalize ``_omega``."""
+        # The mixin's hook does not chain to the next class in the MRO, so
+        # both parent hooks are invoked explicitly: clamp first, then
+        # orthogonalize.
+        ImagePrototypesMixin.on_train_batch_end(self, outputs, batch, batch_idx)
+        GTLVQ.on_train_batch_end(self, outputs, batch, batch_idx)
--- a/src/prototorch/models/knn.py
+++ b/src/prototorch/models/knn.py
@@ -0,0 +1,45 @@
+"""ProtoTorch KNN model."""
+
+import warnings
+
+from prototorch.core.competitions import KNNC
+from prototorch.core.components import LabeledComponents
+from prototorch.core.initializers import (
+    LiteralCompInitializer,
+    LiteralLabelsInitializer,
+)
+from prototorch.utils.utils import parse_data_arg
+
+from .abstract import SupervisedPrototypeModel
+
+
+class KNN(SupervisedPrototypeModel):
+    """K-Nearest-Neighbors classifier that stores `data` as its components."""
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, skip_proto_layer=True, **kwargs)
+
+        # `k` falls back to 1 when the caller did not put it in `hparams`.
+        self.hparams.setdefault("k", 1)
+
+        dataset = kwargs.get("data")
+        if dataset is None:
+            raise ValueError("KNN requires data, but was not provided!")
+        samples, labels = parse_data_arg(dataset)
+
+        # One component per sample, initialized literally from the data.
+        self.proto_layer = LabeledComponents(
+            distribution=[1] * len(samples),
+            components_initializer=LiteralCompInitializer(samples),
+            labels_initializer=LiteralLabelsInitializer(labels))
+        self.competition_layer = KNNC(k=self.hparams.k)
+
+    def training_step(self, train_batch, batch_idx, optimizer_idx=None):
+        return 1  # placeholder value; nothing is trained
+
+    def on_train_batch_start(self, train_batch, batch_idx):
+        warnings.warn("k-NN has no training, skipping!")
+        return -1  # presumably Lightning's "skip this epoch" signal -- confirm
+
+    def configure_optimizers(self):
+        return None
--- a/src/prototorch/models/lvq.py
+++ b/src/prototorch/models/lvq.py
@@ -0,0 +1,128 @@
+"""LVQ models that are optimized using non-gradient methods."""
+
+import logging
+
+from prototorch.core.losses import _get_dp_dm
+from prototorch.nn.activations import get_activation
+from prototorch.nn.wrappers import LambdaLayer
+
+from .abstract import NonGradientMixin
+from .glvq import GLVQ
+
+
+class LVQ1(NonGradientMixin, GLVQ):
+    """Learning Vector Quantization 1 (prototypes updated without gradients)."""
+
+    def training_step(self, train_batch, batch_idx, optimizer_idx=None):
+        protos, plabels = self.proto_layer()
+        x, y = train_batch
+        dis = self.compute_distances(x)  # batch distances, used for logging only
+        # TODO Vectorized implementation
+        # One sample at a time: attract the winner if it predicts `yi`, else repel.
+        for xi, yi in zip(x, y):
+            d = self.compute_distances(xi.view(1, -1))
+            preds = self.competition_layer(d, plabels)
+            w = d.argmin(1)
+            if yi == preds:
+                shift = xi - protos[w]
+            else:
+                shift = protos[w] - xi
+            updated_protos = protos + 0.0  # new tensor; `protos` is only read
+            updated_protos[w] = protos[w] + (self.hparams.lr * shift)
+            self.proto_layer.load_state_dict({"_components": updated_protos},
+                                             strict=False)
+
+        logging.debug(f"dis={dis}")
+        logging.debug(f"y={y}")
+        # Logging
+        self.log_acc(dis, y, tag="train_acc")
+
+        return None
+
+
+class LVQ21(NonGradientMixin, GLVQ):
+    """Learning Vector Quantization 2.1."""
+
+    def training_step(self, train_batch, batch_idx, optimizer_idx=None):
+        """Update two prototypes per sample without gradients; return None."""
+        x, y = train_batch
+        protos, plabels = self.proto_layer()
+        lr = self.hparams.lr
+        dis = self.compute_distances(x)  # only consumed by `log_acc` below
+        # TODO Vectorized implementation
+
+        for xi, yi in zip(x, y):
+            xi, yi = xi.view(1, -1), yi.view(1, )
+            d = self.compute_distances(xi)
+            # Indices of the two prototypes `_get_dp_dm` selects for `yi`.
+            (_, ip), (_, im) = _get_dp_dm(d, yi, plabels, with_indices=True)
+            # Pull `ip` towards the sample and push `im` away from it.
+            moved = protos + 0.0
+            moved[ip] = protos[ip] + (lr * (xi - protos[ip]))
+            moved[im] = protos[im] + (lr * (protos[im] - xi))
+            self.proto_layer.load_state_dict({"_components": moved},
+                                             strict=False)
+
+        # Logging
+        self.log_acc(dis, y, tag="train_acc")
+
+        return None
+
+
+class MedianLVQ(NonGradientMixin, GLVQ):
+    """Median LVQ: prototypes are moved onto samples of the training batch.
+
+    A prototype is replaced by the first sample that raises `lower_bound`.
+    TODO Avoid computing distances over and over
+    """
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+        # Activation selected by `hparams.transfer_fn`, wrapped as a layer.
+        self.transfer_layer = LambdaLayer(
+            get_activation(self.hparams.transfer_fn))
+
+    def _f(self, x, y, protos, plabels):
+        d = self.distance_layer(x, protos)
+        dp, dm = _get_dp_dm(d, y, plabels)
+        mu = (dp - dm) / (dp + dm)  # relative difference of the two distances
+        invmu = -1.0 * mu
+        f = self.transfer_layer(invmu, beta=self.hparams.transfer_beta) + 1.0
+        return f
+
+    def expectation(self, x, y, protos, plabels):
+        f = self._f(x, y, protos, plabels)
+        gamma = f / f.sum()  # `f` normalized to sum to one
+        return gamma
+
+    def lower_bound(self, x, y, protos, plabels, gamma):
+        f = self._f(x, y, protos, plabels)
+        lower_bound = (gamma * f.log()).sum()  # gamma-weighted sum of log(f)
+        return lower_bound
+
+    def training_step(self, train_batch, batch_idx, optimizer_idx=None):
+        protos, plabels = self.proto_layer()
+        # NOTE(review): `dis` is taken before any update below -- confirm intent.
+        x, y = train_batch
+        dis = self.compute_distances(x)
+        # One expectation/maximization pass per prototype index `i`.
+        for i, _ in enumerate(protos):
+            # Expectation step
+            gamma = self.expectation(x, y, protos, plabels)
+            lower_bound = self.lower_bound(x, y, protos, plabels, gamma)
+            # Candidates for prototype `i` are tried in batch order; first gain wins.
+            # Maximization step
+            _protos = protos + 0
+            for k, xk in enumerate(x):
+                _protos[i] = xk
+                _lower_bound = self.lower_bound(x, y, _protos, plabels, gamma)
+                if _lower_bound > lower_bound:
+                    logging.debug(f"Updating prototype {i} to data {k}...")
+                    self.proto_layer.load_state_dict({"_components": _protos},
+                                                     strict=False)
+                    break
+
+        # Logging
+        self.log_acc(dis, y, tag="train_acc")
+
+        return None
--- a/src/prototorch/models/probabilistic.py
+++ b/src/prototorch/models/probabilistic.py
@@ -0,0 +1,131 @@
+"""Probabilistic GLVQ methods"""
+
+import torch
+from prototorch.core.losses import nllr_loss, rslvq_loss
+from prototorch.core.pooling import (
+    stratified_min_pooling,
+    stratified_sum_pooling,
+)
+from prototorch.nn.wrappers import LossLayer
+
+from .extras import GaussianPrior, RankScaledGaussianPrior
+from .glvq import GLVQ, SiameseGMLVQ
+
+
+class CELVQ(GLVQ):
+    """Cross-Entropy Learning Vector Quantization."""
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+
+        # Loss: plain cross-entropy on the scores built in `shared_step`.
+        self.loss = torch.nn.CrossEntropyLoss()
+
+    def shared_step(self, batch, batch_idx, optimizer_idx=None):
+        """Return `(distances, loss)` for one batch."""
+        inputs, targets = batch
+        distances = self.compute_distances(inputs)  # [None, num_protos]
+        _, plabels = self.proto_layer()
+        # Pool to [None, num_classes] and negate, so a small distance scores high.
+        logits = -1.0 * stratified_min_pooling(distances, plabels)
+        loss = self.loss(logits, targets.long()).sum()
+        return distances, loss
+
+
+class ProbabilisticLVQ(GLVQ):
+    """Base class of the probabilistic LVQ models (SLVQ, RSLVQ, PLVQ)."""
+
+    def __init__(self, hparams, rejection_confidence=0.0, **kwargs):
+        super().__init__(hparams, **kwargs)
+        self.rejection_confidence = rejection_confidence
+        self._conditional_distribution = None  # assigned by the subclasses
+
+    def forward(self, x):
+        """Return one score per class: prototype posteriors summed by label."""
+        distances = self.compute_distances(x)
+        conditional = self.conditional_distribution(distances)
+        prior = (1. / self.num_prototypes) * torch.ones(self.num_prototypes,
+                                                        device=self.device)
+        posterior = conditional * prior
+
+        plabels = self.proto_layer._labels
+        # Test the dtype instead of the legacy `torch.LongTensor` classes so
+        # that int64 labels are accepted on every device, not just CPU/CUDA.
+        if not (isinstance(plabels, torch.Tensor)
+                and plabels.dtype == torch.long):
+            raise ValueError("Labels must be LongTensor.")
+        return stratified_sum_pooling(posterior, plabels)
+
+    def predict(self, x):
+        """Return the best class per sample, or -1 below the rejection level."""
+        y_pred = self.forward(x)
+        confidence, prediction = torch.max(y_pred, dim=1)
+        prediction[confidence < self.rejection_confidence] = -1
+        return prediction
+
+    def training_step(self, batch, batch_idx, optimizer_idx=None):
+        """Return the loss of one batch, summed to a scalar."""
+        x, y = batch
+        out = self.forward(x)
+        _, plabels = self.proto_layer()
+        batch_loss = self.loss(out, y, plabels)
+        return batch_loss.sum()
+
+    def conditional_distribution(self, distances):
+        """Apply the configured distribution to `distances`; raise if unset."""
+        if self._conditional_distribution is None:
+            raise ValueError("Conditional distribution is not set.")
+        return self._conditional_distribution(distances)
+
+
+class SLVQ(ProbabilisticLVQ):
+    """Soft Learning Vector Quantization."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Fall back to a variance of 1.0 when the hparams do not provide one.
+        self.hparams.setdefault("variance", 1.0)
+
+        self._conditional_distribution = GaussianPrior(
+            self.hparams.get("variance"))
+        self.loss = LossLayer(nllr_loss)
+
+
+class RSLVQ(ProbabilisticLVQ):
+    """Robust Soft Learning Vector Quantization."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # A missing `variance` hyperparameter falls back to 1.0.
+        self.hparams.setdefault("variance", 1.0)
+        prior = GaussianPrior(self.hparams.get("variance"))
+
+        self._conditional_distribution = prior
+        self.loss = LossLayer(rslvq_loss)
+
+
+class PLVQ(ProbabilisticLVQ, SiameseGMLVQ):
+    """Probabilistic Learning Vector Quantization.
+
+    TODO: Use Backbone LVQ instead
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Default hparams
+        self.hparams.setdefault("lambda", 1.0)
+        lam = self.hparams.get("lambda", 1.0)
+        # `conditional_distribution()` reads this private attribute.
+        self._conditional_distribution = RankScaledGaussianPrior(lam)
+        self.loss = torch.nn.KLDivLoss()
+
+    # FIXME: the inherited `training_step` passes three arguments to `self.loss`.
+    # def training_step(self, batch, batch_idx, optimizer_idx=None):
+    #     x, y = batch
+    #     y_pred = self(x)
+    #     batch_loss = self.loss(y_pred, y)
+    #     loss = batch_loss.sum()
+    #     return loss
--- a/src/prototorch/models/unsupervised.py
+++ b/src/prototorch/models/unsupervised.py
@@ -0,0 +1,154 @@
+"""Unsupervised prototype learning algorithms."""
+
+import numpy as np
+import torch
+from prototorch.core.competitions import wtac
+from prototorch.core.distances import squared_euclidean_distance
+from prototorch.core.losses import NeuralGasEnergy
+
+from .abstract import NonGradientMixin, UnsupervisedPrototypeModel
+from .callbacks import GNGCallback
+from .extras import ConnectionTopology
+
+
+class KohonenSOM(NonGradientMixin, UnsupervisedPrototypeModel):
+    """Kohonen Self-Organizing-Map on the `(h, w)` grid `hparams["shape"]`.
+
+    TODO Allow non-2D grids
+    NOTE(review): is `on_training_epoch_end` a hook Lightning calls? Confirm.
+    """
+    _grid: torch.Tensor
+
+    def __init__(self, hparams, **kwargs):
+        h, w = hparams.get("shape")
+        # Overwrites `num_prototypes` in the caller's `hparams` with h * w.
+        hparams["num_prototypes"] = h * w
+        distance_fn = kwargs.pop("distance_fn", squared_euclidean_distance)
+        super().__init__(hparams, distance_fn=distance_fn, **kwargs)
+
+        # Hyperparameters
+        self.save_hyperparameters(hparams)
+
+        # Default hparams
+        self.hparams.setdefault("alpha", 0.3)
+        self.hparams.setdefault("sigma", max(h, w) / 2.0)
+
+        # Grid coordinates: `_grid[i, j]` holds `(i, j)`, so it is (h, w, 2).
+        x, y = torch.arange(h), torch.arange(w)
+        grid = torch.stack(torch.meshgrid(x, y, indexing="ij"), dim=-1)
+        self.register_buffer("_grid", grid)
+        self._sigma = self.hparams.sigma  # width used in `training_step`
+        self._lr = self.hparams.lr
+
+    def predict_from_distances(self, distances):
+        grid = self._grid.view(-1, 2)  # one (row, col) pair per grid cell
+        wp = wtac(distances, grid)  # presumably the winner's grid coordinate
+        return wp
+
+    def training_step(self, train_batch, batch_idx):
+        # Only `train_batch[0]` is used; any further batch entries are ignored.
+        # TODO Check if the batch has labels
+        x = train_batch[0]
+        d = self.compute_distances(x)
+        wp = self.predict_from_distances(d)
+        grid = self._grid.view(-1, 2)
+        gd = squared_euclidean_distance(wp, grid)
+        nh = torch.exp(-gd / self._sigma**2)  # shrinks as `gd` grows
+        protos = self.proto_layer()
+        diff = x.unsqueeze(dim=1) - protos
+        delta = self._lr * self.hparams.alpha * nh.unsqueeze(-1) * diff
+        updated_protos = protos + delta.sum(dim=0)  # sum over dim 0 of `delta`
+        self.proto_layer.load_state_dict(
+            {"_components": updated_protos},
+            strict=False,
+        )
+
+    def on_training_epoch_end(self, training_step_outputs):
+        self._sigma = self.hparams.sigma * np.exp(
+            -self.current_epoch / self.trainer.max_epochs)
+
+    def extra_repr(self):
+        return f"(grid): (shape: {tuple(self._grid.shape)})"
+
+
+class HeskesSOM(UnsupervisedPrototypeModel):  # stub, see `training_step`
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)  # no setup beyond the parent's
+
+    def training_step(self, train_batch, batch_idx):
+        # TODO Implement me! Until then every call of this method raises.
+        raise NotImplementedError()
+
+
+class NeuralGas(UnsupervisedPrototypeModel):
+    """Neural Gas: `NeuralGasEnergy` loss plus a `ConnectionTopology` layer."""
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+
+        self.save_hyperparameters(hparams)
+
+        # Defaults for the hparams the two layers below read.
+        for key, fallback in (("age_limit", 10), ("lm", 1)):
+            self.hparams.setdefault(key, fallback)
+
+        hp = self.hparams
+        self.energy_layer = NeuralGasEnergy(lm=hp["lm"])
+        self.topology_layer = ConnectionTopology(
+            agelimit=hp["age_limit"],
+            num_prototypes=hp["num_prototypes"],
+        )
+
+    def training_step(self, train_batch, batch_idx):
+        """Return the batch energy; also feed the distances to the topology."""
+        # TODO Check if the batch has labels
+        distances = self.compute_distances(train_batch[0])
+        energy, _ = self.energy_layer(distances)
+        self.topology_layer(distances)
+        self.log("loss", energy)
+        return energy
+
+
+class GrowingNeuralGas(NeuralGas):
+    errors: torch.Tensor  # buffer with one accumulated error per prototype
+
+    def __init__(self, hparams, **kwargs):
+        super().__init__(hparams, **kwargs)
+
+        # Defaults
+        self.hparams.setdefault("step_reduction", 0.5)
+        self.hparams.setdefault("insert_reduction", 0.1)
+        self.hparams.setdefault("insert_freq", 10)
+
+        errors = torch.zeros(
+            self.hparams["num_prototypes"],
+            device=self.device,
+        )
+        self.register_buffer("errors", errors)
+
+    def training_step(self, train_batch, _batch_idx):
+        """Return the batch energy and accumulate the per-prototype errors."""
+        # TODO Check if the batch has labels
+        x = train_batch[0]
+        d = self.compute_distances(x)
+        loss, order = self.energy_layer(d)
+        winner = order[:, 0]
+        mask = torch.zeros_like(d)
+        mask[torch.arange(len(mask)), winner] = 1.0
+        dp = d * mask  # keeps only each sample's winner distance
+        # Per-prototype sum: a prototype only pays for the samples it won.
+        self.errors += torch.sum(dp * dp, dim=0)
+        self.errors *= self.hparams["step_reduction"]
+
+        self.topology_layer(d)
+        self.log("loss", loss)
+        return loss
+
+    def configure_callbacks(self):
+        return [
+            GNGCallback(
+                reduction=self.hparams["insert_reduction"],
+                freq=self.hparams["insert_freq"],
+            )
+        ]
--- a/src/prototorch/models/vis.py
+++ b/src/prototorch/models/vis.py
@@ -0,0 +1,363 @@
+"""Visualization Callbacks."""
+
+import warnings
+from typing import Sized
+
+import numpy as np
+import pytorch_lightning as pl
+import torch
+import torchvision
+from matplotlib import pyplot as plt
+from prototorch.utils.colors import get_colors, get_legend_handles
+from prototorch.utils.utils import mesh2d
+from pytorch_lightning.loggers import TensorBoardLogger
+from torch.utils.data import DataLoader, Dataset
+
+
+class Vis2DAbstract(pl.Callback):
+    """Base callback that redraws one matplotlib figure after every epoch.
+
+    Subclasses implement `visualize`. `data` may be a sized `Dataset`, a
+    `DataLoader` or an `(x, y)` pair; it is loaded in full into `x_train`
+    and `y_train`.
+    """
+
+    def __init__(self,
+                 data=None,
+                 title="Prototype Visualization",
+                 cmap="viridis",
+                 xlabel="Data dimension 1",
+                 ylabel="Data dimension 2",
+                 legend_labels=None,
+                 border=0.1,
+                 resolution=100,
+                 flatten_data=True,
+                 axis_off=False,
+                 show_protos=True,
+                 show=True,
+                 tensorboard=False,
+                 show_last_only=False,
+                 pause_time=0.1,
+                 block=False):
+        super().__init__()
+
+        self.x_train, self.y_train = None, None
+        if data:
+            if isinstance(data, Dataset):
+                if isinstance(data, Sized):
+                    # One batch of size len(data) yields the whole dataset.
+                    x, y = next(iter(DataLoader(data, batch_size=len(data))))
+                else:
+                    # TODO: Add support for non-sized datasets
+                    raise NotImplementedError(
+                        "Data must be a dataset with a __len__ method.")
+            elif isinstance(data, DataLoader):
+                # Concatenate every batch the loader yields.
+                x = torch.tensor([])
+                y = torch.tensor([])
+                for x_b, y_b in data:
+                    x = torch.cat([x, x_b])
+                    y = torch.cat([y, y_b])
+            else:
+                x, y = data
+
+            if flatten_data:
+                x = x.reshape(len(x), -1)
+            self.x_train, self.y_train = x, y
+
+        self.title = title
+        self.xlabel = xlabel
+        self.ylabel = ylabel
+        self.legend_labels = legend_labels
+        self.fig = plt.figure(self.title)
+        self.cmap = cmap
+        self.border = border
+        self.resolution = resolution
+        self.axis_off = axis_off
+        self.show_protos = show_protos
+        self.show = show
+        self.tensorboard = tensorboard
+        self.show_last_only = show_last_only
+        self.pause_time = pause_time
+        self.block = block
+
+    def precheck(self, trainer):
+        """Tell whether the current epoch should be visualized."""
+        if not self.show_last_only:
+            return True
+        return trainer.current_epoch == trainer.max_epochs - 1
+
+    def setup_ax(self):
+        """Clear the current axes, re-apply title and labels, return them."""
+        ax = self.fig.gca()
+        ax.cla()
+        ax.set_title(self.title)
+        ax.set_xlabel(self.xlabel)
+        ax.set_ylabel(self.ylabel)
+        if self.axis_off:
+            ax.axis("off")
+        return ax
+
+    def plot_data(self, ax, x, y):
+        """Scatter the first two columns of `x` as circles colored by `y`."""
+        ax.scatter(x[:, 0], x[:, 1], c=y, cmap=self.cmap,
+                   edgecolor="k", marker="o", s=30)
+
+    def plot_protos(self, ax, protos, plabels):
+        """Scatter the first two columns of `protos` as diamonds."""
+        ax.scatter(protos[:, 0], protos[:, 1], c=plabels, cmap=self.cmap,
+                   edgecolor="k", marker="D", s=50)
+
+    def add_to_tensorboard(self, trainer, pl_module):
+        """Log the figure to `pl_module.logger.experiment` for this epoch."""
+        tb = pl_module.logger.experiment
+        tb.add_figure(tag=f"{self.title}",
+                      figure=self.fig,
+                      global_step=trainer.current_epoch,
+                      close=False)
+
+    def log_and_display(self, trainer, pl_module):
+        """Send the figure to TensorBoard and/or the screen, as configured."""
+        if self.tensorboard:
+            self.add_to_tensorboard(trainer, pl_module)
+        if self.show:
+            if not self.block:
+                plt.pause(self.pause_time)
+            else:
+                plt.show(block=self.block)
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        """Redraw and publish the figure unless `precheck` vetoes the epoch."""
+        if not self.precheck(trainer):
+            return True
+        self.visualize(pl_module)
+        self.log_and_display(trainer, pl_module)
+
+    def on_train_end(self, trainer, pl_module):
+        """Close this callback's own figure once training is over."""
+        # `plt.close()` without an argument closes whichever figure is current,
+        # which need not be ours when other figures are open.
+        plt.close(self.fig)
+
+    def visualize(self, pl_module):
+        """Draw onto `self.fig`; must be implemented by subclasses."""
+        raise NotImplementedError
+
+
+class VisGLVQ2D(Vis2DAbstract):
+    """Plot prototypes, optional data and the model's predictions on a mesh."""
+
+    def visualize(self, pl_module):
+        ax = self.setup_ax()
+        protos = pl_module.prototypes
+        self.plot_protos(ax, protos, pl_module.prototype_labels)
+
+        # The mesh spans the data as well whenever training data was given.
+        points = protos
+        if self.x_train is not None:
+            self.plot_data(ax, self.x_train, self.y_train)
+            points = np.vstack([self.x_train, protos])
+        mesh_input, xx, yy = mesh2d(points, self.border, self.resolution)
+
+        reference = pl_module.proto_layer._components
+        mesh_input = torch.from_numpy(mesh_input).type_as(reference)
+        y_pred = pl_module.predict(mesh_input).cpu().reshape(xx.shape)
+        ax.contourf(xx, yy, y_pred, cmap=self.cmap, alpha=0.35)
+
+
+class VisSiameseGLVQ2D(Vis2DAbstract):
+    """Visualize a Siamese model in the output space of its backbone."""
+
+    def __init__(self, *args, map_protos=True, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.map_protos = map_protos
+
+    def visualize(self, pl_module):
+        device = pl_module.device
+        protos = pl_module.prototypes
+        plabels = pl_module.prototype_labels
+        # Embed the data, and the prototypes too when `map_protos` is set.
+        with torch.no_grad():
+            latent = pl_module.backbone(torch.Tensor(self.x_train).to(device))
+            latent = latent.cpu().detach()
+            if self.map_protos:
+                protos = pl_module.backbone(torch.Tensor(protos).to(device))
+                protos = protos.cpu().detach()
+
+        ax = self.setup_ax()
+        self.plot_data(ax, latent, self.y_train)
+        mesh_points = latent
+        if self.show_protos:
+            self.plot_protos(ax, protos, plabels)
+            mesh_points = np.vstack((latent, protos))
+        mesh_input, xx, yy = mesh2d(mesh_points, self.border, self.resolution)
+        reference = pl_module.proto_layer._components
+        mesh_input = torch.Tensor(mesh_input).type_as(reference)
+        y_pred = pl_module.predict_latent(mesh_input,
+                                          map_protos=self.map_protos)
+        y_pred = y_pred.cpu().reshape(xx.shape)
+        ax.contourf(xx, yy, y_pred, cmap=self.cmap, alpha=0.35)
+
+
+class VisGMLVQ2D(Vis2DAbstract):
+    """Plot data and prototypes after a 2D projection derived from omega."""
+
+    def __init__(self, *args, ev_proj=True, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.ev_proj = ev_proj  # stored only; never read in this class
+
+    def visualize(self, pl_module):
+        protos = pl_module.prototypes
+        plabels = pl_module.prototype_labels
+        device = pl_module.device
+
+        # Projection matrix: first output of the rank-2 PCA of omega @ omega.T.
+        omega = pl_module._omega.detach()
+        u, _, _ = torch.pca_lowrank(omega @ omega.T, q=2)
+
+        with torch.no_grad():
+            x_train = torch.Tensor(self.x_train).to(device) @ u
+            x_train = x_train.cpu().detach()
+            if self.show_protos:
+                protos = torch.Tensor(protos).to(device) @ u
+                protos = protos.cpu().detach()
+
+        ax = self.setup_ax()
+        self.plot_data(ax, x_train, self.y_train)
+        if self.show_protos:
+            self.plot_protos(ax, protos, plabels)
+
+
+class VisCBC2D(Vis2DAbstract):
+    """Plot data, white components and the model's predictions on a mesh."""
+
+    def visualize(self, pl_module):
+        components = pl_module.components
+        ax = self.setup_ax()
+        self.plot_data(ax, self.x_train, self.y_train)
+        self.plot_protos(ax, components, "w")
+
+        stacked = np.vstack((self.x_train, components))
+        mesh_input, xx, yy = mesh2d(stacked, self.border, self.resolution)
+        reference = pl_module.components_layer._components
+        mesh_tensor = torch.Tensor(mesh_input).type_as(reference)
+        y_pred = pl_module.predict(mesh_tensor).cpu().reshape(xx.shape)
+
+        ax.contourf(xx, yy, y_pred, cmap=self.cmap, alpha=0.35)
+
+
+class VisNG2D(Vis2DAbstract):
+    """Plot data, prototypes and the edges of the model's topology layer."""
+
+    def visualize(self, pl_module):
+        protos = pl_module.prototypes
+        cmat = pl_module.topology_layer.cmat.cpu().numpy()
+
+        ax = self.setup_ax()
+        self.plot_data(ax, self.x_train, self.y_train)
+        self.plot_protos(ax, protos, "w")
+
+        # Draw a black line for every truthy upper-triangle entry of `cmat`.
+        num_protos = len(protos)
+        for src in range(num_protos):
+            for dst in range(src, num_protos):
+                if not cmat[src][dst]:
+                    continue
+                xs = [protos[src, 0], protos[dst, 0]]
+                ys = [protos[src, 1], protos[dst, 1]]
+                ax.plot(xs, ys, "k-")
+
+
+class VisSpectralProtos(Vis2DAbstract):
+    """Plot every prototype as a line colored by its label."""
+
+    def visualize(self, pl_module):
+        ax = self.setup_ax()
+        labels = pl_module.prototype_labels
+        colors = get_colors(vmax=max(labels), vmin=min(labels))
+        # `label` avoids shadowing the module-level `pl` alias.
+        for proto, label in zip(pl_module.prototypes, labels):
+            ax.plot(proto, c=colors[int(label)])
+
+        if not self.legend_labels:
+            return
+        handles = get_legend_handles(colors, self.legend_labels,
+                                     marker="lines")
+        ax.legend(handles=handles)
+
+
+class VisImgComp(Vis2DAbstract):
+    """Show the model's components (and optionally data) as image grids."""
+
+    def __init__(self,
+                 *args,
+                 random_data=0,
+                 dataformats="CHW",
+                 num_columns=2,
+                 add_embedding=False,
+                 embedding_data=100,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.random_data = random_data
+        self.dataformats = dataformats
+        self.num_columns = num_columns
+        self.add_embedding = add_embedding
+        self.embedding_data = embedding_data
+
+    def on_train_start(self, _, pl_module):
+        """Log a data embedding and/or random samples when enabled."""
+        if not isinstance(pl_module.logger, TensorBoardLogger):
+            warnings.warn(
+                f"TensorBoardLogger is required, got {type(pl_module.logger)}")
+            return
+        tb = pl_module.logger.experiment
+
+        # Add embedding
+        if self.add_embedding:
+            if self.x_train is None or self.y_train is None:
+                raise ValueError("No data for add embedding flag")
+            # Draw `embedding_data` distinct sample indices.
+            ind = np.random.choice(len(self.x_train),
+                                   size=self.embedding_data,
+                                   replace=False)
+            data = self.x_train[ind]
+            tb.add_embedding(data.view(len(ind), -1),
+                             label_img=data,
+                             global_step=None,
+                             tag="Data Embedding",
+                             metadata=self.y_train[ind],
+                             metadata_header=None)
+
+        # Random Data
+        if self.random_data:
+            if self.x_train is None:
+                raise ValueError("No data for random data flag")
+            # Draw `random_data` distinct sample indices.
+            ind = np.random.choice(len(self.x_train),
+                                   size=self.random_data,
+                                   replace=False)
+            data = self.x_train[ind]
+            grid = torchvision.utils.make_grid(data, nrow=self.num_columns)
+            tb.add_image(tag="Data",
+                         img_tensor=grid,
+                         global_step=None,
+                         dataformats=self.dataformats)
+
+    def add_to_tensorboard(self, trainer, pl_module):
+        """Log the components as one image grid for the current epoch."""
+        tb = pl_module.logger.experiment
+        grid = torchvision.utils.make_grid(pl_module.components,
+                                           nrow=self.num_columns)
+        tb.add_image(tag="Components",
+                     img_tensor=grid,
+                     global_step=trainer.current_epoch,
+                     dataformats=self.dataformats)
+
+    def visualize(self, pl_module):
+        """Show the component grid with matplotlib when `show` is enabled."""
+        if not self.show:
+            return
+        grid = torchvision.utils.make_grid(pl_module.components,
+                                           nrow=self.num_columns)
+        # Move the first axis of the grid last before handing it to imshow.
+        plt.imshow(grid.permute((1, 2, 0)).cpu(), cmap=self.cmap)