Source code for tno.quantum.ml.clustering.kmedoids._kmedoids

"""This module contains a scikit-learn compatible, quantum K-medoid clustering."""

from __future__ import annotations

import warnings
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any, SupportsFloat, SupportsInt, TypeVar, cast

import numpy as np
from numpy.typing import ArrayLike, NDArray
from sklearn.base import ClusterMixin, Tags
from sklearn.metrics.pairwise import pairwise_distances
from tno.quantum.ml.components import QUBOEstimator, get_default_solver_config_if_none
from tno.quantum.optimization.qubo.components import QUBO, SolverConfig
from tno.quantum.utils.validation import check_int, check_real

if TYPE_CHECKING:
    from tno.quantum.utils import BitVector

QKMedoidsT = TypeVar("QKMedoidsT", bound="QKMedoids")



[docs]
class QKMedoids(ClusterMixin, QUBOEstimator):  # type: ignore[misc]
    """Quantum K-medoid clustering."""


[docs]
    def __init__(  # noqa: PLR0913
        self,
        n_clusters: SupportsInt = 2,
        alpha: SupportsFloat = 1,
        beta: SupportsFloat = 1,
        gamma: SupportsFloat = 2,
        metric: str | Callable[[ArrayLike, ArrayLike], float] = "euclidean",
        solver_config: SolverConfig | Mapping[str, Any] | None = None,
    ) -> None:
        r"""Init ``QKMedoid`` clustering algorithm.

        The QKMedoid algorithm minimizes a double objective function with a constraint
        translated to a QUBO formulation. The first objective maximizes the distance
        between the medoids, while the second objective tries to make the medoid as
        centrally positioned as possible. The constraint restricts the number of medoids
        to be exactly equal to $k$. The QUBO is an implementation of the work shown in
        `[1] <https://ceur-ws.org/Vol-2454/paper_39.pdf>`_ and
        `[2] <https://link.springer.com/chapter/10.1007/978-3-031-40852-6_12>`_.
        Its cost function is given by:


        .. _QUBO cost function:
        .. math::

            \sum_{i=1}^N\sum_{j=1}^Nz_iz_j\left(\gamma-\frac{1}{2}\alpha\Delta_{ij}\right)
            +
            \sum_{i=1}^Nz_i\left(\beta\sum_{j=1}^N\Delta_{ij}-2\gamma k\right).

        **Details:**

        * $z_{i}$ are the binary decision variables.
        * $N$ is the number of datapoints.
        * $k$ is the number of clusters.
        * $\Delta_{ij}=1-\exp(-\frac{1}{2}d_{ij})$, where $d_{ij}$ is the distance between data point $i$ and $j$.
        * $\alpha$ is the weight parameter of the first objective, i.e., maximizing the distance between the medoids.
        * $\beta$ is the weight parameter of the second objective, i.e., centralizing the medoids.
        * $\gamma$ is the penalty parameter for the constraint that there must be exactly $k$ medoids.

        Args:
            n_clusters: The number of clusters to form ($k$). Must be a strictly
                positive integer value.
            alpha: Weight parameter for identifying far apart points. The weight is
                scaled by `1 / (number of clusters)`. Must be non-negative and defaults
                to 1.
            beta: Weight parameter for identifying central points. The weight is scaled
                by `1 / (number of samples)`. Must be non-negative and defaults to 1.
            gamma: Penalty parameter for the constraint that the solution contains
                exactly the specified number of clusters ($k$).The penalty is scaled
                by $\max_{i,j}\left(1-\exp(-\frac{1}{2}d_{ij})\right)$, where $d_{ij}$
                is the distance between data point $i$ and $j$. Must be non-negative and
                defaults to 2.
            metric: The metric to use when calculating distance between instances in a
                feature array, see :py:func:`sklearn.metrics.pairwise_distances`. If
                metric is a string, it must be one of the options allowed by
                :py:func:`scipy.spatial.distance.pdist`. If metric is ``"precomputed"``,
                X is assumed to be a distance matrix. If metric is a callable function,
                it is called on each pair of instances and the resulting values. The
                callable should take two arrays from X as input and return a value
                indicating their distance.
            solver_config: A QUBO solver configuration or None. In the former case
                includes name and options. If ``None`` is given, the default solver
                configuration as defined in
                :py:func:`~tno.quantum.ml.components.get_default_solver_config_if_none`
                will be used. Default is ``None``.

        Attributes:
            distance_: 2-D array of shape `(n_features, n_features)` containing the
                pairwise distances between each feature vector.
            medoid_indices_: 1-D array of length `n_clusters` containing the indices of
                the medoid row in the feature matrix $X$.
            cluster_centers_: 2-D array of shape `(n_cluster, n_features)` containing
                the cluster centers (elements from original dataset $X$).
            labels_: 1-D array of length `n_samples` containing the assigned labels for
                each sample.
            inertia_: The sum of distances of samples to their closest cluster center.
        """  # noqa: E501
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.metric = metric
        super().__init__(solver_config=solver_config)
        self.distance_: NDArray[np.float64]
        self.medoid_indices_: NDArray[np.int64]
        self.cluster_centers_: NDArray[np.float64]
        self.labels_: NDArray[np.int_]
        self.inertia_: float


    def _make_qubo(
        self, X: NDArray[np.float64], y: NDArray[np.float64] | None = None
    ) -> QUBO:
        """Create QUBO for k-medoid clustering.

        Args:
            X: Training instances to cluster. Shape is assumed to be
                `(n_samples, n_features)`.
            y: Ignored.

        Attributes:
            distance_: 2-D array of shape `(n_features, n_features)` containing the
                pairwise distances between each feature vector.

        Returns:
            A QUBO representation of K-medoid clustering.
        """
        # Check if the user has given valid input before doing any computations
        alpha = check_real(self.alpha, "alpha", l_bound=0)
        beta = check_real(self.beta, "beta", l_bound=0)
        gamma = check_real(self.gamma, "gamma", l_bound=0)
        n_clusters = check_int(self.n_clusters, "n_clusters", l_bound=1)

        # For each cluster, minimise the distance to the clusterhead
        self.distance_ = pairwise_distances(X, metric=self.metric)
        delta = 1 - np.exp(-0.5 * self.distance_)

        # Scale the hyper-parameters to the appropriate size
        n_features = len(X)
        scaled_alpha = alpha / n_clusters
        scaled_beta = beta / n_features
        scaled_gamma = gamma * np.max(delta)

        cost1 = scaled_gamma - 0.5 * scaled_alpha * delta
        cost2 = delta.sum(axis=0) * scaled_beta - 2 * scaled_gamma * self.n_clusters

        return QUBO(cost1 + np.diag(cost2))


[docs]
    def fit(self: QKMedoidsT, X: ArrayLike, y: ArrayLike | None = None) -> QKMedoidsT:
        """Fit the model to data matrix X.

        Args:
            X: The input data.
            y: Ignored.

        Returns:
            Returns self, a trained ``QKMedoids`` class.
        """
        return cast("QKMedoids", super().fit(X, y))


    def _check_X_and_y(  # noqa: N802
        self, X: NDArray[np.float64], y: NDArray[np.float64] | None = None
    ) -> None:
        """Check if `X` is a valid feature matrix.

        Args:
            X: The input data.
            y: Ignored.

        Raises:
            ValueError: If X does not contain at least 2 samples.
        """
        # Check at least 2 samples
        n_samples = len(X)
        if n_samples < 2:
            error_msg = (
                f"{self.__class__.__name__} requires at least 2 samples, "
                f"but got n_samples = {n_samples}."
            )
            raise ValueError(error_msg)

    def _check_constraints(self, bit_vector: BitVector) -> bool:
        """Check if the found bit vector satisfies the imposed constraints.

        In this model, the only constraint is that there are exactly `n_cluster` cluster
        heads assigned.

        Args:
            bit_vector: BitVector containing the solution to the QUBO.

        Returns:
            True if there are no violations, False otherwise
        """
        num_clusterheads = np.count_nonzero(bit_vector)
        if num_clusterheads != self.n_clusters:
            warnings.warn(
                stacklevel=2,
                message=f"Number of desired clusters is {self.n_clusters}, but "
                f"{num_clusterheads} clusters were assigned. Consider changing "
                "increasing 'gamma' or changing the number of clusters.",
            )
            return False
        return True

    def _decode_bit_vector(self: QKMedoidsT, bit_vector: BitVector) -> QKMedoidsT:
        """Decode bit vector and set attributes.

        Args:
            bit_vector : A bit vector containing the solution to the QUBO.

        Attributes:
            medoid_indices_: 1-D array of length `n_clusters` containing the indices of
                the medoid row in the feature matrix $X$.
            cluster_centers_: 2-D array of shape `(n_cluster, n_features)` containing
                the cluster centers (elements from original dataset $X$).
            labels_: 1-D array of length `n_samples` containing the assigned labels for
                each sample.
            inertia_: The sum of distances of samples to their closest cluster center.

        Returns:
            ``Self``
        """
        self.medoid_indices_ = np.flatnonzero(bit_vector)
        self.cluster_centers_ = self.X_[self.medoid_indices_]

        # Distance from each point to the medoids
        self.labels_ = self.distance_[:, self.medoid_indices_].argmin(axis=1)
        self.inertia_ = self._compute_inertia()

        return self

    def _compute_inertia(self) -> float:
        """Compute inertia of clustering.

        Inertia is defined as the sum of the sample distances to closest cluster
        centers.

        Returns:
            inertia: Sum of sample distances to closest cluster centers.
        """
        n_features = len(self.X_)
        data_indices = np.arange(n_features)
        cluster_indices = self.medoid_indices_[self.labels_]
        return float(self.distance_[data_indices, cluster_indices].sum())


[docs]
    def __sklearn_tags__(self) -> Tags:
        """Return estimator tags."""
        tags = super().__sklearn_tags__()
        solver_config = get_default_solver_config_if_none(self.solver_config)
        solver = solver_config.get_instance()
        tags.non_deterministic = solver.non_deterministic
        return tags