# Source code for denspp.offline.metric.cluster_index

from math import sqrt
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score, calinski_harabasz_score


def calculate_euclidean_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """Calculate the Euclidean distance between two points.

    Both inputs are converted to floating point before subtracting, so that
    integer inputs (e.g. ``uint8`` or ``int8``) cannot wrap around while the
    difference or its square is computed.

    Args:
        point1 (np.ndarray): Coordinates of the first point.
        point2 (np.ndarray): Coordinates of the second point.

    Raises:
        ValueError: If point1 and point2 are not of equal size.

    Returns:
        float: Euclidean distance between point1 and point2.
    """
    first = np.asarray(point1, dtype=float)
    second = np.asarray(point2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            f"point1 and point2 must have the same shape, "
            f"got {first.shape} and {second.shape}"
        )
    delta = first - second
    return sqrt(float(np.sum(delta * delta)))
def calculate_dunn_index(data: np.ndarray, labels: np.ndarray) -> float:
    """Calculate the Dunn index for a given dataset.

    The index is the smallest distance between two samples of different
    clusters divided by the largest distance between two samples of the same
    cluster.

    Args:
        data (np.ndarray): Samples to evaluate; each row describes a sample
            and each column represents a different feature.
        labels (np.ndarray): Cluster label of each sample.

    Returns:
        float: Dunn index. A small epsilon (1e-10) replaces the numerator when
            there is only one cluster, and is the lower bound of the
            denominator, so the result is always finite.
    """
    eps = 1e-10
    samples = np.asarray(data)
    sample_labels = np.asarray(labels)
    clusters = [samples[sample_labels == label] for label in np.unique(sample_labels)]

    # Cluster diameter: largest pairwise distance inside a cluster. Clusters
    # with a single sample have no pair and are skipped.
    intra_dists = [np.max(cdist(members, members)) for members in clusters if len(members) > 1]

    # Cluster separation: smallest distance between samples of each pair of
    # distinct clusters.
    inter_dists = [
        np.min(cdist(first, second))
        for idx, first in enumerate(clusters)
        for second in clusters[idx + 1:]
    ]

    max_intra = max(intra_dists, default=eps)
    min_inter = min(inter_dists, default=eps)
    # A cluster made only of identical samples has diameter 0; clamp to the
    # same epsilon used for the "no diameter available" case to avoid a
    # division by zero.
    return float(min_inter / max(max_intra, eps))
def calculate_silhouette(data: np.ndarray, labels: np.ndarray) -> float:
    """Calculate the Silhouette index for a given dataset.

    Thin wrapper that forwards both arguments to
    ``sklearn.metrics.silhouette_score`` with its default settings.

    Args:
        data (np.ndarray): Samples to evaluate; each row describes a sample
            and each column represents a different feature.
        labels (np.ndarray): Cluster label of each sample.

    Returns:
        float: Value reported by ``silhouette_score``.
    """
    score = silhouette_score(data, labels)
    return score
def calculate_harabasz(data: np.ndarray, labels: np.ndarray) -> float:
    """Calculate the Calinski-Harabasz index for a given dataset.

    Thin wrapper that forwards both arguments to
    ``sklearn.metrics.calinski_harabasz_score``.

    Args:
        data (np.ndarray): Samples to evaluate; each row describes a sample
            and each column represents a different feature.
        labels (np.ndarray): Cluster label of each sample.

    Returns:
        float: Value reported by ``calinski_harabasz_score``.
    """
    score = calinski_harabasz_score(data, labels)
    return score