Source code for denspp.offline.metric.cluster_index
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics import calinski_harabasz_score, silhouette_score
[docs]
def calculate_euclidean_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """Calculate the Euclidean distance between two points.

    Both inputs are flattened before the comparison, so a row vector and a
    column vector holding the same coordinates are treated as the same point.

    Args:
        point1 (np.ndarray): Coordinates of the first point.
        point2 (np.ndarray): Coordinates of the second point.

    Raises:
        ValueError: If point1 and point2 are not of equal size.

    Returns:
        float: Euclidean distance between point1 and point2.
    """
    first = np.asarray(point1).ravel()
    second = np.asarray(point2).ravel()
    # Without this check NumPy would broadcast compatible-but-different
    # shapes (e.g. 3 elements vs. 1 element) and return a meaningless value
    # instead of raising the documented error.
    if first.size != second.size:
        raise ValueError(
            f"point1 and point2 must be of equal size, "
            f"got {first.size} and {second.size}"
        )
    # Cast the NumPy scalar so the annotated builtin float is returned.
    return float(np.linalg.norm(first - second))
[docs]
def calculate_dunn_index(data: np.ndarray, labels: np.ndarray) -> float:
    """Compute the Dunn index of a labelled dataset.

    The result is the smallest Euclidean distance between samples of two
    different clusters divided by the largest Euclidean distance between two
    samples of the same cluster.

    Args:
        data (np.ndarray): samples to evaluate
            -> each row describes a sample
            -> each column represents a different feature
        labels (np.ndarray): label of each sample

    Returns:
        float: Dunn index. 0.0 is returned when fewer than two distinct
            labels exist, when no cluster holds more than one sample, or
            when the largest cluster diameter is zero.
    """
    label_set = np.unique(labels)
    if len(label_set) < 2:
        return 0.0

    groups = [data[labels == value] for value in label_set]

    # --- Intra-cluster distances ---
    # Diameter of every cluster that holds at least two samples
    diameters = [
        np.max(cdist(group, group, metric="euclidean"))
        for group in groups
        if len(group) > 1
    ]
    if not diameters:
        return 0.0

    # --- Inter-cluster distances ---
    # Closest pair of samples for every unordered pair of clusters
    separations = [
        np.min(cdist(first, second, metric="euclidean"))
        for position, first in enumerate(groups)
        for second in groups[position + 1:]
    ]

    largest_diameter = max(diameters)
    smallest_separation = min(separations) if separations else 0.0

    # Avoid division by zero
    if largest_diameter == 0:
        return 0.0
    return float(smallest_separation / largest_diameter)
[docs]
def calculate_silhouette(data: np.ndarray, labels: np.ndarray) -> float:
    """Calculate the Silhouette index for a given dataset.

    Thin wrapper around ``sklearn.metrics.silhouette_score`` using its
    default settings; input validation is left to scikit-learn.

    Args:
        data (np.ndarray): samples to evaluate
            -> each row describes a sample
            -> each column represents a different feature
        labels (np.ndarray): label of each sample

    Returns:
        float: Floating with metric value
    """
    # scikit-learn hands back a NumPy scalar; cast it so callers always get
    # the builtin float promised by the annotation.
    return float(silhouette_score(data, labels))
[docs]
def calculate_harabasz(data: np.ndarray, labels: np.ndarray) -> float:
    """Calculate the Calinski-Harabasz index for a given dataset.

    Thin wrapper around ``sklearn.metrics.calinski_harabasz_score``; input
    validation is left to scikit-learn.

    Args:
        data (np.ndarray): samples to evaluate
            -> each row describes a sample
            -> each column represents a different feature
        labels (np.ndarray): label of each sample

    Returns:
        float: Floating with metric value
    """
    # Cast so the builtin float promised by the annotation is returned even
    # when scikit-learn yields a NumPy scalar.
    return float(calinski_harabasz_score(data, labels))