Source code for bom_analyzer.analysis.cluster

from umap.umap_ import UMAP
from hdbscan import HDBSCAN
import numpy as np

'''
param_dict is formatted the same way as in the optimizer;
it should look like this:

   param_dict = {
        'min_cluster_size': 48,
        'min_samples': 16,
        'alpha': 0.9615277268640865,
        'n_neighbors': 598,
        'min_dist': 0.9483669074161485
    }

'''


def dimension_reduction(st_data: np.ndarray, param_dict: dict, seed: int) -> np.ndarray:
    """Project the input data down to two dimensions with UMAP.

    Args:
        st_data (np.ndarray): The array to embed. The module's original
            documentation describes it as sentence embeddings of higher
            dimensionality.
        param_dict (dict): Hyperparameter dictionary. Two entries are read:
            - n_neighbors: The number of neighbors UMAP considers for each
              data point.
            - min_dist: The minimum distance between embedded points.
            Any other entries (e.g. the HDBSCAN ones) are ignored here.
        seed (int): Passed to UMAP as ``random_state`` for reproducibility.

    Returns:
        np.ndarray: The output of ``UMAP.fit_transform`` with
        ``n_components=2``, i.e. the data embedded in 2D space.

    Raises:
        KeyError: If ``param_dict`` lacks 'n_neighbors' or 'min_dist'.
    """
    n_neighbors = param_dict['n_neighbors']
    min_dist = param_dict['min_dist']

    reducer = UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=seed,
        # Run single-threaded alongside the fixed seed.
        # NOTE(review): presumably chosen so results are reproducible - confirm.
        n_jobs=1,
    )
    return reducer.fit_transform(st_data)
def clustering(umap_data: np.ndarray, param_dict: dict) -> np.ndarray:
    """Assign a cluster label to every point using HDBSCAN.

    Args:
        umap_data (np.ndarray): The data points to cluster. The module's
            original documentation describes this as the 2D array produced
            by the UMAP reduction step.
        param_dict (dict): Hyperparameter dictionary. Three entries are read:
            - min_cluster_size: The minimum size of clusters.
            - min_samples: The minimum number of samples required to form
              a cluster.
            - alpha: HDBSCAN's distance scaling parameter.
            Any other entries (e.g. the UMAP ones) are ignored here.

    Returns:
        np.ndarray: The output of ``HDBSCAN.fit_predict``: one cluster label
        per data point.

    Raises:
        KeyError: If ``param_dict`` lacks 'min_cluster_size', 'min_samples'
            or 'alpha'.
    """
    return HDBSCAN(
        min_cluster_size=param_dict['min_cluster_size'],
        min_samples=param_dict['min_samples'],
        alpha=param_dict['alpha'],
        # Also builds the minimum spanning tree during fitting. The fitted
        # estimator is not returned, so only the labels reach the caller.
        gen_min_span_tree=True,
    ).fit_predict(umap_data)