# Source code for bom_analyzer.analysis.optimization

from optuna.samplers import TPESampler
import umap.umap_ as umap
from hdbscan import HDBSCAN
from typing import *
import optuna
import numpy as np


# calculates the best parameters for clustering and dimension reduction
def optimize_hyperparameters(
        data: np.ndarray,
        seed: int,
        trials: int = 50
) -> Dict[str, Union[int, float]]:
    """
    Searches for the UMAP and HDBSCAN hyperparameters that maximize the clustering score.

    An Optuna study driven by a seeded TPE sampler runs `objective_function` once per
    trial and keeps the parameter set that produced the highest returned score.

    Args:
        data (np.ndarray): The NumPy array containing the data to use for optimization.
        seed (int): The random seed handed to both the Optuna sampler and
            `objective_function` (for reproducibility).
        trials (int, optional): The number of hyperparameter configurations to try. Defaults to 50.

    Returns:
        Dict[str, Union[int, float]]:
            The `best_params` of the finished study, i.e. the hyperparameter values of the
            highest-scoring trial.
    """

    def _score_trial(trial: optuna.Trial) -> float:
        # Optuna only passes the trial, so the data and seed are bound here.
        return objective_function(trial, data, seed)

    study = optuna.create_study(
        sampler=TPESampler(seed=seed),
        direction='maximize',  # a higher validity score means a better clustering
    )
    study.optimize(_score_trial, n_trials=trials)
    return study.best_params


# runs umap and hdbscan with a set of parameters
# returns the validity score of the run
def objective_function(
        trial: "optuna.Trial",
        data: np.ndarray,
        seed: int
) -> float:
    """
    Objective function used for hyperparameter optimization in `optimize_hyperparameters`.

    Asks the trial for one set of UMAP and HDBSCAN hyperparameters, projects `data` to
    two dimensions with UMAP, clusters the projection with HDBSCAN and returns the
    clusterer's relative validity.

    Args:
        trial (optuna.Trial): The Optuna trial object used for suggesting hyperparameters.
        data (np.ndarray): The NumPy array containing the data to use for evaluation.
            Its first axis is used as the sample count and must hold at least 4 rows.
        seed (int): The random seed for UMAP (for reproducibility).

    Returns:
        float: The `relative_validity_` of the fitted HDBSCAN model (HDBSCAN's
            approximation of the DBCV score) for the suggested hyperparameters.

    Raises:
        ValueError: If `data` has fewer than 4 rows, because the integer search ranges
            (which start at 2 and end at `rows - 2`) would then be empty.
    """

    n_samples = data.shape[0]
    # Shared upper bound of every integer search range below.
    upper_bound = n_samples - 2
    if upper_bound < 2:
        # Fail with a clear message instead of Optuna's generic "low > high" error.
        raise ValueError(
            f"objective_function needs at least 4 samples to build its "
            f"hyperparameter search ranges, got {n_samples}"
        )

    # The order of the suggest calls is kept fixed so that seeded studies stay reproducible.
    min_cluster_size = trial.suggest_int('min_cluster_size', 2, upper_bound)
    min_samples = trial.suggest_int('min_samples', 1, upper_bound)
    # NOTE(review): the lower bound 0.0 is inclusive and HDBSCAN appears to reject
    # alpha <= 0 -- confirm whether a small positive lower bound is wanted here.
    alpha = trial.suggest_float('alpha', 0.0, 2.0)
    n_neighbors = trial.suggest_int('n_neighbors', 2, upper_bound)
    min_dist = trial.suggest_float('min_dist', 0.0, 0.99)

    # Reduce to two dimensions before clustering.
    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=seed,
        n_jobs=1,
    )
    embedding = reducer.fit_transform(data)

    # gen_min_span_tree=True is required for relative_validity_ to be available.
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        alpha=alpha,
        gen_min_span_tree=True,
    )
    clusterer.fit(embedding)
    return clusterer.relative_validity_