Source code for bom_analyzer.analysis.outlier_detection

import pandas as pd
import numpy as np
from tqdm import trange
from typing import *


# this is a bit of a hacky way to do this, but it works
# given a set of cluster labels, this function outputs all
# the components to a new dataframe
[docs]def group_components( table: pd.DataFrame, labels: np.ndarray ) -> pd.DataFrame: """ Groups components from a DataFrame based on specified cluster labels and extracts relevant data. Args: table (pd.DataFrame): The input DataFrame containing product data. labels (np.ndarray): A NumPy array containing cluster labels to group components by. Returns: pd.DataFrame: A new DataFrame containing the grouped components with columns: - CPN: Component part number - DateCode: Manufacturing date code - LOTCODE: Lot code - MPN: Manufacturer part number - RD: Revision date """ # find the index of all columns that contain CPN delims = parse_columns(table) # create a dataframe to output to components = pd.DataFrame(columns=['CPN', 'DateCode', 'LOTCODE', 'MPN', 'RD']) # create a copy of the original table that contains only the clusters in labels copy = table[table.CLUSTERS.isin(labels)] for i in trange(len(delims) - 1): temp = pd.DataFrame(columns=['CPN', 'DateCode', 'LOTCODE', 'MPN', 'RD']) for j in range(delims[i], delims[i + 1]): col = copy.columns[j].partition("_")[0] temp[col] = copy.iloc[:, j].values temp = temp.dropna(how='all') if not temp.empty: components = pd.concat([components, temp], ignore_index=True) return components
# return a list of the indices of all columns that have a # title starting with "CPN" and the index of the "HWRMA" column # used by group_components()
[docs]def parse_columns( table: pd.DataFrame ) -> List[int]: """ Finds the indices of columns starting with "CPN" and the "HWRMA" column in a DataFrame. Helper function used by 'group_components'. Args: table (pd.DataFrame): The input DataFrame. Returns: list: A list of column indices, including those starting with "CPN" and the "HWRMA" column. """ cols = table.columns.tolist() delims = [] for i in range(len(cols)): if cols[i].startswith("CPN"): delims.append(i) delims.append(table.columns.get_loc("HWRMA")) return delims