Source code for bom_analyzer.data.preprocess

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer



def preprocess(
    csv_path: str
) -> np.ndarray:
    """
    Load a bill-of-materials (BOM) CSV and turn it into an array of strings.

    The first row of the file is used as the header. The 'HWRMA' column is
    removed when the file has one, every missing value is replaced with the
    placeholder '#', and all remaining cells are converted to ``str``.

    Args:
        csv_path (str): The path to the CSV file (passed to ``pd.read_csv``).

    Returns:
        np.ndarray: A 2-D NumPy array of strings, one row per BOM row and one
            column per retained CSV column, ready for sentence
            transformation.
    """
    bom = pd.read_csv(csv_path, header=0, low_memory=False)

    # 'HWRMA' is excluded from the output; errors="ignore" makes the drop a
    # no-op for files that do not have that column.
    bom = bom.drop(columns="HWRMA", errors="ignore")

    # Replace NaN values with a placeholder string. '#' was chosen by the
    # original author because it does not appear anywhere else in the data.
    # This must happen regardless of whether 'HWRMA' was present; otherwise
    # missing cells would be stringified as "nan" below.
    bom = bom.fillna("#")

    return bom.astype(str).to_numpy()
def sentence_transform(
    data: np.ndarray,
    device: str
) -> np.ndarray:
    """
    Embed each row of a string array with the 'thenlper/gte-large' model.

    Every row of ``data`` is collapsed into a single product string by
    concatenating its cells with no separator; the resulting strings are then
    encoded in one call to the sentence transformer.

    Args:
        data (np.ndarray): Array whose rows are sequences of strings
            describing one product each.
        device (str): The device handed to ``encode`` for model computation
            (e.g., 'cpu' or 'cuda').

    Returns:
        np.ndarray: The embeddings returned by ``SentenceTransformer.encode``,
            one per input row.
    """
    # Build the sentences first so malformed input fails before the model
    # is loaded.
    sentences = list(map(''.join, data))

    # A new model instance is created on every call.
    encoder = SentenceTransformer('thenlper/gte-large')

    return encoder.encode(
        sentences,
        show_progress_bar=True,
        device=device,
    )