# Source code for bom_analyzer.data.preprocess
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
def preprocess(
    csv_path: str
) -> np.ndarray:
    """
    Load a bill of materials (BOM) CSV and convert it to an array of strings.

    Missing values are always replaced with the placeholder ``"#"`` and the
    ``HWRMA`` column is removed when the file contains one. Previously the
    placeholder was only applied when ``HWRMA`` was present, so files without
    that column produced the literal string ``"nan"`` for missing cells.

    Args:
        csv_path (str): The path to the CSV file. The first row is read as
            the header.

    Returns:
        np.ndarray: A 2-D NumPy array with one row per CSV record and one
            column per remaining CSV column, every cell converted to ``str``.
    """
    bom = pd.read_csv(csv_path, header=0, low_memory=False)

    # Replace NaN values with a placeholder string. This must happen for
    # every input file, not only for those that carry an 'HWRMA' column.
    bom = bom.fillna("#")

    # 'HWRMA' is dropped when present; errors='ignore' makes this a no-op
    # for files that do not have the column.
    bom = bom.drop(columns='HWRMA', errors='ignore')

    return bom.astype(str).to_numpy()
def sentence_transform(
    data: np.ndarray,
    device: str
) -> np.ndarray:
    """
    Embed each row of a string array with the 'thenlper/gte-large' model.

    Every row of ``data`` is collapsed into a single string by concatenating
    its elements directly (no separator is inserted between fields). The
    resulting list of strings is then encoded by a freshly loaded
    ``SentenceTransformer`` model.

    Args:
        data (np.ndarray): An array whose rows are iterables of strings,
            one row per product.
        device (str): The device passed to the model's ``encode`` call
            (e.g., 'cpu' or 'cuda').

    Returns:
        np.ndarray: The value returned by the model's ``encode`` call
            (presumably one embedding per input row).
    """
    # Build the sentences before loading the model so that malformed input
    # fails before the model is instantiated.
    sentences = list(map(''.join, data))

    # NOTE: the model is instantiated on every call.
    encoder = SentenceTransformer('thenlper/gte-large')
    return encoder.encode(sentences, show_progress_bar=True, device=device)