diff --git a/datasets/cbm_dataframe_parser.py b/datasets/cbm_dataframe_parser.py new file mode 100644 index 000000000..7991dbc2a --- /dev/null +++ b/datasets/cbm_dataframe_parser.py @@ -0,0 +1,427 @@ +################################################################################################### +# +# Copyright (C) 2024 Analog Devices, Inc. All Rights Reserved. +# This software is proprietary to Analog Devices, Inc. and its licensors. +# +################################################################################################### +""" +Main classes and functions for Motor Data Dataset +""" +import math +import os +import pickle + +import numpy as np +import torch +from numpy.fft import fft +from torch.utils.data import Dataset + +import pandas as pd +import scipy + +from utils.dataloader_utils import makedir_exist_ok + + +class CbM_DataFrame_Parser(Dataset): # pylint: disable=too-many-instance-attributes + """ + The base dataset class for motor vibration data used in Condition Based Monitoring. + Includes main preprocessing functions. + Expects a dataframe with common_dataframe_columns. + """ + + common_dataframe_columns = ["file_identifier", "raw_data_vib_in_g", "sensor_sr_Hz", + "speed", "load", "label"] + + @staticmethod + def sliding_windows_1d(array, window_size, overlap_ratio): + """ + One dimensional array is windowed and returned + in window_size length according to overlap ratio. 
+ """ + + window_overlap = math.ceil(window_size * overlap_ratio) + + slide_amount = window_size - window_overlap + num_of_windows = math.floor((len(array) - window_size) / slide_amount) + 1 + + result_list = np.zeros((num_of_windows, window_size)) + + for i in range(num_of_windows): + start_idx = slide_amount * i + end_idx = start_idx + window_size + result_list[i] = array[start_idx:end_idx] + + return result_list + + @staticmethod + def sliding_windows_on_columns_of_2d(array, window_size, overlap_ratio): + """ + Two dimensional array is windowed and returned + in window_size length according to overlap ratio. + """ + + array_len, num_of_cols = array.shape + + window_overlap = math.ceil(window_size * overlap_ratio) + slide_amount = window_size - window_overlap + num_of_windows = math.floor((array_len - window_size) / slide_amount) + 1 + + result_list = np.zeros((num_of_cols, num_of_windows, window_size)) + + for i in range(num_of_cols): + result_list[i, :, :] = CbM_DataFrame_Parser.sliding_windows_1d( + array[:, i], + window_size, overlap_ratio + ) + + return result_list + + @staticmethod + def split_file_raw_data(file_raw_data, file_raw_data_fs_in_Hz, duration_in_sec, overlap_ratio): + """ + Raw data is split into windowed data. + """ + + num_of_samples_per_window = int(file_raw_data_fs_in_Hz * duration_in_sec) + + sliding_windows = CbM_DataFrame_Parser.sliding_windows_on_columns_of_2d( + file_raw_data, + num_of_samples_per_window, + overlap_ratio + ) + + return sliding_windows + + def process_file_and_return_signal_windows(self, file_raw_data): + """ + Windowed signals are constructed from 2D raw data. + Fast Fourier Transform performed on these signals. 
+ """ + + new_sampling_rate = int(self.selected_sensor_sr / self.downsampling_ratio) + + file_raw_data_sampled = scipy.signal.decimate(file_raw_data, + self.downsampling_ratio, axis=0) + + file_raw_data_windows = self.split_file_raw_data( + file_raw_data_sampled, + new_sampling_rate, + self.signal_duration_in_sec, + self.overlap_ratio + ) + + # First dimension: 3 + # Second dimension: number of windows + # Third dimension: samples in one window of self.signal_duration_in_sec seconds + num_features = file_raw_data_windows.shape[0] + num_windows = file_raw_data_windows.shape[1] + + fft_output_window_size = self.cnn_1dinput_len + + file_cnn_signals = np.zeros((num_features, num_windows, fft_output_window_size)) + + # Perform FFT on each window for each feature + for window in range(num_windows): + for feature in range(num_features): + + signal_for_fft = file_raw_data_windows[feature, window, :] + + fft_out = abs(fft(signal_for_fft)) + fft_out = fft_out[:fft_output_window_size] + + fft_out[:self.num_start_zeros] = 0 + fft_out[-self.num_end_zeros:] = 0 + + file_cnn_signals[feature, window, :] = fft_out + + file_cnn_signals[:, window, :] = file_cnn_signals[:, window, :] / \ + np.sqrt(np.power(file_cnn_signals[:, window, :], 2).sum()) + + # Reshape from (num_features, num_windows, window_size) into: + # (num_windows, num_features, window_size) + file_cnn_signals = file_cnn_signals.transpose([1, 0, 2]) + + return file_cnn_signals + + def create_common_empty_df(self): + """ + Create empty dataframe + """ + df = pd.DataFrame(columns=self.common_dataframe_columns) + return df + + def __init__(self, root, d_type, + transform=None, + target_sampling_rate_Hz=2000, + signal_duration_in_sec=0.25, + overlap_ratio=0.75, + eval_mode=False, + label_as_signal=True, + random_or_speed_split=True, + speed_and_load_available=False, + num_end_zeros=10, + num_start_zeros=3, + train_ratio=0.8, + cnn_1dinput_len=256, + main_df=None + ): + + if d_type not in ('test', 'train'): + raise 
ValueError( + "d_type can only be set to 'test' or 'train'" + ) + + self.main_df = main_df + self.df_normals = self.main_df[main_df['label'] == 0] + self.df_anormals = self.main_df[main_df['label'] == 1] + + self.normal_speeds_Hz = list(set(self.df_normals['speed'])) + self.normal_speeds_Hz.sort() + self.normal_test_speeds = self.normal_speeds_Hz[1::5] + self.normal_train_speeds = list(set(self.normal_speeds_Hz) - set(self.normal_test_speeds)) + self.normal_train_speeds.sort() + + self.selected_sensor_sr = self.df_normals['sensor_sr_Hz'][0] + self.num_end_zeros = num_end_zeros + self.num_start_zeros = num_start_zeros + self.train_ratio = train_ratio + + self.root = root + self.d_type = d_type + self.transform = transform + + self.signal_duration_in_sec = signal_duration_in_sec + self.overlap_ratio = overlap_ratio + + self.eval_mode = eval_mode + self.label_as_signal = label_as_signal + + self.random_or_speed_split = random_or_speed_split + self.speed_and_load_available = speed_and_load_available + + self.num_of_features = 3 + + self.target_sampling_rate_Hz = target_sampling_rate_Hz + self.downsampling_ratio = round(self.selected_sensor_sr / + self.target_sampling_rate_Hz) + + self.cnn_1dinput_len = cnn_1dinput_len + + cnn_assert_message = "CNN input length is incorrect." 
+ assert self.cnn_1dinput_len >= (self.target_sampling_rate_Hz * + self.signal_duration_in_sec)/2, cnn_assert_message + + if not isinstance(self.downsampling_ratio, int) or self.downsampling_ratio < 1: + raise ValueError( + "downsampling_ratio can only be set to an integer value greater than 0" + ) + + processed_folder = \ + os.path.join(root, self.__class__.__name__, 'processed') + + self.processed_folder = processed_folder + + makedir_exist_ok(self.processed_folder) + + self.specs_identifier = f'eval_mode_{self.eval_mode}_' + \ + f'label_as_signal_{self.label_as_signal}_' + \ + f'ds_{self.downsampling_ratio}_' + \ + f'dur_{self.signal_duration_in_sec}_' + \ + f'ovlp_ratio_{self.overlap_ratio}_' + \ + f'random_split_{self.random_or_speed_split}_' + + train_dataset_pkl_file_path = \ + os.path.join(self.processed_folder, f'train_{self.specs_identifier}.pkl') + + test_dataset_pkl_file_path = \ + os.path.join(self.processed_folder, f'test_{self.specs_identifier}.pkl') + + if self.d_type == 'train': + self.dataset_pkl_file_path = train_dataset_pkl_file_path + + elif self.d_type == 'test': + self.dataset_pkl_file_path = test_dataset_pkl_file_path + + self.signal_list = [] + self.lbl_list = [] + self.speed_list = [] + self.load_list = [] + + self.__create_pkl_files() + self.is_truncated = False + + def __create_pkl_files(self): + if os.path.exists(self.dataset_pkl_file_path): + + print('\nPickle files are already generated ...\n') + + (self.signal_list, self.lbl_list, self.speed_list, self.load_list) = \ + pickle.load(open(self.dataset_pkl_file_path, 'rb')) + return + + self.__gen_datasets() + + def normalize_signal(self, features): + """ + Normalize signal with Local Min Max Normalization + """ + # Normalize data: + for instance in range(features.shape[0]): + instance_max = np.max(features[instance, :, :], axis=1) + instance_min = np.min(features[instance, :, :], axis=1) + + for feature in range(features.shape[1]): + for signal in range(features.shape[2]): + 
features[instance, feature, signal] = ( + (features[instance, feature, signal] - instance_min[feature]) / + (instance_max[feature] - instance_min[feature]) + ) + + return features + + def __gen_datasets(self): + + train_features = [] + test_normal_features = [] + + train_speeds = [] + test_normal_speeds = [] + + train_loads = [] + test_normal_loads = [] + + for _, row in self.df_normals.iterrows(): + raw_data = row['raw_data_vib_in_g'] + cnn_signals = self.process_file_and_return_signal_windows(raw_data) + file_speed = row['speed'] + file_load = row['load'] + + if self.random_or_speed_split: + num_training = int(self.train_ratio * cnn_signals.shape[0]) + + for i in range(cnn_signals.shape[0]): + if i < num_training: + train_features.append(cnn_signals[i]) + train_speeds.append(file_speed) + train_loads.append(file_load) + else: + test_normal_features.append(cnn_signals[i]) + test_normal_speeds.append(file_speed) + test_normal_loads.append(file_load) + + else: + # split test-train using file identifiers and split + if file_speed in self.normal_train_speeds: + for i in range(cnn_signals.shape[0]): + train_features.append(cnn_signals[i]) + train_speeds.append(file_speed) + train_loads.append(file_load) + + else: # file_speed in normal_test_speeds + for i in range(cnn_signals.shape[0]): + test_normal_features.append(cnn_signals[i]) + test_normal_speeds.append(file_speed) + test_normal_loads.append(file_load) + + train_features = np.asarray(train_features) + test_normal_features = np.asarray(test_normal_features) + + anomaly_features = [] + test_anormal_speeds = [] + test_anormal_loads = [] + + for _, row in self.df_anormals.iterrows(): + raw_data = row['raw_data_vib_in_g'] + cnn_signals = self.process_file_and_return_signal_windows(raw_data) + file_speed = row['speed'] + file_load = row['load'] + + for i in range(cnn_signals.shape[0]): + anomaly_features.append(cnn_signals[i]) + test_anormal_speeds.append(file_speed) + test_anormal_loads.append(file_load) + + 
anomaly_features = np.asarray(anomaly_features) + + train_features = self.normalize_signal(train_features) + test_normal_features = self.normalize_signal(test_normal_features) + anomaly_features = self.normalize_signal(anomaly_features) + + # For eliminating filter effects + train_features[:, :, :self.num_start_zeros] = 0.5 + train_features[:, :, -self.num_end_zeros:] = 0.5 + + test_normal_features[:, :, :self.num_start_zeros] = 0.5 + test_normal_features[:, :, -self.num_end_zeros:] = 0.5 + + anomaly_features[:, :, :self.num_start_zeros] = 0.5 + anomaly_features[:, :, -self.num_end_zeros:] = 0.5 + + # ARRANGE TEST-TRAIN SPLIT AND LABELS + if self.d_type == 'train': + self.lbl_list = [train_features[i, :, :] for i in range(train_features.shape[0])] + self.signal_list = [torch.Tensor(label) for label in self.lbl_list] + self.lbl_list = list(self.signal_list) + self.speed_list = np.array(train_speeds) + self.load_list = np.array(train_loads) + + if not self.label_as_signal: + self.lbl_list = np.zeros([len(self.signal_list), 1]) + + elif self.d_type == 'test': + + # Testing in training phase includes only normal test samples + if not self.eval_mode: + test_data = test_normal_features + else: + test_data = np.concatenate((test_normal_features, anomaly_features), axis=0) + + self.lbl_list = [test_data[i, :, :] for i in range(test_data.shape[0])] + self.signal_list = [torch.Tensor(label) for label in self.lbl_list] + self.lbl_list = list(self.signal_list) + self.speed_list = np.concatenate((np.array(test_normal_speeds), + np.array(test_anormal_speeds))) + self.load_list = np.concatenate((np.array(test_normal_loads), + np.array(test_anormal_loads))) + + if not self.label_as_signal: + self.lbl_list = np.concatenate( + (np.zeros([len(test_normal_features), 1]), + np.ones([len(anomaly_features), 1])), axis=0) + # Save pickle file + pickle.dump((self.signal_list, self.lbl_list, self.speed_list, self.load_list), + open(self.dataset_pkl_file_path, 'wb')) + + def __len__(self): + 
if self.is_truncated: + return 1 + return len(self.signal_list) + + def __getitem__(self, index): + if index >= len(self): + raise IndexError + + if self.is_truncated: + index = 0 + + signal = self.signal_list[index] + lbl = self.lbl_list[index] + + if self.transform is not None: + signal = self.transform(signal) + + if self.label_as_signal: + lbl = self.transform(lbl) + + if not self.label_as_signal: + lbl = lbl.astype(np.long) + else: + lbl = lbl.numpy().astype(np.float32) + + if self.speed_and_load_available: + speed = self.speed_list[index] + load = self.load_list[index] + + return signal, lbl, speed, load + + return signal, lbl diff --git a/datasets/samplemotordatalimerick.py b/datasets/samplemotordatalimerick.py index 2fab1686d..6128b90fd 100755 --- a/datasets/samplemotordatalimerick.py +++ b/datasets/samplemotordatalimerick.py @@ -8,26 +8,23 @@ Classes and functions for the Sample Motor Data Limerick Dataset https://github.com/analogdevicesinc/CbM-Datasets """ -import errno -import math import os -import pickle import numpy as np import torch -from numpy.fft import fft -from torch.utils.data import Dataset from torchvision import transforms import git import pandas as pd -import scipy from git.exc import GitCommandError import ai8x +from utils.dataloader_utils import makedir_exist_ok +from .cbm_dataframe_parser import CbM_DataFrame_Parser -class SampleMotorDataLimerick(Dataset): + +class SampleMotorDataLimerick(CbM_DataFrame_Parser): """ Sample motor data is collected using SpectraQuest Machinery Fault Simulator. ADXL356 sensor data is used for vibration raw data. @@ -35,242 +32,62 @@ class SampleMotorDataLimerick(Dataset): data csv files recorded for 2 sec in X, Y and Z direction. 
""" - sensor_sr_Hz = 20000 - # Good Bearing, Good Shaft, Balanced Load and Well Aligned healthy_file_identifier = '_GoB_GS_BaLo_WA_' - cnn_1dinput_len = 256 - num_end_zeros = 10 num_start_zeros = 3 - common_dataframe_columns = ["sensor_identifier", "file_identifier", "raw_data_accel_in_g"] - train_ratio = 0.8 - @staticmethod - def sliding_windows_1d(array, window_size, overlap_ratio): - """ - One dimensional array is windowed and returned - in window_size length according to overlap ratio. - """ - - window_overlap = math.ceil(window_size * overlap_ratio) - - slide_amount = window_size - window_overlap - num_of_windows = math.floor((len(array) - window_size) / slide_amount) + 1 - - result_list = np.zeros((num_of_windows, window_size)) - - for i in range(num_of_windows): - start_idx = slide_amount * i - end_idx = start_idx + window_size - result_list[i] = array[start_idx:end_idx] - - return result_list - - @staticmethod - def sliding_windows_on_columns_of_2d(array, window_size, overlap_ratio): - """ - Two dimensional array is windowed and returned - in window_size length according to overlap ratio. - """ - - array_len, num_of_cols = array.shape - - window_overlap = math.ceil(window_size * overlap_ratio) - slide_amount = window_size - window_overlap - num_of_windows = math.floor((array_len - window_size) / slide_amount) + 1 - - result_list = np.zeros((num_of_cols, num_of_windows, window_size)) - - for i in range(num_of_cols): - result_list[i, :, :] = SampleMotorDataLimerick.sliding_windows_1d( - array[:, i], - window_size, overlap_ratio - ) - - return result_list - - @staticmethod - def split_file_raw_data(file_raw_data, file_raw_data_fs_in_Hz, duration_in_sec, overlap_ratio): - """ - Raw data is split into windowed data. 
- """ - - num_of_samples_per_window = int(file_raw_data_fs_in_Hz * duration_in_sec) - - sliding_windows = SampleMotorDataLimerick.sliding_windows_on_columns_of_2d( - file_raw_data, - num_of_samples_per_window, - overlap_ratio - ) - - return sliding_windows - - def process_file_and_return_signal_windows(self, file_raw_data): - """ - Windowed signals are constructed from 2D raw data. - Fast Fourier Transform performed on these signals. - """ - - new_sampling_rate = int(self.selected_sensor_sr / self.downsampling_ratio) - - file_raw_data_sampled = scipy.signal.decimate(file_raw_data, - self.downsampling_ratio, axis=0) - - file_raw_data_windows = SampleMotorDataLimerick.split_file_raw_data( - file_raw_data_sampled, - new_sampling_rate, - self.signal_duration_in_sec, - self.overlap_ratio - ) - - # First dimension: 3 - # Second dimension: number of windows - # Third dimension: Window for self.duration_in_sec. 1000 samples for default settings - num_features = file_raw_data_windows.shape[0] - num_windows = file_raw_data_windows.shape[1] - - fft_output_window_size = SampleMotorDataLimerick.cnn_1dinput_len - - file_cnn_signals = np.zeros((num_features, num_windows, fft_output_window_size)) - - # Perform FFT on each window () for each feature - for window in range(num_windows): - for feature in range(num_features): - - signal_for_fft = file_raw_data_windows[feature, window, :] - - fft_out = abs(fft(signal_for_fft)) - fft_out = fft_out[:fft_output_window_size] - - fft_out[:SampleMotorDataLimerick.num_start_zeros] = 0 - fft_out[-SampleMotorDataLimerick.num_end_zeros:] = 0 - - file_cnn_signals[feature, window, :] = fft_out - - file_cnn_signals[:, window, :] = file_cnn_signals[:, window, :] / \ - np.sqrt(np.power(file_cnn_signals[:, window, :], 2).sum()) - - # Reshape from (num_features, num_windows, window_size) into: - # (num_windows, num_features, window_size) - file_cnn_signals = file_cnn_signals.transpose([1, 0, 2]) - - return file_cnn_signals - - @staticmethod - def 
create_common_empty_df(): - """ - Create empty dataframe - """ - df = pd.DataFrame(columns=SampleMotorDataLimerick.common_dataframe_columns) - return df - - @staticmethod - def parse_ADXL356C_and_return_common_df_row(file_full_path): - """ - Columns added just for readability can return raw data np array as well, - can also add file identifier - """ - df_raw = pd.read_csv(file_full_path, sep=';', header=None) - df_raw.rename( - columns={0: 'Time', 1: 'Voltage_x', 2: 'Voltage_y', - 3: 'Voltage_z', 4: 'x', 5: 'y', 6: 'z'}, - inplace=True - ) - ss_vibr_x1 = df_raw.iloc[0]['x'] - ss_vibr_y1 = df_raw.iloc[0]['y'] - ss_vibr_z1 = df_raw.iloc[0]['z'] - df_raw["Acceleration_x (g)"] = 50 * (df_raw["Voltage_x"] - ss_vibr_x1) - df_raw["Acceleration_y (g)"] = 50 * (df_raw["Voltage_y"] - ss_vibr_y1) - df_raw["Acceleration_z (g)"] = 50 * (df_raw["Voltage_z"] - ss_vibr_z1) - - raw_data = df_raw[["Acceleration_x (g)", "Acceleration_y (g)", "Acceleration_z (g)"]] - raw_data = raw_data.to_numpy() - return ['ADXL356C', os.path.basename(file_full_path).split('/')[-1], raw_data] - - def __makedir_exist_ok(self, dirpath): - try: - os.makedirs(dirpath) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - - def __init__(self, root, d_type, transform=None, + def __init__(self, root, d_type, + transform, + target_sampling_rate_Hz, + signal_duration_in_sec, + overlap_ratio, + eval_mode, + label_as_signal, + random_or_speed_split, + speed_and_load_available, + num_end_zeros=num_end_zeros, + num_start_zeros=num_start_zeros, + train_ratio=train_ratio, + accel_in_second_dim=True, download=True, - downsampling_ratio=2, - signal_duration_in_sec=0.25, - overlap_ratio=0.75, - eval_mode=False, - label_as_signal=True, - random_or_speed_split=True, - accel_in_second_dim=True): - - if d_type not in ('test', 'train'): - raise ValueError( - "d_type can only be set to 'test' or 'train'" - ) - - if not isinstance(downsampling_ratio, int) or downsampling_ratio < 1: - raise ValueError( 
- "downsampling_ratio can only be set to an integer value greater than 0" - ) - - self.selected_sensor_sr = SampleMotorDataLimerick.sensor_sr_Hz + healthy_file_identifier=healthy_file_identifier, + cnn_1dinput_len=256): - self.root = root - self.d_type = d_type - self.transform = transform self.download = download - - self.downsampling_ratio = downsampling_ratio - self.signal_duration_in_sec = signal_duration_in_sec - self.overlap_ratio = overlap_ratio - - self.eval_mode = eval_mode - self.label_as_signal = label_as_signal - - self.random_or_speed_split = random_or_speed_split - self.accel_in_second_dim = accel_in_second_dim - - self.num_of_features = 3 + self.root = root if self.download: self.__download() - processed_folder = \ - os.path.join(root, self.__class__.__name__, 'processed') - - self.__makedir_exist_ok(processed_folder) - - self.specs_identifier = f'eval_mode_{self.eval_mode}_' + \ - f'label_as_signal_{self.label_as_signal}_' + \ - f'ds_{self.downsampling_ratio}_' + \ - f'dur_{self.signal_duration_in_sec}_' + \ - f'ovlp_ratio_{self.overlap_ratio}_' + \ - f'random_split_{self.random_or_speed_split}_' - - train_dataset_pkl_file_path = \ - os.path.join(processed_folder, f'train_{self.specs_identifier}.pkl') - - test_dataset_pkl_file_path = \ - os.path.join(processed_folder, f'test_{self.specs_identifier}.pkl') - - if self.d_type == 'train': - self.dataset_pkl_file_path = train_dataset_pkl_file_path - - elif self.d_type == 'test': - self.dataset_pkl_file_path = test_dataset_pkl_file_path + self.accel_in_second_dim = accel_in_second_dim - self.signal_list = [] - self.lbl_list = [] + self.processed_folder = \ + os.path.join(root, self.__class__.__name__, 'processed') - self.__create_pkl_files() - self.is_truncated = False + self.healthy_file_identifier = healthy_file_identifier + self.target_sampling_rate_Hz = target_sampling_rate_Hz + self.signal_duration_in_sec = signal_duration_in_sec + main_df = self.gen_dataframe() + + super().__init__(root, + 
d_type=d_type, + transform=transform, + target_sampling_rate_Hz=target_sampling_rate_Hz, + signal_duration_in_sec=signal_duration_in_sec, + overlap_ratio=overlap_ratio, + eval_mode=eval_mode, + label_as_signal=label_as_signal, + random_or_speed_split=random_or_speed_split, + speed_and_load_available=speed_and_load_available, + num_end_zeros=num_end_zeros, + num_start_zeros=num_start_zeros, + train_ratio=train_ratio, + cnn_1dinput_len=cnn_1dinput_len, + main_df=main_df) def __download(self): """ @@ -280,7 +97,7 @@ def __download(self): destination_folder = self.root dataset_repository = 'https://github.com/analogdevicesinc/CbM-Datasets' - self.__makedir_exist_ok(destination_folder) + makedir_exist_ok(destination_folder) try: if not os.path.exists(os.path.join(destination_folder, 'SampleMotorDataLimerick')): @@ -294,42 +111,69 @@ def __download(self): except GitCommandError: pass - def __create_pkl_files(self): - if os.path.exists(self.dataset_pkl_file_path): + def parse_ADXL356C_and_return_common_df_row(self, file_full_path, sensor_sr_Hz, + speed=None, load=None, label=None): + """ + Dataframe parser for Sample Motor Data Limerick. + Reads csv files and returns file identifier, raw data, + sensor frequency, speed, load and label. + The raw data size must be consecutive and bigger than window size. 
+ """ + df_raw = pd.read_csv(file_full_path, sep=';', header=None) + + df_raw.rename( + columns={0: 'Time', 1: 'Voltage_x', 2: 'Voltage_y', + 3: 'Voltage_z', 4: 'x', 5: 'y', 6: 'z'}, + inplace=True + ) + ss_vibr_x1 = df_raw.iloc[0]['x'] + ss_vibr_y1 = df_raw.iloc[0]['y'] + ss_vibr_z1 = df_raw.iloc[0]['z'] + df_raw["Acceleration_x (g)"] = 50 * (df_raw["Voltage_x"] - ss_vibr_x1) + df_raw["Acceleration_y (g)"] = 50 * (df_raw["Voltage_y"] - ss_vibr_y1) + df_raw["Acceleration_z (g)"] = 50 * (df_raw["Voltage_z"] - ss_vibr_z1) + + raw_data = df_raw[["Acceleration_x (g)", "Acceleration_y (g)", "Acceleration_z (g)"]] + raw_data = raw_data.to_numpy() - print('\nPickle files are already generated ...\n') + window_size_assert_message = "CNN input length is incorrect." + assert self.signal_duration_in_sec <= (raw_data.shape[0] / + sensor_sr_Hz), window_size_assert_message - (self.signal_list, self.lbl_list) = \ - pickle.load(open(self.dataset_pkl_file_path, 'rb')) - return + return [os.path.basename(file_full_path).split('/')[-1], + raw_data, sensor_sr_Hz, speed, load, label] - self.__gen_datasets() + def __getitem__(self, index): + if self.accel_in_second_dim and not self.speed_and_load_available: + signal, lbl = super().__getitem__(index) # pylint: disable=unbalanced-tuple-unpacking + signal = torch.transpose(signal, 0, 1) + lbl = lbl.transpose() + return signal, lbl + if self.accel_in_second_dim and self.speed_and_load_available: + signal, lbl, speed, load = super().__getitem__(index) + signal = torch.transpose(signal, 0, 1) + lbl = lbl.transpose() + return signal, lbl, speed, load + return super().__getitem__(index) - def normalize_signal(self, features): + def gen_dataframe(self): """ - Normalize signal with Local Min Max Normalization + Generate dataframes from csv files of Sample Motor Data Limerick """ - # Normalize data: - for instance in range(features.shape[0]): - instance_max = np.max(features[instance, :, :], axis=1) - instance_min = np.min(features[instance, :, 
:], axis=1) - - for feature in range(features.shape[1]): - for signal in range(features.shape[2]): - features[instance, feature, signal] = ( - (features[instance, feature, signal] - instance_min[feature]) / - (instance_max[feature] - instance_min[feature]) - ) + file_name = f'{self.__class__.__name__}_dataframe.pkl' + df_path = \ + os.path.join(self.root, self.__class__.__name__, file_name) - return features + if os.path.isfile(df_path): + print(f'\n{file_name} file already exists\n') + main_df = pd.read_pickle(df_path) - def __gen_datasets(self): - print('\nGenerating dataset pickle files from the raw data ' - f'files (specs identifier: {self.specs_identifier}) ...\n') + return main_df + + print('\nGenerating data frame pickle files from the raw data \n') actual_root_dir = os.path.join(self.root, self.__class__.__name__, "SpectraQuest_Rig_Data_Voyager_3/") - data_dir = os.path.join(actual_root_dir, 'Data_ADXL356C') if not os.listdir(data_dir): @@ -337,162 +181,64 @@ def __gen_datasets(self): rpm_prefixes = ('0600', '1200', '1800', '2400', '3000') + sensor_sr_Hz = 20000 # Hz + faulty_data_list = [] healthy_data_list = [] - df_normals = SampleMotorDataLimerick.create_common_empty_df() - df_anormals = SampleMotorDataLimerick.create_common_empty_df() + df_normals = self.create_common_empty_df() + df_anormals = self.create_common_empty_df() for file in os.listdir(data_dir): full_path = os.path.join(data_dir, file) + speed = int(file.split("_")[0]) / 60 # Hz + load = int(file.split("_")[-1][0:2]) # LBS - if any(file.startswith(rpm_prefix + SampleMotorDataLimerick.healthy_file_identifier) + if any(file.startswith(rpm_prefix + self.healthy_file_identifier) for rpm_prefix in rpm_prefixes): - healthy_row = SampleMotorDataLimerick.parse_ADXL356C_and_return_common_df_row( - file_full_path=full_path - ) - + healthy_row = self.parse_ADXL356C_and_return_common_df_row( + file_full_path=full_path, sensor_sr_Hz=sensor_sr_Hz, + speed=speed, + load=load, + label=0 + ) 
healthy_data_list.append(healthy_row) else: - faulty_row = SampleMotorDataLimerick.parse_ADXL356C_and_return_common_df_row( - file_full_path=full_path + faulty_row = self.parse_ADXL356C_and_return_common_df_row( + file_full_path=full_path, sensor_sr_Hz=sensor_sr_Hz, + speed=speed, + load=load, + label=1 ) - faulty_data_list.append(faulty_row) - # Can keep and process those further df_normals = pd.DataFrame(data=np.array(healthy_data_list, dtype=object), - columns=SampleMotorDataLimerick.common_dataframe_columns) - df_anormals = pd.DataFrame(data=np.array(faulty_data_list, dtype=object), - columns=SampleMotorDataLimerick.common_dataframe_columns) - - # LOAD NORMAL FEATURES - test_train_idx_max = 4 - test_train_idx = 0 # 0, 1, 2 : train, 3: test - - train_features = [] - test_normal_features = [] - - for _, row in df_normals.iterrows(): - raw_data = row['raw_data_accel_in_g'] - cnn_signals = self.process_file_and_return_signal_windows(raw_data) - if self.random_or_speed_split: - num_training = int(SampleMotorDataLimerick.train_ratio * cnn_signals.shape[0]) - - for i in range(cnn_signals.shape[0]): - if i < num_training: - train_features.append(cnn_signals[i]) - else: - test_normal_features.append(cnn_signals[i]) - else: - if test_train_idx < test_train_idx_max - 1: - for i in range(cnn_signals.shape[0]): - train_features.append(cnn_signals[i]) - else: - for i in range(cnn_signals.shape[0]): - test_normal_features.append(cnn_signals[i]) - - test_train_idx = (test_train_idx + 1) % test_train_idx_max - - train_features = np.asarray(train_features) - test_normal_features = np.asarray(test_normal_features) - - anomaly_features = [] - - for _, row in df_anormals.iterrows(): - raw_data = row['raw_data_accel_in_g'] - cnn_signals = self.process_file_and_return_signal_windows(raw_data) - for i in range(cnn_signals.shape[0]): - anomaly_features.append(cnn_signals[i]) - - anomaly_features = np.asarray(anomaly_features) - - train_features = self.normalize_signal(train_features) - 
test_normal_features = self.normalize_signal(test_normal_features) - anomaly_features = self.normalize_signal(anomaly_features) - - # For eliminating filter effects - train_features[:, :, :SampleMotorDataLimerick.num_start_zeros] = 0.5 - train_features[:, :, -SampleMotorDataLimerick.num_end_zeros:] = 0.5 - - test_normal_features[:, :, :SampleMotorDataLimerick.num_start_zeros] = 0.5 - test_normal_features[:, :, -SampleMotorDataLimerick.num_end_zeros:] = 0.5 - - anomaly_features[:, :, :SampleMotorDataLimerick.num_start_zeros] = 0.5 - anomaly_features[:, :, -SampleMotorDataLimerick.num_end_zeros:] = 0.5 - - # ARRANGE TEST-TRAIN SPLIT AND LABELS - if self.d_type == 'train': - self.lbl_list = [train_features[i, :, :] for i in range(train_features.shape[0])] - self.signal_list = [torch.Tensor(label) for label in self.lbl_list] - self.lbl_list = list(self.signal_list) - - if not self.label_as_signal: - self.lbl_list = np.zeros([len(self.signal_list), 1]) - - elif self.d_type == 'test': - - # Testing in training phase includes only normal test samples - if not self.eval_mode: - test_data = test_normal_features - else: - test_data = np.concatenate((test_normal_features, anomaly_features), axis=0) - - self.lbl_list = [test_data[i, :, :] for i in range(test_data.shape[0])] - self.signal_list = [torch.Tensor(label) for label in self.lbl_list] - self.lbl_list = list(self.signal_list) - - if not self.label_as_signal: - self.lbl_list = np.concatenate( - (np.zeros([len(test_normal_features), 1]), - np.ones([len(anomaly_features), 1])), axis=0) - # Save pickle file - pickle.dump((self.signal_list, self.lbl_list), open(self.dataset_pkl_file_path, 'wb')) - - def __len__(self): - if self.is_truncated: - return 1 - return len(self.signal_list) - - def __getitem__(self, index): - if index >= len(self): - raise IndexError - - if self.is_truncated: - index = 0 + columns=self.common_dataframe_columns) - signal = self.signal_list[index] - lbl = self.lbl_list[index] + df_anormals = 
pd.DataFrame(data=np.array(faulty_data_list, dtype=object), + columns=self.common_dataframe_columns) - if self.transform is not None: - signal = self.transform(signal) + main_df = pd.concat([df_normals, df_anormals], axis=0) - if self.label_as_signal: - lbl = self.transform(lbl) + makedir_exist_ok(self.processed_folder) + main_df.to_pickle(df_path) - if not self.label_as_signal: - lbl = lbl.astype(np.long) - else: - lbl = lbl.numpy().astype(np.float32) - - if self.accel_in_second_dim: - signal = torch.transpose(signal, 0, 1) - lbl = lbl.transpose() - - return signal, lbl + return main_df def samplemotordatalimerick_get_datasets(data, load_train=True, load_test=True, download=True, - downsampling_ratio=10, signal_duration_in_sec=0.25, overlap_ratio=0.75, eval_mode=False, label_as_signal=True, random_or_speed_split=True, - accel_in_second_dim=True): - """" + speed_and_load_available=False, + accel_in_second_dim=True, + target_sampling_rate_Hz=2000, + cnn_1dinput_len=256): + """ Returns Sample Motor Data Limerick Dataset """ (data_dir, args) = data @@ -505,13 +251,15 @@ def samplemotordatalimerick_get_datasets(data, load_train=True, load_test=True, train_dataset = SampleMotorDataLimerick(root=data_dir, d_type='train', download=download, transform=train_transform, - downsampling_ratio=downsampling_ratio, signal_duration_in_sec=signal_duration_in_sec, overlap_ratio=overlap_ratio, eval_mode=eval_mode, label_as_signal=label_as_signal, random_or_speed_split=random_or_speed_split, - accel_in_second_dim=accel_in_second_dim) + speed_and_load_available=speed_and_load_available, + accel_in_second_dim=accel_in_second_dim, + target_sampling_rate_Hz=target_sampling_rate_Hz, + cnn_1dinput_len=cnn_1dinput_len) print(f'Train dataset length: {len(train_dataset)}\n') else: @@ -525,13 +273,15 @@ def samplemotordatalimerick_get_datasets(data, load_train=True, load_test=True, test_dataset = SampleMotorDataLimerick(root=data_dir, d_type='test', download=download, transform=test_transform, 
- downsampling_ratio=downsampling_ratio, signal_duration_in_sec=signal_duration_in_sec, overlap_ratio=overlap_ratio, eval_mode=eval_mode, label_as_signal=label_as_signal, random_or_speed_split=random_or_speed_split, - accel_in_second_dim=accel_in_second_dim) + speed_and_load_available=speed_and_load_available, + accel_in_second_dim=accel_in_second_dim, + target_sampling_rate_Hz=target_sampling_rate_Hz, + cnn_1dinput_len=cnn_1dinput_len) print(f'Test dataset length: {len(test_dataset)}\n') else: @@ -553,9 +303,8 @@ def samplemotordatalimerick_get_datasets_for_train(data, signal_duration_in_sec = 0.25 overlap_ratio = 0.75 - wanted_sampling_rate_Hz = 2000 - downsampling_ratio = round(SampleMotorDataLimerick.sensor_sr_Hz / - wanted_sampling_rate_Hz) + target_sampling_rate_Hz = 2000 + cnn_1dinput_len = 256 # ds_ratio = 10, sr: 20K / 10 = 2000, 0.25 sec window, fft input will have: 500 samples, # fftout's first 256 samples will be used @@ -564,15 +313,18 @@ def samplemotordatalimerick_get_datasets_for_train(data, accel_in_second_dim = True random_or_speed_split = True + speed_and_load_available = False return samplemotordatalimerick_get_datasets(data, load_train, load_test, - downsampling_ratio=downsampling_ratio, signal_duration_in_sec=signal_duration_in_sec, overlap_ratio=overlap_ratio, eval_mode=eval_mode, label_as_signal=label_as_signal, random_or_speed_split=random_or_speed_split, - accel_in_second_dim=accel_in_second_dim) + speed_and_load_available=speed_and_load_available, + accel_in_second_dim=accel_in_second_dim, + target_sampling_rate_Hz=target_sampling_rate_Hz, + cnn_1dinput_len=cnn_1dinput_len) def samplemotordatalimerick_get_datasets_for_eval_with_anomaly_label(data, @@ -589,26 +341,28 @@ def samplemotordatalimerick_get_datasets_for_eval_with_anomaly_label(data, signal_duration_in_sec = 0.25 overlap_ratio = 0.75 - wanted_sampling_rate_Hz = 2000 - downsampling_ratio = round(SampleMotorDataLimerick.sensor_sr_Hz / - wanted_sampling_rate_Hz) + 
target_sampling_rate_Hz = 2000 + cnn_1dinput_len = 256 # ds_ratio = 10, sr: 20K / 10 = 2000, 0.25 sec window, fft input will have: 500 samples, # fftout's first 256 samples will be used - # cnn input will have 2556 samples + # cnn input will have 256 samples accel_in_second_dim = True random_or_speed_split = True + speed_and_load_available = False return samplemotordatalimerick_get_datasets(data, load_train, load_test, - downsampling_ratio=downsampling_ratio, signal_duration_in_sec=signal_duration_in_sec, overlap_ratio=overlap_ratio, eval_mode=eval_mode, label_as_signal=label_as_signal, random_or_speed_split=random_or_speed_split, - accel_in_second_dim=accel_in_second_dim) + speed_and_load_available=speed_and_load_available, + accel_in_second_dim=accel_in_second_dim, + target_sampling_rate_Hz=target_sampling_rate_Hz, + cnn_1dinput_len=cnn_1dinput_len) def samplemotordatalimerick_get_datasets_for_eval_with_signal(data, @@ -625,26 +379,28 @@ def samplemotordatalimerick_get_datasets_for_eval_with_signal(data, signal_duration_in_sec = 0.25 overlap_ratio = 0.75 - wanted_sampling_rate_Hz = 2000 - downsampling_ratio = round(SampleMotorDataLimerick.sensor_sr_Hz / - wanted_sampling_rate_Hz) + target_sampling_rate_Hz = 2000 + cnn_1dinput_len = 256 # ds_ratio = 10, sr: 20K / 10 = 2000, 0.25 sec window, fft input will have: 500 samples, # fftout's first 256 samples will be used - # cnn input will have 2556 samples + # cnn input will have 256 samples accel_in_second_dim = True random_or_speed_split = True + speed_and_load_available = False return samplemotordatalimerick_get_datasets(data, load_train, load_test, - downsampling_ratio=downsampling_ratio, signal_duration_in_sec=signal_duration_in_sec, overlap_ratio=overlap_ratio, eval_mode=eval_mode, label_as_signal=label_as_signal, random_or_speed_split=random_or_speed_split, - accel_in_second_dim=accel_in_second_dim) + speed_and_load_available=speed_and_load_available, + accel_in_second_dim=accel_in_second_dim, + 
target_sampling_rate_Hz=target_sampling_rate_Hz, + cnn_1dinput_len=cnn_1dinput_len) datasets = [ diff --git a/notebooks/AutoEncoder_Evaluation.ipynb b/notebooks/AutoEncoder_Evaluation.ipynb index cda423032..1b40d00d9 100755 --- a/notebooks/AutoEncoder_Evaluation.ipynb +++ b/notebooks/AutoEncoder_Evaluation.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "###################################################################################################\n", "#\n", - "# Copyright (C) 2023 Analog Devices, Inc. All Rights Reserved.\n", + "# Copyright (C) 2024 Analog Devices, Inc. All Rights Reserved.\n", "# This software is proprietary and confidential to Analog Devices, Inc. and its licensors.\n", "#\n", "###################################################################################################\n", @@ -20,13 +20,9 @@ "import os\n", "import sys\n", "\n", - "import math\n", "import numpy as np\n", "import torch\n", "\n", - "import importlib\n", - "\n", - "import matplotlib.patches as patches\n", "import matplotlib.pyplot as plt\n", "\n", "sys.path.append(os.path.dirname(os.getcwd()))\n", @@ -38,7 +34,6 @@ "\n", "import parse_qat_yaml\n", "import ai8x\n", - "from torch import nn\n", "\n", "from torch.utils import data\n", "\n", @@ -53,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -72,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -100,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -110,6 +105,9 @@ "\n", "SampleMotorDataLimerick dataset already downloaded...\n", "\n", + 
"SampleMotorDataLimerick_dataframe.pkl file already exists\n", + "\n", + "\n", "Pickle files are already generated ...\n", "\n", "Train dataset length: 230\n", @@ -117,6 +115,9 @@ "\n", "SampleMotorDataLimerick dataset already downloaded...\n", "\n", + "SampleMotorDataLimerick_dataframe.pkl file already exists\n", + "\n", + "\n", "Pickle files are already generated ...\n", "\n", "Test dataset length: 3540\n", @@ -131,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -150,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -172,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -182,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -223,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -245,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -266,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -275,7 +276,7 @@ "0.08606820548770566" ] }, - "execution_count": 36, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -286,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -306,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -328,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -349,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -369,7 
+370,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -399,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -408,7 +409,7 @@ "0.01594951049141262" ] }, - "execution_count": 42, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -420,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -429,7 +430,7 @@ "0.022111524475945367" ] }, - "execution_count": 43, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -440,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -449,7 +450,7 @@ "60" ] }, - "execution_count": 44, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -460,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -487,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -496,7 +497,7 @@ "Text(0.5, 0, 'Reconstruction Loss (RL)')" ] }, - "execution_count": 46, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, @@ -526,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 23, "metadata": {}, "outputs": [ { diff --git a/utils/autoencoder_eval_utils.py b/utils/autoencoder_eval_utils.py index a2dd9e4c6..bb2660f8a 100755 --- a/utils/autoencoder_eval_utils.py +++ b/utils/autoencoder_eval_utils.py @@ -55,6 +55,8 @@ def extract_reconstructions_losses(model, dataloader, device): signal, label = tup elif len(tup) == 3: signal, label, _ = tup + elif len(tup) == 4: + signal, label, _, _ = tup signal = signal.to(device) label = label.type(torch.long).to(device) diff --git a/utils/dataloader_utils.py b/utils/dataloader_utils.py new 
###################################################################################################
#
# Copyright (C) 2024 Analog Devices, Inc. All Rights Reserved.
# This software is proprietary to Analog Devices, Inc. and its licensors.
#
###################################################################################################
"""Data loader utils functions"""
import errno
import os


def makedir_exist_ok(dirpath):
    """
    Creates the directory `dirpath`, including any missing parent directories.

    An already existing directory is not an error. Any other failure is re-raised
    as the original OSError, including the case where `dirpath` already exists but
    is not a directory (e.g. a regular file), so callers never continue believing
    a directory is available when it is not.
    """
    try:
        os.makedirs(dirpath)
    except OSError as err:
        # EEXIST is also reported when the path exists as a non-directory; only an
        # existing *directory* satisfies this function's contract.
        if err.errno != errno.EEXIST or not os.path.isdir(dirpath):
            raise