Source code for ltsm.data_provider.tokenizer.standard_scaler
import os
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler as SKStandardScaler
from ltsm.common.base_processor import BaseProcessor
from typing import Tuple, List


class StandardScaler(BaseProcessor):
"""
Represents a Standard Scaler object that uses Sklearn's Standard Scaler for data processing.
Attributes:
module_id (str): The identifier for base processor objects.
"""
module_id = "standard_scaler"
    def __init__(self):
        self._scaler = None

    def process(self, raw_data: np.ndarray, train_data: List[np.ndarray],
                val_data: List[np.ndarray], test_data: List[np.ndarray],
                fit_train_only: bool = False, do_anomaly: bool = False
                ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
"""
Standardizes the training, validation, and test sets by removing the mean and scaling to unit variance.
Args:
raw_data (np.ndarray): The raw data.
train_data (List[np.ndarray]): The list of training sequences.
val_data (List[np.ndarray]): The list of validation sequences.
test_data (List[np.ndarray]): The list of test sequences.
fit_train_only (bool): Indicates whether the datasets should be scaled based on the training data.
Returns:
Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
A tuple of three lists containing the processed training, validation, and test data.
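        Example:
            A minimal usage sketch (the series, the split points, and the use
            of ``fit_train_only`` are illustrative assumptions, not part of
            the library)::

                import numpy as np

                series = np.arange(10, dtype=float)
                raw = np.stack([series])            # shape (1, 10): one sequence
                train, val, test = [series[:6]], [series[6:8]], [series[8:]]

                scaler = StandardScaler()
                s_train, s_val, s_test = scaler.process(
                    raw, train, val, test, fit_train_only=True
                )
                # s_train[0] now has zero mean and unit variance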
"""
        scaled_train_data, scaled_val_data, scaled_test_data = [], [], []
        for i, (raw_sequence, train_sequence, val_sequence, test_sequence) in enumerate(zip(
            raw_data,
            train_data,
            val_data,
            test_data,
        )):
            if do_anomaly and i == len(raw_data) - 1:
                # The last sequence holds anomaly labels: pass it through unscaled.
                scaled_train_data.append(train_sequence)
                scaled_val_data.append(val_sequence)
                scaled_test_data.append(test_sequence)
                continue

            train_sequence = train_sequence.reshape(-1, 1)
            val_sequence = val_sequence.reshape(-1, 1)
            test_sequence = test_sequence.reshape(-1, 1)

            # Fit a fresh scaler for each sequence, either on the training
            # split only or on the full raw sequence.
            self._scaler = SKStandardScaler()
            if fit_train_only:
                self._scaler.fit(train_sequence)
            else:
                self._scaler.fit(raw_sequence.reshape(-1, 1))

            scaled_train_data.append(self._scaler.transform(train_sequence).flatten())
            scaled_val_data.append(self._scaler.transform(val_sequence).flatten())
            scaled_test_data.append(self._scaler.transform(test_sequence).flatten())
        return scaled_train_data, scaled_val_data, scaled_test_data

    def inverse_process(self, data: np.ndarray) -> np.ndarray:
"""
Scales back the data to its original representation.
Args:
data (np.ndarray): The data to scale back.
Returns:
np.ndarray: The scaled back data.
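        Example:
            A minimal sketch continuing the :meth:`process` example above
            (variable names are illustrative assumptions)::

                restored = scaler.inverse_process(s_test[0])
                np.allclose(restored, test[0])      # True: that sequence was fitted last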
"""
        assert self._scaler is not None, "StandardScaler has not been fitted"
        raw_shape = data.shape
        data = self._scaler.inverse_transform(data.reshape(-1, 1))
        return data.reshape(raw_shape)

    def save(self, save_dir: str):
"""
Saves the scaler to the save_dir directory as a Pickle file named processor.pkl.
Args:
save_dir (str): The directory where to store the scaler.
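        Example:
            A minimal sketch (``ckpt_dir`` is a hypothetical directory that is
            assumed to already exist)::

                scaler.save("ckpt_dir")             # writes ckpt_dir/processor.pkl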
"""
        save_path = os.path.join(save_dir, "processor.pkl")
        with open(save_path, 'wb') as f:
            pickle.dump(self._scaler, f)

    def load(self, save_dir: str):
"""
Loads the scaler saved at the save_dir directory.
Args:
save_dir (str): The directory the scaler was saved.
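        Example:
            A minimal sketch restoring the scaler persisted in the
            :meth:`save` example (``ckpt_dir`` and ``scaled`` are illustrative
            assumptions)::

                restored = StandardScaler()
                restored.load("ckpt_dir")
                original = restored.inverse_process(scaled)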
"""
        save_path = os.path.join(save_dir, "processor.pkl")
        with open(save_path, 'rb') as f:
            self._scaler = pickle.load(f)