Source code for ltsm.data_provider.data_splitter

from ltsm.common.base_splitter import DataSplitter
import pandas as pd
import numpy as np

from typing import Tuple, List
import logging

# Configure the root logger at import time: INFO level and above, with each
# record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

class SplitterByTimestamp(DataSplitter):
    """
    Data splitter that cuts each time series into consecutive
    train / validation / test segments along the time axis.
    """

    def __init__(self, seq_len: int, pred_len: int, train_ratio: float, val_ratio: float):
        """
        Store the window sizes and split ratios used by the splitter.

        Args:
            seq_len (int): Number of timesteps in a model input window.
            pred_len (int): Number of timesteps the model predicts.
            train_ratio (float): Fraction of each series assigned to training.
            val_ratio (float): Fraction of each series assigned to validation.
        """
        super().__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio

    def get_csv_splits(self, df_data: pd.DataFrame, do_anomaly: bool=False) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
        """
        Split every row of ``df_data`` into training, validation and test
        segments.

        Each row of the DataFrame is treated as one time series. A row is
        skipped when its training segment is too short: shorter than
        ``seq_len + pred_len`` normally, or shorter than ``seq_len`` when
        ``do_anomaly`` is set.

        Args:
            df_data (pd.DataFrame): DataFrame whose rows are the series to split.
            do_anomaly (bool): If true, only ``seq_len`` training points are
                required for a row to be kept (``pred_len`` is ignored).

        Returns:
            Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
            Four parallel lists: the training, validation and test segments
            of every kept row, followed by the index labels of those rows.

        Raises:
            ValueError: If the first element of a row is itself a NumPy
                array, i.e. the series is not one-dimensional.
        """
        train_parts, val_parts, test_parts, kept_labels = [], [], [], []

        rows = df_data.to_numpy()
        for label, series in zip(df_data.index, rows):
            n_points = len(series)
            if n_points > 0 and isinstance(series[0], np.ndarray):
                logging.error("Time-series should be 1D.")
                raise ValueError("Time-series should be 1D.")

            # Boundaries of the three segments along the time axis.
            train_end = int(n_points * self.train_ratio)
            val_end = train_end + int(n_points * self.val_ratio)

            # Anomaly detection needs only an input window; forecasting
            # additionally needs room for the prediction horizon.
            min_train_len = self.seq_len if do_anomaly else self.seq_len + self.pred_len
            if train_end < min_train_len:
                continue

            # The validation and test segments are each extended backwards by
            # seq_len points so their first window has a full input history.
            train_parts.append(series[:train_end])
            val_parts.append(series[train_end - self.seq_len:val_end])
            test_parts.append(series[val_end - self.seq_len:])
            kept_labels.append(label)

        return train_parts, val_parts, test_parts, kept_labels