Source code for ltsm.data_provider.tokenizer.tokenizer_processor
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Tuple, Union


@dataclass
class TokenizerConfig:
    """
    This class holds all the configuration parameters to be used
    by ``ChronosTokenizer`` and ``ChronosModel``.
    """

    tokenizer_class: str
    tokenizer_kwargs: Dict[str, Any]
    n_tokens: int
    n_special_tokens: int
    pad_token_id: int
    eos_token_id: int
    use_eos_token: bool
    model_type: Literal["causal", "seq2seq"]
    context_length: int
    prediction_length: int
    num_samples: int
    temperature: float
    top_k: int
    top_p: float

    def __post_init__(self):
        assert (
            self.pad_token_id < self.n_special_tokens
            and self.eos_token_id < self.n_special_tokens
        ), f"Special token IDs must be smaller than {self.n_special_tokens=}"

    def create_tokenizer(self) -> "ChronosTokenizer":
        if self.tokenizer_class == "MeanScaleUniformBins":
            return MeanScaleUniformBins(**self.tokenizer_kwargs, config=self)
        raise ValueError(f"Unknown tokenizer class: {self.tokenizer_class}")
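

# A minimal construction sketch (comment form so the module stays
# import-safe). The values below are illustrative assumptions chosen for
# this example, not defaults shipped with the library:
#
#     config = TokenizerConfig(
#         tokenizer_class="MeanScaleUniformBins",
#         tokenizer_kwargs={"low_limit": -15.0, "high_limit": 15.0},
#         n_tokens=4096,
#         n_special_tokens=2,
#         pad_token_id=0,
#         eos_token_id=1,
#         use_eos_token=True,
#         model_type="seq2seq",
#         context_length=512,
#         prediction_length=64,
#         num_samples=20,
#         temperature=1.0,
#         top_k=50,
#         top_p=1.0,
#     )
#     tokenizer = config.create_tokenizer()  # -> MeanScaleUniformBins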


class ChronosTokenizer:
    """
    A ``ChronosTokenizer`` defines how time series are mapped into token IDs
    and back.

    For details, see the ``input_transform`` and ``output_transform`` methods,
    which concrete classes must implement.
    """

    def input_transform(
        self, context: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, Any]:
        """
        Turn a batch of time series into token IDs, attention mask, and scale.

        Parameters
        ----------
        context
            A tensor shaped (batch_size, time_length), containing the
            time series to forecast. Use left-padding with ``torch.nan``
            to align time series of different lengths.

        Returns
        -------
        token_ids
            A tensor of integers, shaped (batch_size, time_length + 1)
            if ``config.use_eos_token`` and (batch_size, time_length)
            otherwise, containing token IDs for the input series.
        attention_mask
            A boolean tensor, same shape as ``token_ids``, indicating
            which input observations are not ``torch.nan`` (i.e., neither
            missing nor padding).
        tokenizer_state
            An object that will be passed to ``output_transform``.
            Contains the relevant context to decode output samples into
            real values, such as location and scale parameters.
        """
        raise NotImplementedError()

    def output_transform(
        self, samples: torch.Tensor, tokenizer_state: Any
    ) -> torch.Tensor:
        """
        Turn a batch of sample token IDs into real values.

        Parameters
        ----------
        samples
            A tensor of integers, shaped (batch_size, num_samples, time_length),
            containing token IDs of sample trajectories.
        tokenizer_state
            An object returned by ``input_transform`` containing
            relevant context to decode samples, such as location and scale.
            The nature of this depends on the specific tokenizer.

        Returns
        -------
        forecasts
            A real tensor, shaped (batch_size, num_samples, time_length),
            containing forecasted sample paths.
        """
        raise NotImplementedError()


class MeanScaleUniformBins(ChronosTokenizer):
    def __init__(
        self, low_limit: float, high_limit: float, config: TokenizerConfig
    ) -> None:
        self.config = config
        # Uniformly spaced bin centers; the vocabulary reserves
        # ``n_special_tokens`` IDs (e.g. pad/eos) plus one extra slot.
        self.centers = torch.linspace(
            low_limit,
            high_limit,
            config.n_tokens - config.n_special_tokens - 1,
        )
        # Bucket boundaries are the midpoints between consecutive centers,
        # padded with large sentinels so every finite value falls in a bucket.
        self.boundaries = torch.concat(
            (
                torch.tensor([-1e20], device=self.centers.device),
                (self.centers[1:] + self.centers[:-1]) / 2,
                torch.tensor([1e20], device=self.centers.device),
            )
        )
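
    # Worked example of the bin layout (assuming, for illustration, the
    # n_tokens=4096 / n_special_tokens=2 configuration sketched above):
    # linspace yields 4096 - 2 - 1 = 4093 centers on [low_limit, high_limit],
    # and the 4092 midpoints plus the two +/-1e20 sentinels give 4094
    # boundary entries, i.e. one bucket per center plus an overflow bucket
    # on each side.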

    def input_transform(
        self, context: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        batch_size, length = context.shape

        # Keep only the most recent ``context_length`` observations.
        if length > self.config.context_length:
            context = context[..., -self.config.context_length :]

        # Observed (non-NaN) positions; NaNs mark missing values or padding.
        attention_mask = ~torch.isnan(context)

        # Mean absolute value of the observed entries, used as the scale.
        scale = torch.nansum(
            torch.abs(context) * attention_mask, dim=-1
        ) / torch.nansum(attention_mask, dim=-1)
        # Guard against zero, negative, or NaN scales (e.g. all-NaN rows).
        scale[~(scale > 0)] = 1.0
        scaled_context = context / scale.unsqueeze(dim=-1)

        # Quantize into bins; shift by ``n_special_tokens`` so the special
        # token IDs remain reserved.
        token_ids = (
            torch.bucketize(
                input=scaled_context,
                boundaries=self.boundaries,
                # buckets are open to the right, see:
                # https://pytorch.org/docs/2.1/generated/torch.bucketize.html#torch-bucketize
                right=True,
            )
            + self.config.n_special_tokens
        )
        token_ids[~attention_mask] = self.config.pad_token_id

        # Optionally append an EOS token (attended to) to every series.
        if self.config.use_eos_token:
            eos_tokens = torch.full(
                (batch_size, 1), fill_value=self.config.eos_token_id
            )
            token_ids = torch.concat((token_ids, eos_tokens), dim=1)
            eos_mask = torch.full((batch_size, 1), fill_value=True)
            attention_mask = torch.concat((attention_mask, eos_mask), dim=1)

        return token_ids, attention_mask, scale
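
    # Worked through on a toy batch: context = [[1.0, 2.0, 3.0]] gives
    # scale = mean(|x|) = 2.0 and scaled_context = [[0.5, 1.0, 1.5]]; each
    # scaled value is bucketized and shifted by ``n_special_tokens``, and
    # with ``use_eos_token=True`` the returned ``token_ids`` have shape
    # (1, 4): the three observations plus the trailing EOS token.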

    def output_transform(
        self, samples: torch.Tensor, scale: torch.Tensor
    ) -> torch.Tensor:
        # Reshape (batch_size,) -> (batch_size, 1, 1) to broadcast over the
        # (batch_size, num_samples, time_length) sample tensor.
        scale_unsqueezed = scale.unsqueeze(-1).unsqueeze(-1)
        # Map token IDs back to bin-center indices: undo the special-token
        # shift and the +1 offset introduced by ``torch.bucketize``, then
        # clamp out-of-range IDs (e.g. special tokens) into the valid range.
        indices = torch.clamp(
            samples - self.config.n_special_tokens - 1,
            min=0,
            max=len(self.centers) - 1,
        )
        self.centers = self.centers.to(samples.device)
        return self.centers[indices] * scale_unsqueezed
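

# End-to-end sketch, reusing the illustrative ``config`` from the
# ``TokenizerConfig`` example above (comment form so the module stays
# import-safe):
#
#     tokenizer = config.create_tokenizer()
#     context = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
#     token_ids, attention_mask, scale = tokenizer.input_transform(context)
#     # Pretend the model's sampler returned these IDs (drop the EOS token):
#     samples = token_ids[:, :-1].unsqueeze(1)  # (batch, num_samples=1, time)
#     forecasts = tokenizer.output_transform(samples, scale)
#     # ``forecasts`` recovers the context up to quantization error.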