add utsd dataloader
WenWeiTHU committed Jun 19, 2024
1 parent f907eaf commit ff2253d
Showing 6 changed files with 142 additions and 91 deletions.
99 changes: 11 additions & 88 deletions README.md
@@ -4,7 +4,7 @@ This repo provides official code and checkpoints for [Timer: Generative Pre-trai

# Updates

:triangular_flag_on_post: **News** (2024.6) Pre-training dataset (UTSD) is available in [HuggingFace](https://huggingface.co/datasets/thuml/UTSD)!
:triangular_flag_on_post: **News** (2024.6) Pre-training dataset (UTSD) is available on [HuggingFace](https://huggingface.co/datasets/thuml/UTSD). The UTSD dataloader is included in this repo.

:triangular_flag_on_post: **News** (2024.5) Accepted by ICML 2024; a [camera-ready version](https://arxiv.org/abs/2402.02368) of **31 pages** is available.

@@ -32,94 +32,17 @@ Our dataset is released in [HuggingFace](https://huggingface.co/datasets/thuml/U

### Usage

You can load UTSD in the style of [Time-Series-Library](https://github.com/thuml/Time-Series-Library) with the following dataset code:

```python
import datasets
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

class UTSDDataset(Dataset):
    def __init__(self, remote=True, root_path=r'UTSD-1G', flag='train', input_len=None, pred_len=None, scale=True,
                 stride=1, split=0.9):
        self.input_len = input_len
        self.pred_len = pred_len
        self.seq_len = input_len + pred_len
        assert flag in ['train', 'val']
        assert 0 <= split <= 1.0
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]
        self.flag = flag
        self.scale = scale
        self.split = split
        self.stride = stride
        self.remote = remote

        self.data_list = []
        self.n_window_list = []

        self.root_path = root_path
        self.__read_data__()

    def __read_data__(self):
        if self.remote:
            dataset = datasets.load_dataset("thuml/UTSD", "UTSD-1G")['train']
        else:
            dataset = datasets.load_from_disk(self.root_path)

        print(dataset)
        for item in tqdm(dataset):
            self.scaler = StandardScaler()
            data = item['target']
            data = np.array(data).reshape(-1, 1)
            num_train = int(len(data) * self.split)
            border1s = [0, num_train - self.seq_len]
            border2s = [num_train, len(data)]

            border1 = border1s[self.set_type]
            border2 = border2s[self.set_type]

            if self.scale:
                train_data = data[border1s[0]:border2s[0]]
                self.scaler.fit(train_data)
                data = self.scaler.transform(data)

            data = data[border1:border2]
            n_window = (len(data) - self.seq_len) // self.stride + 1
            if n_window < 1:
                continue

            self.data_list.append(data)
            self.n_window_list.append(n_window if len(self.n_window_list) == 0 else self.n_window_list[-1] + n_window)

    def __getitem__(self, index):
        dataset_index = 0
        while index >= self.n_window_list[dataset_index]:
            dataset_index += 1

        index = index - self.n_window_list[dataset_index - 1] if dataset_index > 0 else index
        n_timepoint = (len(self.data_list[dataset_index]) - self.seq_len) // self.stride + 1

        s_begin = index % n_timepoint
        s_begin = self.stride * s_begin
        s_end = s_begin + self.seq_len
        p_begin = s_end
        p_end = p_begin + self.pred_len
        seq_x = self.data_list[dataset_index][s_begin:s_end, :]
        seq_y = self.data_list[dataset_index][p_begin:p_end, :]

        return seq_x, seq_y

    def __len__(self):
        return self.n_window_list[-1]

dataset = UTSDDataset(input_len=1440, pred_len=96)
print(len(dataset))
```
You can download UTSD and load it in the style of [Time-Series-Library](https://github.com/thuml/Time-Series-Library) as follows:

```bash
# huggingface-cli login
# export HF_ENDPOINT=https://hf-mirror.com

python ./scripts/UTSD/download_dataset.py

# dataloader
python ./scripts/UTSD/utsdataset.py
```
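The dataset can then be wrapped in a standard PyTorch `DataLoader` for pretraining. The sketch below is illustrative, assuming it is run from the repository root so that `scripts/UTSD/utsdataset.py` is importable; the 672-step, zero-output configuration mirrors the commented pretraining example in the script's `__main__`:

```python
import sys
sys.path.append('./scripts/UTSD')  # assumed: running from the repository root

from torch.utils.data import DataLoader
from utsdataset import UTSDataset

# 672-step windows with no prediction span, as in the script's pretraining example
dataset = UTSDataset(subset_name='UTSD-1G', flag='train', input_len=672, output_len=0)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for seq_x, seq_y in loader:
    print(seq_x.shape)  # torch.Size([32, 672, 1]); seq_y is empty when output_len=0
    break
```

Note that `seq_x` spans the full `input_len + output_len` window; with `output_len > 0`, windows at the tail of a series can yield a `seq_y` shorter than `output_len`, so uniform batching may need a custom `collate_fn`.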


## Tasks
37 changes: 37 additions & 0 deletions scripts/UTSD/download_dataset.py
@@ -0,0 +1,37 @@
# To download the dataset, run this script:
#     python download_dataset.py

# If you run into network problems, set the mirror endpoint before running the script:
#     export HF_ENDPOINT=https://hf-mirror.com

import datasets

ds = datasets.load_dataset("thuml/UTSD", "UTSD-1G")
# ds = datasets.load_dataset("thuml/UTSD", "UTSD-2G")
# ds = datasets.load_dataset("thuml/UTSD", "UTSD-4G")
# ds = datasets.load_dataset("thuml/UTSD", "UTSD-12G")

# the dataset is not pre-divided into train, test, and val splits;
# ds['train'] therefore contains all the time series
# you can split them yourself, or use our default split of train:val = 9:1 in utsdataset.py
all_series = ds['train']

# print the total number of time series
print(f'total {len(all_series)} single-variate series')

# each item is a single-variate series containing:
# 1. dataset name (item_id)
# 2. start time (start)
# 3. end time (end)
# 4. sampling frequency (freq)
# 5. time series values (target)
# timestamps are optional, since some datasets are irregularly sampled and may not have them

# see https://huggingface.co/datasets/thuml/UTSD/viewer for more details
print(all_series[0].keys())

# you can access the time series values via item['target']
num_timepoints = len(all_series[0]['target'])
print(f'the first time series contains {num_timepoints} time points')

# or generate the timestamps from item['start'], item['end'], and item['freq'] (see the sketch below)
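As a hedged illustration (not part of the original script), the timestamps mentioned above could be reconstructed with pandas. This assumes `start` parses as a datetime and `freq` is a pandas-compatible frequency alias; check the dataset viewer to confirm the actual field formats.

```python
import pandas as pd

item = all_series[0]
timestamps = pd.date_range(start=item['start'],
                           periods=len(item['target']),
                           freq=item['freq'])
print(timestamps[:3])
```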
91 changes: 91 additions & 0 deletions scripts/UTSD/utsdataset.py
@@ -0,0 +1,91 @@


import datasets
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


"""
All single-variate series in UTSD are divided into (input-output) windows with a uniform length based on S3.
"""
class UTSDataset(Dataset):
def __init__(self, subset_name=r'UTSD-1G', flag='train', split=0.9,
input_len=None, output_len=None, scale=True, stride=1):
self.input_len = input_len
self.output_len = output_len
self.seq_len = input_len + output_len
assert flag in ['train', 'val']
assert split >= 0 and split <=1.0
type_map = {'train': 0, 'val': 1, 'test': 2}
self.set_type = type_map[flag]
self.flag = flag
self.scale = scale
self.split = split
self.stride = stride

self.data_list = []
self.n_window_list = []

self.subset_name = subset_name
self.__read_data__()

def __read_data__(self):
dataset = datasets.load_dataset("thuml/UTSD", self.subset_name, split='train')
# split='train' contains all the time series, which have not been divided into splits,
# you can split them by yourself, or use our default split as train:val = 9:1
print('Indexing dataset...')
for item in tqdm(dataset):
self.scaler = StandardScaler()
data = item['target']
data = np.array(data).reshape(-1, 1)
num_train = int(len(data) * self.split)
border1s = [0, num_train - self.seq_len]
border2s = [num_train, len(data)]

border1 = border1s[self.set_type]
border2 = border2s[self.set_type]

if self.scale:
train_data = data[border1s[0]:border2s[0]]
self.scaler.fit(train_data)
data = self.scaler.transform(data)

data = data[border1:border2]
n_window = (len(data) - self.seq_len) // self.stride + 1
if n_window < 1:
continue

self.data_list.append(data)
self.n_window_list.append(n_window if len(self.n_window_list) == 0 else self.n_window_list[-1] + n_window)


def __getitem__(self, index):
# you can wirte your own processing code here
dataset_index = 0
while index >= self.n_window_list[dataset_index]:
dataset_index += 1

index = index - self.n_window_list[dataset_index - 1] if dataset_index > 0 else index
n_timepoint = (len(self.data_list[dataset_index]) - self.seq_len) // self.stride + 1

s_begin = index % n_timepoint
s_begin = self.stride * s_begin
s_end = s_begin + self.seq_len
p_begin = s_end
p_end = p_begin + self.output_len
seq_x = self.data_list[dataset_index][s_begin:s_end, :]
seq_y = self.data_list[dataset_index][p_begin:p_end, :]

return seq_x, seq_y

def __len__(self):
return self.n_window_list[-1]


# See ```download_dataset.py``` to download the dataset first
if __name__ == '__main__':
# dataset = UTSDataset(subset_name=r'UTSD-1G', input_len=672, output_len=0, flag='train')
dataset = UTSDataset(subset_name=r'UTSD-1G', input_len=720, output_len=96, flag='train')
print(f'total {len(dataset)} time series windows (sentence)')
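To make the cumulative-count indexing in `__getitem__` concrete, here is a toy walk-through (illustrative only, not part of the file): two series with 5 and 3 windows give `n_window_list = [5, 8]`, and global index 6 falls in the second series at local offset 1.

```python
# toy reproduction of the index search in UTSDataset.__getitem__
n_window_list = [5, 8]  # cumulative window counts for two series
index = 6               # global window index

dataset_index = 0
while index >= n_window_list[dataset_index]:
    dataset_index += 1
local = index - n_window_list[dataset_index - 1] if dataset_index > 0 else index
print(dataset_index, local)  # 1 1 -> second series, second window
```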
2 changes: 1 addition & 1 deletion scripts/imputation/ETT.sh
@@ -28,7 +28,7 @@ python -u run.py \
--features M \
--seq_len 192 \
--label_len 0 \
--pred_len 192 \ # not used in imputation
--patch_len $patch_len \
--e_layers $e_layers \
--factor 3 \
2 changes: 1 addition & 1 deletion scripts/imputation/PEMS.sh
@@ -28,7 +28,7 @@ python -u run.py \
--features M \
--seq_len 192 \
--label_len 0 \
--pred_len 192 \ # not used in imputation
--patch_len $patch_len \
--e_layers $e_layers \
--factor 3 \
2 changes: 1 addition & 1 deletion scripts/imputation/Weather.sh
@@ -26,7 +26,7 @@ python -u run.py \
--features M \
--seq_len 192 \
--label_len 0 \
--pred_len 192 \ # not used in imputation
--patch_len $patch_len \
--e_layers $e_layers \
--factor 3 \
