diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l001.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l001.npy deleted file mode 100644 index ff42e63..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l001.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l002.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l002.npy deleted file mode 100644 index cc5c03d..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l002.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l004.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l004.npy deleted file mode 100644 index 76bc054..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l004.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l008.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l008.npy deleted file mode 100644 index 57b2589..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l008.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l016.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l016.npy deleted file mode 100644 index a23b9ba..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l016.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l032.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l032.npy deleted file mode 100644 index e53b02d..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l032.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l064.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l064.npy deleted file mode 100644 index 3583e7d..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l064.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l128.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l128.npy deleted file mode 100644 index 0b066e0..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42-nts=10000_l128.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42.npy deleted file mode 100644 index 0532460..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l001.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l001.npy deleted file mode 100644 index a748126..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l001.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l002.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l002.npy deleted file mode 100644 index 24368a6..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l002.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l004.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l004.npy deleted file mode 100644 index ad4e535..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l004.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l008.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l008.npy deleted file mode 100644 index 99a289f..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l008.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l016.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l016.npy deleted file mode 100644 index 656f4ac..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l016.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l032.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l032.npy deleted file mode 100644 index 1d09e29..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l032.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l064.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l064.npy deleted file mode 100644 index 069f8a4..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l064.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l128.npy b/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l128.npy deleted file mode 100644 index 5ee6594..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=20000-dcs=2048-max=32768-min=2048-s=42_l128.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42.npy deleted file mode 100644 index 64fd249..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l001.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l001.npy deleted file mode 100644 index 426e6e1..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l001.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l002.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l002.npy deleted file mode 100644 index 344f928..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l002.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l004.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l004.npy deleted file mode 100644 index bde1ccf..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l004.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l008.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l008.npy deleted file mode 100644 index 70060f0..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l008.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l016.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l016.npy deleted file mode 100644 index 462483f..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l016.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l032.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l032.npy deleted file mode 100644 index f21c3e5..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l032.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l064.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l064.npy deleted file mode 100644 index 6d990ff..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l064.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l128.npy b/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l128.npy deleted file mode 100644 index b830844..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-mts=40000-dcs=1024-max=32768-min=2048-s=42_l128.npy and /dev/null differ diff --git a/src/dataloaders/d=redpajama_train_50000-nts=20000-mts=40000-dcs=1024-max=32768-min=2048-s=42.npy b/src/dataloaders/d=redpajama_train_50000-nts=20000-mts=40000-dcs=1024-max=32768-min=2048-s=42.npy deleted file mode 100644 index 802ab34..0000000 Binary files a/src/dataloaders/d=redpajama_train_50000-nts=20000-mts=40000-dcs=1024-max=32768-min=2048-s=42.npy and /dev/null differ diff --git a/src/dataloaders/preprocess_rp_contig.py b/src/dataloaders/preprocess_rp_contig.py index bb5e2c7..3f0401a 100644 --- a/src/dataloaders/preprocess_rp_contig.py +++ b/src/dataloaders/preprocess_rp_contig.py @@ -208,10 +208,12 @@ def main(): _data_attr = distill_config['dataset']['dataset_config']['train_data'] _data_attr = '-d='.join(_data_attr).replace('/', '_').replace('.json', '') _data_attr = _data_attr.replace('[','_').replace(']','') + + dataset_config = distill_config.dataset.dataset_config # fname = f'd={_data_attr}-nts={num_train_samples}-mts={max_train_samples}-dcs={chunk_size}-max={max_length}-min={min_length}-s={seed}' fname = f'd={_data_attr}-mts={max_train_samples}-dcs={chunk_size}-max={max_length}-min={min_length}-s={seed}' - fname = join('./src/dataloaders', fname) + fname = join(dataset_config['dataloaders_dir'], 'redpajama_sample_indices', fname) # Rank samples by effective sequence length _train_esl = train_esl.mean(0).mean(0).mean(-1) # num_samples diff --git a/src/dataloaders/redpajama_sample_contig.py b/src/dataloaders/redpajama_sample_contig.py index a0b80dd..1a4202e 100644 --- a/src/dataloaders/redpajama_sample_contig.py +++ b/src/dataloaders/redpajama_sample_contig.py @@ -116,13 +116,11 @@ def load_data(name: str, dataset_config: dict, pretrained_model_config: dict, _data_attr = _data_attr.replace('[','_').replace(']','') # fname = f'd={_data_attr}-nts={num_train_samples}-mts={max_train_samples}-dcs={chunk_size}-max={max_length}-min={min_length}-s={seed}' - fname = f'd={_data_attr}-mts={max_train_samples}-dcs={chunk_size}-max={max_length}-min={min_length}-s={seed}' - fname = join(dataset_config['dataloaders_dir'], fname) try: fname = f'd={_data_attr}-mts={max_train_samples}-dcs={chunk_size}-max={max_length}-min={min_length}-s={seed}' - fname = join(dataset_config['dataloaders_dir'], fname) + fname = join(dataset_config['dataloaders_dir'], 'redpajama_sample_indices', fname) if dataset_config['filter_window'] > 0: sorted_idx = np.load(f'{fname}_l{window:03d}.npy') else: @@ -145,14 +143,31 @@ def load_data(name: str, dataset_config: dict, pretrained_model_config: dict, _train_esl = train_esl.mean(0).mean(0).mean(-1) # num_samples sorted_idx = torch.argsort(_train_esl, dim=-1, descending=True) # Save indices to generated filename + fname = f'd={_data_attr}-mts={max_train_samples}-dcs={chunk_size}-max={max_length}-min={min_length}-s={seed}' + fname = join(dataset_config['dataloaders_dir'], 'redpajama_sample_indices', fname) np.save(f'{fname}.npy', sorted_idx) print(f'-> Top {num_train_samples} saved to {fname}!') - - # _train_esl = train_esl[..., -128:].mean(0).mean(0).mean(-1) # num_samples - # sorted_idx = torch.argsort(_train_esl, dim=-1, descending=True) - # # Save indices to generated filename - # np.save(f'{fname}_l128.npy', sorted_idx) - # print(f'-> Top {num_train_samples} saved to {fname}!') + + # Also sort by computing sequence lengths over last window tokens + for window in [1, 2, 4, 8, 16, 32, 64, 128]: + _train_esl = train_esl[..., -window:].mean(0).mean(0).mean(-1) # num_samples + sorted_idx = torch.argsort(_train_esl, dim=-1, descending=True) + # Save indices to generated filename + try: + _fname = f'{fname}_l{window:03d}.npy' + np.save(_fname, sorted_idx) + print(f'-> Samples saved to {_fname}!') + + # Also save top samples + sample_idx = sorted_idx[:num_train_samples].numpy() + _fname = f'{fname}-nts={num_train_samples}_l{window:03d}.npy' + np.save(_fname, sample_idx) # sorted_idx) + print(f'-> Top {num_train_samples} saved to {_fname}!') + except: + sample_idx = sorted_idx[:num_train_samples].numpy() + _fname = f'{fname}-nts={num_train_samples}_l{window:03d}.npy' + np.save(_fname, sample_idx) # sorted_idx) + print(f'-> Top {num_train_samples} saved to {_fname}!') sample_idx = sorted_idx[:num_train_samples].numpy() train_set.filtered_samples = [train_set.filtered_samples[ix] for ix in sample_idx]