-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdownload_data.py
52 lines (43 loc) · 2.18 KB
/
download_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import requests
import gzip
import shutil
import os
import pandas as pd
import argparse
from tqdm import tqdm
from utils.constants import AMAZON_CATEGORIES, DATA_PATH
def fetch_or_resume(url, out):
with open(out, 'ab') as f:
headers = {}
pos = f.tell()
if pos:
headers['Range'] = f'bytes={pos}-'
response = requests.get(url, headers=headers, stream=True)
total_size = int(response.headers.get('content-length'))
for data in tqdm(iterable=response.iter_content(chunk_size=1024), total=total_size//1024, unit='KB'):
f.write(data)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', '-d', type=str, default='amazon', help='name of dataset')
args, _ = parser.parse_known_args()
if args.dataset.lower() == 'amazon':
# Old URL: wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games.json.gz
base_url = 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/'
prefix = ''
suffix_url = '_5.json.gz'
output_dir = os.path.join(DATA_PATH, args.dataset.lower(), 'compressed')
for cat in AMAZON_CATEGORIES:
filename = prefix + cat.replace(' ', '_') + suffix_url
if not os.path.exists(os.path.join(output_dir, filename)):
print(f'\nDownloading {cat} subset from Amazon dataset...')
os.system(f'wget {base_url + filename} -P {output_dir} --no-check-certificate')
# cURL alternative
# os.system(f'curl {base_url + filename} -o {output_dir}/{filename}')
# if not os.path.exists(os.path.join(output_dir, filename)):
# with gzip.open(os.path.join(output_dir, filename), 'rb') as f_in:
# with open(os.path.join(output_dir, cat.replace(' ', '-') + 'pkl'), 'wb') as f_out:
# shutil.copyfileobj(f_in, f_out)
else:
print(f'\n{cat} subset from Amazon dataset already exists...')
else:
raise NotImplementedError('The download of the provided dataset is not supported yet')