forked from arXiv/arxiv-browse
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathget_test_article.py
81 lines (58 loc) · 2.9 KB
/
get_test_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""For an arXiv id, gets from the production bucket all related files.
Files such as abs, src and contents of the ps_cache and saves them in the test
directories.
TODO Sanitizes them of email addresses
"""
import os
import sys
import argparse
from pathlib import Path
from arxiv.files import key_patterns
from arxiv.identifier import Identifier
from google.cloud import storage
def get_article_for_test(bucket, save_base_dir: str, arxiv_id: Identifier):
    """Copy the files related to one arXiv id from the bucket to a local dir.

    The current abs file is fetched as a single known key. After that, every
    object whose key starts with one of three prefixes is fetched, each prefix
    being ``<parent>/<arxiv_id.filename>``:

    * the current-version parent path,
    * the original-versions parent path,
    * the ps_cache ``pdf`` part.

    Args:
        bucket: Bucket object to read from.
        save_base_dir: Local base directory; object keys are recreated
            underneath it.
        arxiv_id: Identifier of the article to fetch.

    Raises:
        Exception: propagated from ``get_object_for_test`` when the current
            abs key is not present in the bucket.
    """
    # The abs file has an exact key, so it is fetched directly, not by prefix.
    get_object_for_test(
        bucket, save_base_dir, key_patterns.abs_path_current(arxiv_id)
    )

    # Each builder yields the parent path for one storage area; the article's
    # filename is appended to form the key prefix that is listed and copied.
    parent_builders = (
        key_patterns.abs_path_current_parent,
        key_patterns.abs_path_orig_parent,
        lambda aid: key_patterns._ps_cache_part('pdf', aid),
    )
    for build_parent in parent_builders:
        prefix = f"{build_parent(arxiv_id)}/{arxiv_id.filename}"
        get_objs_matching_keyprefix(bucket, save_base_dir, prefix)
def get_objs_matching_keyprefix(bucket, save_base_dir: str, key_prefix: str) -> int:
    """Download every object in ``bucket`` whose key starts with ``key_prefix``.

    Each matching object is handed to ``get_object_for_test``, which skips
    objects that already exist locally.

    Args:
        bucket: Bucket object; its ``client`` is used to list the blobs.
        save_base_dir: Local base directory to save the objects under.
        key_prefix: Key prefix used to select objects in the bucket.

    Returns:
        The number of objects actually copied (objects skipped because they
        already exist locally are not counted).
    """
    print(f"Trying to get all objects in gs://{bucket.name}/{key_prefix}* to {save_base_dir}/")
    # The listing is capped at 100 results, so a prefix matching more objects
    # than that is only partially copied.
    blobs = list(bucket.client.list_blobs(bucket, prefix=key_prefix, max_results=100))
    count = sum(
        get_object_for_test(bucket, save_base_dir, blob.name) for blob in blobs
    )
    print(f"Items in gs://{bucket.name} is {len(blobs)} copied {count}")
    # The signature promises an int; previously the count was computed but
    # never returned, so callers always received None.
    return count
def get_object_for_test(bucket, save_base_dir: str, key: str) -> int:
    """Download one object from ``bucket`` to ``save_base_dir/key``.

    Args:
        bucket: Bucket object to read from.
        save_base_dir: Local base directory; ``key`` is appended to it to
            form the destination path.
        key: Key of the object in the bucket.

    Returns:
        1 when the object was downloaded, 0 when the destination file already
        existed and the download was skipped.

    Raises:
        Exception: if the object does not exist in the bucket. This is checked
            before the local file is looked at.
    """
    remote_uri = f"gs://{bucket.name}/{key}"
    local_label = f"{save_base_dir}/{key}"
    print(f"trying to get {remote_uri} to {local_label}")

    remote = bucket.blob(key)
    if not remote.exists():
        raise Exception(f"Object {key} does not exist in bucket")

    destination = Path(save_base_dir) / key
    if destination.exists():
        print(f"{key} exists locally, skipping")
        return 0

    # Keys contain '/' separators, so intermediate directories may be missing.
    destination.parent.mkdir(parents=True, exist_ok=True)
    remote.download_to_filename(destination)
    print(f"Successfully got {remote_uri} to {local_label}")
    return 1
def sanitize_abs_file(abs_file: Path):
    """Placeholder for sanitizing an abs file; currently does nothing.

    NOTE(review): presumably meant to cover the module-level TODO about
    removing email addresses -- confirm before implementing.
    """
    return None
if __name__ == '__main__':
    # Command-line entry point: parse the three positional arguments, check
    # that the bucket exists, then copy the article's files locally.
    cli = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    positional_args = (
        ('id', str, "id of article to get"),
        ('save_base_dir', Path, "directory to save data in ex ./tests/data/abs_files"),
        ('bucket', str, "GS bucket name. Do not prefix with gs://"),
    )
    for arg_name, arg_type, arg_help in positional_args:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)
    options = cli.parse_args()

    source_bucket = storage.Client().bucket(options.bucket)
    # Fail early with a clear message instead of failing on the first object.
    if not source_bucket.exists():
        raise Exception(f"GS bucket {source_bucket} does not exist.")

    get_article_for_test(
        source_bucket, options.save_base_dir, Identifier(options.id)
    )