Skip to content

Commit e5738d5

Browse files
committed
Replace dump.sh by dump.py that supports pagination, update docs
1 parent 540523e commit e5738d5

File tree

4 files changed

+140
-28
lines changed

4 files changed

+140
-28
lines changed

README.md

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,20 @@ While you can perform backup of the Docker volumes,
176176
for larger upgrades of CWL Viewer it is recommended instead to do a JSON dump
177177
and re-load, which will force CWL Viewer to fetch and parse again.
178178

179-
The script `dump.sh` can be used for regular backups, it will store the full
180-
output of /workflows as a timestamped gzip-compressed JSON file:
181-
182-
$ ./dump.sh https://view.commonwl.org/ /var/backups/cwl
183-
/var/backups/cwl/2018-06-06T135133+0000.json.gz
179+
The script `dump.py` can be used for regular backups; it will store the full
180+
output of /workflows as one or multiple timestamped JSON files (you can use
181+
`gzip` to compress them):
182+
183+
$ python dump.py --viewer https://view.commonwl.org/ --output /var/backups --page 0 --size 100
184+
INFO:Viewer URL: https://view.commonwl.org/
185+
INFO:Output: /var/backups
186+
INFO:Dumping workflows from https://view.commonwl.org/, page 0, size 100 to /var/backups
187+
188+
$ python dump.py -o /var/backups -a
189+
INFO:Viewer URL: https://view.commonwl.org/
190+
INFO:Output: /var/backups
191+
INFO:Dumping all the workflows from https://view.commonwl.org/ to /var/backups
192+
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [04:39<00:00, 17.49s/it]
184193

185194
The script `load.py` (requires Python 3) can be used to restore from such JSON dumps:
186195

docs/mongo-to-postgres/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ be used in the command line.
3535
To install the dependencies, use `pip install -r requirements.txt`
3636
in a virtual environment to get the dependencies to run both the
3737
Notebook and the Python script.
38+
39+
There is also a replacement for `dump.sh`, the `dump.py` script that can
40+
paginate the requests to retrieve the complete database in a single command.

dump.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import argparse
import logging
from datetime import datetime, timezone
from math import ceil
from pathlib import Path
from urllib.parse import urljoin

import requests
from tqdm import tqdm
10+
11+
# Configure root logging once at import time so the CLI prints
# "LEVEL:message" lines, matching the examples in the README
# (e.g. "INFO:Output: /var/backups").
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): main() hard-codes 0 and 10 as its argparse defaults instead
# of referencing DEFAULT_PAGE / DEFAULT_SIZE — consider wiring these up.
DEFAULT_PAGE = 0
DEFAULT_SIZE = 10
# Upper bound accepted for --size; also the page size used when dumping
# everything with --all.
MAX_PAGE_SIZE = 2000
17+
18+
19+
def _get_total_elements(viewer) -> int:
    """
    Return the number of workflow entries in the CWL Viewer database.

    The API exposes no dedicated count endpoint, so we request the
    smallest possible listing (one element of page zero) and read the
    ``totalElements`` field from the paginated response.

    :param viewer: CWL Viewer instance URL
    :return: number of total elements in the CWL Viewer instance DB
    """
    listing: dict = _fetch_workflows_data(viewer, 0, 1).json()
    return int(listing['totalElements'])
30+
31+
32+
def _dump_all_workflows(viewer: str, output: Path) -> None:
    """
    Dump every workflow in the database, one file per page of
    ``MAX_PAGE_SIZE`` entries, with a tqdm progress bar.

    :param viewer: CWL Viewer instance URL
    :param output: Local existing directory
    :return: None
    """
    page_count = ceil(_get_total_elements(viewer) / MAX_PAGE_SIZE)
    for page_number in tqdm(range(page_count)):
        _dump_workflows(viewer, output, page_number, MAX_PAGE_SIZE)
43+
44+
45+
def _dump_workflows(viewer: str, output: Path, page: int, size: int) -> None:
    """
    Dump a single page of workflows to a JSON file in ``output``.

    The file name combines a timezone-aware UTC timestamp with the page
    number.  The page number is included because a second-resolution
    timestamp alone makes pages fetched within the same second silently
    overwrite each other; the timestamp is made timezone-aware because
    ``%z`` expands to an empty string for naive ``datetime.now()``
    values, dropping the ``+0000`` offset the old dump.sh names carried.

    :param viewer: CWL Viewer instance URL
    :param output: Local existing directory
    :param page: Page number (first is zero)
    :param size: Number of elements to retrieve
    :return: None
    """
    response = _fetch_workflows_data(viewer, page, size)
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H%M%S%z')
    file_output = output / f'{timestamp}-page{page}.json'
    logger.debug(f'Dumping page {page}, size {size}, to {file_output}')
    with file_output.open('w', encoding='utf-8') as f:
        f.write(response.text)
61+
62+
63+
def _fetch_workflows_data(viewer: str, page: int, size: int) -> requests.Response:
    """
    Fetch one page of the /workflows listing.

    The returned object is the ``requests.Response``; turn it into JSON
    with ``response.json()``, or get the raw body via the
    ``response.text`` property.

    :param viewer: CWL Viewer instance URL
    :param page: Page number (first is zero)
    :param size: Number of elements to retrieve
    :return: ``requests.Response`` instance
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the server does not respond in time
    """
    logger.debug(f'Fetching page {page}, size {size}')
    url = urljoin(viewer, f'/workflows?page={page}&size={size}')
    logger.debug(f'URL: {url}')
    # A timeout keeps a wedged server from hanging the backup forever, and
    # raise_for_status() turns HTTP errors into exceptions instead of the
    # caller silently writing an error page into the dump file.
    response = requests.get(url, headers={
        'accept': 'application/json'
    }, timeout=60)
    response.raise_for_status()
    return response
80+
81+
82+
def main():
    """Command-line entry point: parse and validate arguments, then dump
    either a single page of workflows or the whole database."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--viewer", help="server base URL", default='https://view.commonwl.org/')
    parser.add_argument("-o", "--output", help="output directory", required=True)
    parser.add_argument("-p", "--page", help="what workflows page to retrieve", type=int, default=0)
    parser.add_argument("-s", "--size", help="how many workflows to retrieve (capped at 2000)", type=int, default=10)
    parser.add_argument("-a", "--all", help="dump all the workflows", action='store_true')
    parser.add_argument("-d", "--debug", help="set logging level to debug", action='store_true')
    args = parser.parse_args()

    # --all is incompatible with explicit paging options.
    if args.all and (args.page > 0 or args.size != 10):
        raise ValueError('You must not specify page or size with all.')
    if args.page < 0:
        raise ValueError('Page must be 0 or greater.')
    if args.size < 1:
        raise ValueError('Size must be at least 1.')
    if args.size > MAX_PAGE_SIZE:
        raise ValueError(f'Size must not be greater than {MAX_PAGE_SIZE}')

    destination = Path(args.output)
    if not (destination.exists() and destination.is_dir()):
        raise ValueError(f'Invalid output directory (not a directory, or does not exist): {args.output}')

    if args.debug:
        logger.setLevel(logging.DEBUG)
    logger.info(f'Viewer URL: {args.viewer}')
    logger.info(f'Output: {args.output}')

    if args.all:
        logger.info(f'Dumping all the workflows from {args.viewer} to {destination}')
        _dump_all_workflows(
            viewer=args.viewer,
            output=destination
        )
    else:
        logger.info(f'Dumping workflows from {args.viewer}, page {args.page}, size {args.size} to {destination}')
        _dump_workflows(
            viewer=args.viewer,
            output=destination,
            page=args.page,
            size=args.size
        )


if __name__ == '__main__':
    main()

dump.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments
 (0)