Download files from "www.statmt.org" sequentially to avoid 503 due to rate-limiting.

PiperOrigin-RevId: 434448610
adarob authored and The TensorFlow Datasets Authors committed Mar 14, 2022
1 parent 9df7605 commit bc7882e
Showing 2 changed files with 42 additions and 6 deletions.
16 changes: 14 additions & 2 deletions tensorflow_datasets/translate/wmt.py
@@ -688,6 +688,7 @@ def _check_manual_files(ds):

    manual_paths = {}
    urls_to_download = {}
    downloaded_files = {}
    for ss_name in itertools.chain.from_iterable(self.subsets.values()):
      if ss_name == "czeng_17":
        # CzEng1.7 is CzEng1.6 with some blocks filtered out. We must download
@@ -698,10 +699,21 @@ def _check_manual_files(ds):
      if ds.get_manual_dl_files(source):
        manual_paths[ss_name] = _check_manual_files(ds)
      else:
        urls_to_download[ss_name] = ds.get_url(source)
        urls = ds.get_url(source)
        # This domain throws a 503 if we attempt to download in parallel.
        sequential_dl_urls = [url for url in urls if "www.statmt.org" in url]
        parallel_dl_urls = [
            url for url in urls if url not in sequential_dl_urls
        ]
        if sequential_dl_urls:
          downloaded_files[ss_name] = [
              dl_manager.download_and_extract(url) for url in sequential_dl_urls
          ] + dl_manager.download_and_extract(parallel_dl_urls)
        else:
          urls_to_download[ss_name] = urls

    # Download and extract files from URLs.
    downloaded_files = dl_manager.download_and_extract(urls_to_download)
    downloaded_files.update(dl_manager.download_and_extract(urls_to_download))
    # Extract manually downloaded files.
    manual_files = dl_manager.extract(manual_paths)
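
For readers skimming the diff: the new logic splits each subset's URLs into a sequential bucket (anything hosted on www.statmt.org, which answers with HTTP 503 when hit in parallel) and a parallel bucket handled by a single download_and_extract call. A minimal, self-contained sketch of that pattern, assuming only a dl_manager whose download_and_extract accepts either one URL or a list of URLs (as tfds.download.DownloadManager does); the helper name is hypothetical and not part of this commit:

def _download_subset_urls(dl_manager, urls):
  # www.statmt.org is rate-limited and returns 503 under parallel requests,
  # so its URLs are fetched one at a time.
  sequential_dl_urls = [url for url in urls if "www.statmt.org" in url]
  parallel_dl_urls = [url for url in urls if url not in sequential_dl_urls]
  paths = [dl_manager.download_and_extract(url) for url in sequential_dl_urls]
  if parallel_dl_urls:
    paths += dl_manager.download_and_extract(parallel_dl_urls)
  return paths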

32 changes: 28 additions & 4 deletions tensorflow_datasets/translate/wmt19_test.py
@@ -25,15 +25,27 @@ class TranslateDeEnWmt19Test(testing.DatasetBuilderTestCase):
OVERLAPPING_SPLITS = ["validation"]

DL_EXTRACT_RESULT = {
"europarl_v9": ["sentences.de-en.tsv"],
"paracrawl_v3": ["sentences.de-en.tmx"],
"commoncrawl": ["commoncrawl"],
"newscommentary_v14": ["sentences.de-en.tsv"],
"wikititles_v1": ["sentences.de-en.tsv"],
"rapid_2019": ["rapid_2019"],
"newstest2018": ["validation"],
}

SEQUENTUAL_DL_EXTRACT_RESULT = {
"http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz":
"commoncrawl",
"http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz":
"sentences.de-en.tsv",
}

def _get_dl_extract_result(self, url):
if not url:
return []
if isinstance(url, dict):
return super()._get_dl_extract_result(url)
return self.dummy_data / self.SEQUENTUAL_DL_EXTRACT_RESULT[url]

SPLITS = {
"train": 12,
"validation": 2,
@@ -47,14 +59,26 @@ class TranslateCsEnWmt19Test(testing.DatasetBuilderTestCase):

  DL_EXTRACT_RESULT = {
      "czeng17_filter": ["czeng"],
      "europarl_v9": ["sentences.cs-en.tsv"],
      "paracrawl_v3": ["sentences.cs-en.tmx"],
      "commoncrawl": ["commoncrawl"],
      "newscommentary_v14": ["sentences.cs-en.tsv"],
      "wikititles_v1": ["sentences.cs-en.tsv"],
      "newstest2018": ["validation"],
  }

  SEQUENTUAL_DL_EXTRACT_RESULT = {
      "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz":
          "commoncrawl",
      "http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz":
          "sentences.cs-en.tsv",
  }

  def _get_dl_extract_result(self, url):
    if not url:
      return []
    if isinstance(url, dict):
      return super()._get_dl_extract_result(url)
    return self.dummy_data / self.SEQUENTUAL_DL_EXTRACT_RESULT[url]

  SPLITS = {
      "train": 13,
      "validation": 2,
