Initial renames for TF 2.0 compatibility

PiperOrigin-RevId: 230017293
Ryan Sepassi authored and Copybara-Service committed Jan 19, 2019
1 parent 54f5b66 commit 107d343
Showing 56 changed files with 283 additions and 258 deletions.
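The hunks below apply one consistent set of renames. A minimal sketch of the pattern, assembled from the changes in this commit (the path is a placeholder used only for illustration):

```python
import os

import tensorflow as tf
from absl import logging

data_dir = "/tmp/tfds_example"  # placeholder path, illustration only

# Filesystem access: tf.gfile.* becomes tf.io.gfile.* (lowercase method names).
if not tf.io.gfile.exists(data_dir):           # was: tf.gfile.Exists
  tf.io.gfile.makedirs(data_dir)               # was: tf.gfile.MakeDirs
for fname in tf.io.gfile.listdir(data_dir):    # was: tf.gfile.ListDirectory
  logging.info("found %s", fname)              # was: tf.logging.info

# File handles: tf.gfile.Open becomes tf.io.gfile.GFile.
with tf.io.gfile.GFile(os.path.join(data_dir, "note.txt"), "w") as f:
  f.write("hello")
```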
7 changes: 0 additions & 7 deletions .travis.yml
@@ -10,14 +10,7 @@ python:
- "2.7"
- "3.6"
env:
global:
- TF_LATEST="1.10.*"
matrix:
# We test against recent versions of TensorFlow and tf-nightly.
# If updating, also update TF_LATEST above
# TODO(rsepassi): Enable recent versions on release
# - TF_VERSION="1.9.*"
# - TF_VERSION="1.10.*"
- TF_VERSION="tf-nightly"
install:
- ./oss_scripts/oss_pip_install.sh
11 changes: 6 additions & 5 deletions docs/add_dataset.md
@@ -12,7 +12,7 @@ then this document is for you.
* [Manual download / extraction](#manual-download-extraction)
* [Specifying how the data should be split](#specifying-how-the-data-should-be-split)
* [Reading downloaded data and generating serialized dataset](#reading-downloaded-data-and-generating-serialized-dataset)
* [File access and tf.gfile](#file-access-and-tfgfile)
* [File access and tf.io.gfile](#file-access-and-tfgfile)
* [Dataset configuration](#dataset-configuration)
* [Create your own FeatureConnector](#create-your-own-featureconnector)
* [Adding the dataset to `tensorflow/datasets`](#adding-the-dataset-to-tensorflowdatasets)
@@ -230,11 +230,12 @@ jpeg content into the TFRecord file automatically.

If you've implemented the test harness, your builder test should now pass.

### File access and `tf.gfile`
### File access and `tf.io.gfile`

In order to support Cloud storage systems, all file access must use `tf.gfile`
or other TensorFlow file APIs (for example, `tf.python_io`). Python built-ins
for file operations (e.g. `open`, `os.rename`, `gzip`, etc.) must be avoided.
In order to support Cloud storage systems, all file access must use
`tf.io.gfile` or other TensorFlow file APIs (for example, `tf.python_io`).
Python built-ins for file operations (e.g. `open`, `os.rename`, `gzip`, etc.)
must be avoided.
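
As a rough sketch of the intent (the path below is a placeholder), this is the same write/read pattern the `dataset_info.py` hunks later in this commit use:

```python
import os

import tensorflow as tf

path = os.path.join("/tmp", "example.txt")  # placeholder, illustration only

# tf.io.gfile.GFile works for local paths as well as Cloud storage paths
# (e.g. gs://...), unlike the Python built-in open().
with tf.io.gfile.GFile(path, "w") as f:
  f.write("hello")

with tf.io.gfile.GFile(path, "r") as f:
  print(f.read())
```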

## Dataset configuration

15 changes: 11 additions & 4 deletions oss_scripts/oss_pip_install.sh
@@ -6,13 +6,20 @@ set -e # fail and exit on any command erroring
: "${TF_VERSION:?}"

# Install ffmpeg for Audio FeatureConnector tests
sudo add-apt-repository -y ppa:mc3man/trusty-media
sudo apt-get -qq update
sudo apt-get install -y ffmpeg
FFMPEG=$(command -v ffmpeg)
if [[ -z "$FFMPEG" ]]
then
sudo add-apt-repository -y ppa:mc3man/trusty-media
sudo apt-get -qq update
sudo apt-get install -y ffmpeg
fi

if [[ "$TF_VERSION" == "tf-nightly" ]]
then
pip install tf-nightly;
pip install -q tf-nightly;
elif [[ "$TF_VERSION" == "tf2" ]]
then
pip install -q "tf-nightly-2.0-preview"
else
pip install -q "tensorflow==$TF_VERSION"
fi
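
Not part of this commit, but a quick sanity check one might run after this script, assuming the install succeeded:

```python
import tensorflow as tf

# Confirm which build was installed (tf-nightly, the 2.0 preview, or a pinned 1.x).
print("TensorFlow version:", tf.__version__)

# The tf.io.gfile module relied on throughout this commit should be importable;
# getattr guards against very old 1.x builds that predate tf.io.
print("tf.io.gfile available:", hasattr(getattr(tf, "io", None), "gfile"))
```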
4 changes: 2 additions & 2 deletions setup.py
@@ -32,6 +32,7 @@
DOCLINES = __doc__.split('\n')

REQUIRED_PKGS = [
'absl-py',
'future',
'promise',
'protobuf>=3.6.1',
@@ -47,7 +48,6 @@
]

TESTS_REQUIRE = [
'absl-py',
'jupyter',
'pytest',
]
@@ -77,7 +77,7 @@
]

DATASET_EXTRAS = {
'librispeech': ['pydub'],
'librispeech': ['pydub'], # and ffmpeg installed
}

all_dataset_extras = []
4 changes: 2 additions & 2 deletions tensorflow_datasets/audio/librispeech.py
@@ -234,7 +234,7 @@ def _generate_examples(self, dirs):
def _walk_librispeech_dir(directory):
"""Walk a Librispeech directory and yield examples."""
directory = os.path.join(directory, "LibriSpeech")
for path, _, files in tf.gfile.Walk(directory):
for path, _, files in tf.io.gfile.walk(directory):
if not files:
continue

@@ -244,7 +244,7 @@ def _walk_librispeech_dir(directory):
assert len(transcript_file) == 1
transcript_file, = transcript_file
transcripts = {}
with tf.gfile.Open(os.path.join(path, transcript_file)) as f:
with tf.io.gfile.GFile(os.path.join(path, transcript_file)) as f:
for line in f:
line = line.strip()
key, transcript = line.split(" ", 1)
39 changes: 20 additions & 19 deletions tensorflow_datasets/core/dataset_builder.py
@@ -23,6 +23,7 @@
import os
import sys

from absl import logging
import six
import tensorflow as tf

Expand Down Expand Up @@ -108,7 +109,7 @@ class DatasetBuilder(object):
# And then the rest of your input pipeline
train_dataset = train_dataset.repeat().shuffle(1024).batch(128)
train_dataset = train_dataset.prefetch(2)
features = train_dataset.make_one_shot_iterator().get_next()
features = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next()
image, label = features['image'], features['label']
```
"""
@@ -160,7 +161,7 @@ def __init__(self, data_dir=None, config=None):
self._data_dir = self._build_data_dir()

# Use data version (restored from disk)
if tf.gfile.Exists(self._data_dir):
if tf.io.gfile.exists(self._data_dir):
# Overwrite the current dataset info with the restored data version.
self.info.read_from_directory(self._data_dir)

@@ -197,10 +198,10 @@ def download_and_prepare(self, download_dir=None, download_config=None):
"""

download_config = download_config or download.DownloadConfig()
data_exists = tf.gfile.Exists(self._data_dir)
data_exists = tf.io.gfile.exists(self._data_dir)
if (data_exists and
download_config.download_mode == REUSE_DATASET_IF_EXISTS):
tf.logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
return

dl_manager = self._make_download_manager(
@@ -216,7 +217,7 @@ def download_and_prepare(self, download_dir=None, download_config=None):
"the same version {} already exists. If the dataset has changed, "
"please update the version number.".format(self.name, self._data_dir,
self.info.version))
tf.logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
self._log_download_bytes()

# Create a tmp dir and rename to self._data_dir on successful exit.
@@ -237,8 +238,8 @@ def download_and_prepare(self, download_dir=None, download_config=None):
if download_config.compute_stats:
already_has_stats = bool(self.info.num_examples)
if already_has_stats:
tf.logging.info("Skipping computing stats because they are already "
"populated.")
logging.info("Skipping computing stats because they are already "
"populated.")
else:
self.info.compute_dynamic_properties()

@@ -286,7 +287,7 @@ def as_dataset(self,
If `batch_size` is -1, will return feature dictionaries containing
the entire dataset in `tf.Tensor`s instead of a `tf.data.Dataset`.
"""
if not tf.gfile.Exists(self._data_dir):
if not tf.io.gfile.exists(self._data_dir):
raise AssertionError(
("Dataset %s: could not find data in %s. Please make sure to call "
"dataset_builder.download_and_prepare(), or pass download=True to "
@@ -365,11 +366,11 @@ def _build_data_dir(self):

def _other_versions_on_disk():
"""Returns previous versions on disk."""
if not tf.gfile.Exists(builder_data_dir):
if not tf.io.gfile.exists(builder_data_dir):
return []

version_dirnames = []
for dir_name in tf.gfile.ListDirectory(builder_data_dir):
for dir_name in tf.io.gfile.listdir(builder_data_dir):
try:
version_dirnames.append((utils.Version(dir_name), dir_name))
except ValueError: # Invalid version (ex: incomplete data dir)
@@ -382,14 +383,15 @@ def _other_versions_on_disk():
if version_dirs:
other_version = version_dirs[0][0]
if other_version != self._version:
tf.logging.warn(
warn_msg = (
"Found a different version {other_version} of dataset {name} in "
"data_dir {data_dir}. Using currently defined version "
"{cur_version}.".format(
other_version=str(other_version),
name=self.name,
data_dir=self._data_dir_root,
cur_version=str(self._version)))
logging.warn(warn_msg)

return version_data_dir

@@ -479,8 +481,8 @@ def _create_builder_config(self, builder_config):
"""Create and validate BuilderConfig object."""
if builder_config is None and self.BUILDER_CONFIGS:
builder_config = self.BUILDER_CONFIGS[0]
tf.logging.info("No config specified, defaulting to first: %s/%s",
self.name, builder_config.name)
logging.info("No config specified, defaulting to first: %s/%s", self.name,
builder_config.name)
if not builder_config:
return
if isinstance(builder_config, six.string_types):
@@ -494,7 +496,7 @@ def _create_builder_config(self, builder_config):
raise ValueError("BuilderConfig must have a name, got %s" % name)
is_custom = name not in self.builder_configs
if is_custom:
tf.logging.warning("Using custom data configuration %s", name)
logging.warning("Using custom data configuration %s", name)
else:
if builder_config is not self.builder_configs[name]:
raise ValueError(
@@ -639,10 +641,9 @@ def _generate_examples(self, **kwargs):

def _download_and_prepare(self, dl_manager, max_examples_per_split=None):
if max_examples_per_split is not None:
tf.logging.warn("Splits capped at %s examples max.",
max_examples_per_split)
if not tf.gfile.Exists(self._data_dir):
tf.gfile.MakeDirs(self._data_dir)
logging.warn("Splits capped at %s examples max.", max_examples_per_split)
if not tf.io.gfile.exists(self._data_dir):
tf.io.gfile.makedirs(self._data_dir)

# Generate the filenames and write the example on disk
def make_generator_fn(**kwargs):
@@ -671,7 +672,7 @@ def generator_fn():
"._split_generator()."
)

tf.logging.info("Generating split %s", s.name)
logging.info("Generating split %s", s.name)
split_dict.add(s)

output_files = self._build_split_filenames(
21 changes: 11 additions & 10 deletions tensorflow_datasets/core/dataset_builder_test.py
@@ -21,6 +21,7 @@

import os

from absl import logging
from absl.testing import parameterized
import tensorflow as tf
from tensorflow_datasets.core import dataset_builder
@@ -103,7 +104,7 @@ def test_shared_generator(self):

written_filepaths = [
os.path.join(builder._data_dir, fname)
for fname in tf.gfile.ListDirectory(builder._data_dir)
for fname in tf.io.gfile.listdir(builder._data_dir)
]
# The data_dir contains the cached directory by default
expected_filepaths = builder._build_split_filenames(
@@ -154,10 +155,10 @@ def test_build_data_dir(self):
version_dir = os.path.join(builder_data_dir, "1.0.0")

# The dataset folder contains multiple other versions
tf.gfile.MakeDirs(os.path.join(builder_data_dir, "14.0.0.invalid"))
tf.gfile.MakeDirs(os.path.join(builder_data_dir, "10.0.0"))
tf.gfile.MakeDirs(os.path.join(builder_data_dir, "9.0.0"))
tf.gfile.MakeDirs(os.path.join(builder_data_dir, "0.1.0"))
tf.io.gfile.makedirs(os.path.join(builder_data_dir, "14.0.0.invalid"))
tf.io.gfile.makedirs(os.path.join(builder_data_dir, "10.0.0"))
tf.io.gfile.makedirs(os.path.join(builder_data_dir, "9.0.0"))
tf.io.gfile.makedirs(os.path.join(builder_data_dir, "0.1.0"))

# The builder's version dir is chosen
self.assertEqual(builder._build_data_dir(), version_dir)
@@ -170,7 +171,7 @@ def test_get_data_dir_with_config(self):
builder_data_dir = os.path.join(tmp_dir, builder.name, config_name)
version_data_dir = os.path.join(builder_data_dir, "0.0.1")

tf.gfile.MakeDirs(version_data_dir)
tf.io.gfile.makedirs(version_data_dir)
self.assertEqual(builder._build_data_dir(), version_data_dir)

def test_config_construction(self):
@@ -201,11 +202,11 @@ def test_with_configs(self):
data_dir1 = os.path.join(tmp_dir, builder1.name, "plus1", "0.0.1")
data_dir2 = os.path.join(tmp_dir, builder2.name, "plus2", "0.0.2")
# Test that subdirectories were created per config
self.assertTrue(tf.gfile.Exists(data_dir1))
self.assertTrue(tf.gfile.Exists(data_dir2))
self.assertTrue(tf.io.gfile.exists(data_dir1))
self.assertTrue(tf.io.gfile.exists(data_dir2))
# 2 train shards, 1 test shard, plus metadata files
self.assertGreater(len(tf.gfile.ListDirectory(data_dir1)), 3)
self.assertGreater(len(tf.gfile.ListDirectory(data_dir2)), 3)
self.assertGreater(len(tf.io.gfile.listdir(data_dir1)), 3)
self.assertGreater(len(tf.io.gfile.listdir(data_dir2)), 3)

# Test that the config was used and they didn't collide.
splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
23 changes: 12 additions & 11 deletions tensorflow_datasets/core/dataset_info.py
@@ -38,6 +38,7 @@
import os
import pprint

from absl import logging
import numpy as np
import tensorflow as tf

@@ -178,8 +179,8 @@ def supervised_keys(self):
def splits(self):
if not self._fully_initialized:
# TODO(epot): Consider raising an error here instead?
tf.logging.info("`splits` hasn't been fully initialized, statistics maybe"
" missing.")
logging.info("`splits` hasn't been fully initialized, statistics maybe"
" missing.")
return self._splits.copy()

@splits.setter
@@ -246,11 +247,10 @@ def _compute_dynamic_properties(self, builder):
except tf.errors.InvalidArgumentError:
# This means there is no such split, even though it was specified in the
# info, the least we can do is to log this.
tf.logging.error((
"%s's info() property specifies split %s, but it "
"doesn't seem to have been generated. Please ensure "
"that the data was downloaded for this split and re-run "
"download_and_prepare."), self.name, split_name)
logging.error(("%s's info() property specifies split %s, but it "
"doesn't seem to have been generated. Please ensure "
"that the data was downloaded for this split and re-run "
"download_and_prepare."), self.name, split_name)
raise

# Set splits to trigger proto update in setter
@@ -275,7 +275,8 @@ def write_to_directory(self, dataset_info_dir):
if self.features:
self.features.save_metadata(dataset_info_dir)

with tf.gfile.Open(self._dataset_info_filename(dataset_info_dir), "w") as f:
with tf.io.gfile.GFile(self._dataset_info_filename(dataset_info_dir),
"w") as f:
f.write(self.as_json)

def read_from_directory(self, dataset_info_dir, from_packaged_data=False):
@@ -302,10 +303,10 @@ def read_from_directory(self, dataset_info_dir, from_packaged_data=False):
json_filename = self._dataset_info_filename(dataset_info_dir)

# Load the metadata from disk
if not tf.gfile.Exists(json_filename):
if not tf.io.gfile.exists(json_filename):
return False

with tf.gfile.Open(json_filename, "r") as f:
with tf.io.gfile.GFile(json_filename, "r") as f:
dataset_info_json_str = f.read()

# Parse it back into a proto.
@@ -486,7 +487,7 @@ def get_dataset_feature_statistics(builder, split):
# proto has no support for it.
maybe_feature_shape = output_shapes_dict[feature_name]
if not isinstance(maybe_feature_shape, tf.TensorShape):
tf.logging.error(
logging.error(
"Statistics generation doesn't work for nested structures yet")
continue

4 changes: 2 additions & 2 deletions tensorflow_datasets/core/dataset_info_test.py
@@ -113,15 +113,15 @@ def test_writing(self):
info.read_from_directory(_INFO_DIR)

# Read the json file into a string.
with tf.gfile.Open(info._dataset_info_filename(_INFO_DIR)) as f:
with tf.io.gfile.GFile(info._dataset_info_filename(_INFO_DIR)) as f:
existing_json = json.load(f)

# Now write to a temp directory.
with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
info.write_to_directory(tmp_dir)

# Read the newly written json file into a string.
with tf.gfile.Open(info._dataset_info_filename(tmp_dir)) as f:
with tf.io.gfile.GFile(info._dataset_info_filename(tmp_dir)) as f:
new_json = json.load(f)

# Assert what was read and then written and read again is the same.