Skip to content

Commit 47625be

Browse files
author
Chris Elion
authored
check min size for visual encoders (#3112)
* check min size for visual encoders
* friendlier exception
* fix typo
1 parent ee81d99 commit 47625be

File tree

2 files changed

+81
-13
lines changed

2 files changed

+81
-13
lines changed

ml-agents/mlagents/trainers/models.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
logger = logging.getLogger("mlagents.trainers")
1212

1313
ActivationFunction = Callable[[tf.Tensor], tf.Tensor]
14+
EncoderFunction = Callable[
15+
[tf.Tensor, int, ActivationFunction, int, str, bool], tf.Tensor
16+
]
1417

1518
EPSILON = 1e-7
1619

@@ -26,9 +29,17 @@ class LearningRateSchedule(Enum):
2629
LINEAR = "linear"
2730

2831

29-
class LearningModel(object):
32+
class LearningModel:
3033
_version_number_ = 2
3134

35+
# Minimum supported side for each encoder type. If refactoring an encoder, please
36+
# adjust these also.
37+
MIN_RESOLUTION_FOR_ENCODER = {
38+
EncoderType.SIMPLE: 20,
39+
EncoderType.NATURE_CNN: 36,
40+
EncoderType.RESNET: 15,
41+
}
42+
3243
def __init__(
3344
self, m_size, normalize, use_recurrent, brain, seed, stream_names=None
3445
):
@@ -427,6 +438,17 @@ def create_resnet_visual_observation_encoder(
427438
)
428439
return hidden_flat
429440

441+
@staticmethod
442+
def get_encoder_for_type(encoder_type: EncoderType) -> EncoderFunction:
443+
ENCODER_FUNCTION_BY_TYPE = {
444+
EncoderType.SIMPLE: LearningModel.create_visual_observation_encoder,
445+
EncoderType.NATURE_CNN: LearningModel.create_nature_cnn_visual_observation_encoder,
446+
EncoderType.RESNET: LearningModel.create_resnet_visual_observation_encoder,
447+
}
448+
return ENCODER_FUNCTION_BY_TYPE.get(
449+
encoder_type, LearningModel.create_visual_observation_encoder
450+
)
451+
430452
@staticmethod
431453
def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
432454
"""
@@ -474,6 +496,17 @@ def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
474496
),
475497
)
476498

499+
@staticmethod
500+
def _check_resolution_for_encoder(
501+
camera_res: CameraResolution, vis_encoder_type: EncoderType
502+
) -> None:
503+
min_res = LearningModel.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
504+
if camera_res.height < min_res or camera_res.width < min_res:
505+
raise UnityTrainerException(
506+
f"Visual observation resolution ({camera_res.width}x{camera_res.height}) is too small for"
507+
f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}"
508+
)
509+
477510
def create_observation_streams(
478511
self,
479512
num_streams: int,
@@ -496,23 +529,20 @@ def create_observation_streams(
496529

497530
self.visual_in = []
498531
for i in range(brain.number_visual_observations):
532+
LearningModel._check_resolution_for_encoder(
533+
brain.camera_resolutions[i], vis_encode_type
534+
)
499535
visual_input = self.create_visual_input(
500536
brain.camera_resolutions[i], name="visual_observation_" + str(i)
501537
)
502538
self.visual_in.append(visual_input)
503539
vector_observation_input = self.create_vector_input()
504540

505-
# Pick the encoder function based on the EncoderType
506-
create_encoder_func = LearningModel.create_visual_observation_encoder
507-
if vis_encode_type == EncoderType.RESNET:
508-
create_encoder_func = LearningModel.create_resnet_visual_observation_encoder
509-
elif vis_encode_type == EncoderType.NATURE_CNN:
510-
create_encoder_func = (
511-
LearningModel.create_nature_cnn_visual_observation_encoder
512-
)
513-
514541
final_hiddens = []
515542
for i in range(num_streams):
543+
# Pick the encoder function based on the EncoderType
544+
create_encoder_func = LearningModel.get_encoder_for_type(vis_encode_type)
545+
516546
visual_encoders = []
517547
hidden_state, hidden_visual = None, None
518548
_scope_add = stream_scopes[i] if stream_scopes else ""
@@ -523,8 +553,8 @@ def create_observation_streams(
523553
h_size,
524554
activation_fn,
525555
num_layers,
526-
scope=f"{_scope_add}main_graph_{i}_encoder{j}",
527-
reuse=False,
556+
f"{_scope_add}main_graph_{i}_encoder{j}", # scope
557+
False, # reuse
528558
)
529559
visual_encoders.append(encoded_visual)
530560
hidden_visual = tf.concat(visual_encoders, axis=1)

ml-agents/mlagents/trainers/tests/test_ppo.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from mlagents.trainers.ppo.models import PPOModel
1010
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
1111
from mlagents.trainers.ppo.policy import PPOPolicy
12-
from mlagents.trainers.brain import BrainParameters
12+
from mlagents.trainers.models import EncoderType, LearningModel
13+
from mlagents.trainers.trainer import UnityTrainerException
14+
from mlagents.trainers.brain import BrainParameters, CameraResolution
1315
from mlagents_envs.environment import UnityEnvironment
1416
from mlagents_envs.mock_communicator import MockCommunicator
1517
from mlagents.trainers.tests import mock_brain as mb
@@ -499,5 +501,41 @@ def test_normalization(dummy_config):
499501
assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
500502

501503

504+
def test_min_visual_size():
    # Make sure each EncoderType has an entry in MIN_RESOLUTION_FOR_ENCODER
    assert set(LearningModel.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)

    for encoder_type in EncoderType:
        # A fresh graph per encoder keeps op names from colliding across iterations.
        with tf.Graph().as_default():
            # At exactly the minimum resolution, validation and encoder
            # construction must both succeed without raising.
            good_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
            good_res = CameraResolution(
                width=good_size, height=good_size, num_channels=3
            )
            LearningModel._check_resolution_for_encoder(good_res, encoder_type)
            vis_input = LearningModel.create_visual_input(
                good_res, "test_min_visual_size"
            )
            enc_func = LearningModel.get_encoder_for_type(encoder_type)
            enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)

        # Anything under the min size should raise an exception. If not, decrease the min size!
        with pytest.raises(Exception):
            with tf.Graph().as_default():
                bad_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
                bad_res = CameraResolution(
                    width=bad_size, height=bad_size, num_channels=3
                )

                # The inner raises-check consumes the friendly validation
                # error; building the encoder afterwards must still fail,
                # which satisfies the outer pytest.raises(Exception).
                with pytest.raises(UnityTrainerException):
                    # Make sure we'd hit a friendly error during model setup time.
                    LearningModel._check_resolution_for_encoder(bad_res, encoder_type)

                vis_input = LearningModel.create_visual_input(
                    bad_res, "test_min_visual_size"
                )
                enc_func = LearningModel.get_encoder_for_type(encoder_type)
                enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)
539+
502540
if __name__ == "__main__":
503541
pytest.main()

0 commit comments

Comments
 (0)