@@ -15,17 +15,28 @@
 """Module that contains the `run_models` wrapper for training models from TF Model Garden."""
 
 import os
+import pickle
+import shutil
 from typing import Any, Dict, Optional
+import uuid
 
+from . import constants
 from .. import machine_config
 from .. import run
 import tensorflow as tf
 import tensorflow_datasets as tfds
 
-from official.core import train_lib
 from official.vision.image_classification.efficientnet import efficientnet_model
 from official.vision.image_classification.resnet import resnet_model
 
+# pylint: disable=g-import-not-at-top
+try:
+  import importlib.resources as pkg_resources
+except ImportError:
+  # Backported for python<3.7
+  import importlib_resources as pkg_resources
+# pylint: enable=g-import-not-at-top
+
 
 def run_models(dataset_name: str,
                model_name: str,
@@ -239,7 +250,7 @@ def run_experiment_cloud(run_experiment_kwargs: Dict[str, Any],
     run_experiment_kwargs: keyword arguments for `train_lib.run_experiment`.
       The docs can be found at
       https://github.com/tensorflow/models/blob/master/official/core/train_lib.py
-      The distribution_strategy param is ignored because the distirbution
+      The distribution_strategy param is ignored because the distribution
       strategy is selected based on run_kwargs.
     run_kwargs: keyword arguments for `tfc.run`. The docs can be found at
       https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/run.py
@@ -251,48 +262,42 @@ def run_experiment_cloud(run_experiment_kwargs: Dict[str, Any],
   """
   if run_kwargs is None:
     run_kwargs = dict()
-
-  if run.remote():
-    default_machine_config = machine_config.COMMON_MACHINE_CONFIGS['T4_1X']
-    if 'chief_config' in run_kwargs:
-      chief_config = run_kwargs['chief_config']
-    else:
-      chief_config = default_machine_config
-    if 'worker_count' in run_kwargs:
-      worker_count = run_kwargs['worker_count']
+  distribution_strategy = get_distribution_strategy_str(run_kwargs)
+  run_experiment_kwargs.update(
+      dict(distribution_strategy=distribution_strategy))
+  file_id = str(uuid.uuid4())
+  params_file = save_params(run_experiment_kwargs, file_id)
+
+  with pkg_resources.path(__package__, 'models_entry_point.py') as path:
+    entry_point = f'{file_id}.py'
+    shutil.copyfile(str(path), entry_point)
+  run_kwargs.update(dict(entry_point=entry_point,
+                         distribution_strategy=None))
+  info = run.run(**run_kwargs)
+  os.remove(entry_point)
+  os.remove(params_file)
+  return info
+
+
+def get_distribution_strategy_str(run_kwargs):
+  """Gets the name of a distribution strategy based on cloud run config."""
+  if ('worker_count' in run_kwargs
+      and run_kwargs['worker_count'] > 0):
+    if ('worker_config' in run_kwargs
+        and machine_config.is_tpu_config(run_kwargs['worker_config'])):
+      return 'tpu'
     else:
-      worker_count = 0
-    if 'worker_config' in run_kwargs:
-      worker_config = run_kwargs['worker_config']
-    else:
-      worker_config = default_machine_config
-    distribution_strategy = get_distribution_strategy(chief_config,
-                                                      worker_count,
-                                                      worker_config)
-    run_experiment_kwargs.update(
-        dict(distribution_strategy=distribution_strategy))
-    model, _ = train_lib.run_experiment(**run_experiment_kwargs)
-    model.save(run_experiment_kwargs['model_dir'])
-
-  run_kwargs.update(dict(entry_point=None,
-                         distribution_strategy=None))
-  return run.run(**run_kwargs)
-
-
-def get_distribution_strategy(chief_config, worker_count, worker_config):
-  """Gets a tf distribution strategy based on the cloud run config."""
-  if worker_count > 0:
-    if machine_config.is_tpu_config(worker_config):
-      # TODO(b/194857231) Dependency conflict for using TPUs
-      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-          tpu='local')
-      tf.config.experimental_connect_to_cluster(resolver)
-      tf.tpu.experimental.initialize_tpu_system(resolver)
-      return tf.distribute.TPUStrategy(resolver)
-    else:
-      # TODO(b/148619319) Saving model currently failing
-      return tf.distribute.MultiWorkerMirroredStrategy()
-  elif chief_config.accelerator_count > 1:
-    return tf.distribute.MirroredStrategy()
+      return 'multi_mirror'
+  elif ('chief_config' in run_kwargs
+        and run_kwargs['chief_config'].accelerator_count > 1):
+    return 'mirror'
   else:
-    return tf.distribute.OneDeviceStrategy(device='/gpu:0')
+    return 'one_device'
+
+
+def save_params(params, file_id):
+  """Pickles the params object using the file_id as prefix."""
+  file_name = constants.PARAMS_FILE_NAME_FORMAT.format(file_id)
+  with open(file_name, 'xb') as f:
+    pickle.dump(params, f)
+  return file_name
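
Note for reviewers: `models_entry_point.py` itself is not touched by this diff, so the following is only a sketch of the shape it presumably has. It reverses `save_params` and maps the strategy strings back to concrete `tf.distribute` strategies, roughly the logic that the removed `get_distribution_strategy` helper contained. `PARAMS_FILE` and `strategy_from_str` are placeholders introduced here, not names from the diff; the real script presumably derives the pickle path from its own `file_id`-based filename via `constants.PARAMS_FILE_NAME_FORMAT`.

```python
# Hypothetical sketch of models_entry_point.py (not part of this diff).
import pickle

import tensorflow as tf
from official.core import train_lib

PARAMS_FILE = 'REPLACE_ME.pkl'  # placeholder; the real script derives this path


def strategy_from_str(name):
  """Inverse of get_distribution_strategy_str: strategy name -> tf.distribute object."""
  if name == 'tpu':
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    return tf.distribute.TPUStrategy(resolver)
  if name == 'multi_mirror':
    return tf.distribute.MultiWorkerMirroredStrategy()
  if name == 'mirror':
    return tf.distribute.MirroredStrategy()
  return tf.distribute.OneDeviceStrategy(device='/gpu:0')


with open(PARAMS_FILE, 'rb') as f:
  run_experiment_kwargs = pickle.load(f)

# Swap the serialized strategy name for a real strategy, then train and save,
# mirroring the model.save() call that this diff removes from the client side.
name = run_experiment_kwargs['distribution_strategy']
run_experiment_kwargs['distribution_strategy'] = strategy_from_str(name)
model, _ = train_lib.run_experiment(**run_experiment_kwargs)
model.save(run_experiment_kwargs['model_dir'])
```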
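For completeness, a hypothetical call site under the new flow. The import paths, the experiment name `'resnet_imagenet'`, and the GCS path are illustrative assumptions; `COMMON_MACHINE_CONFIGS['T4_1X']` is the same config the removed code used as its default.

```python
# Hypothetical usage sketch; experiment name and GCS path are placeholders.
from official.core import exp_factory, task_factory
from tensorflow_cloud.core import machine_config
from tensorflow_cloud.core.experimental.models import run_experiment_cloud  # assumed path

params = exp_factory.get_exp_config('resnet_imagenet')  # assumed experiment name
task = task_factory.get_task(params.task)

info = run_experiment_cloud(
    run_experiment_kwargs=dict(
        task=task,
        mode='train_and_eval',
        params=params,
        model_dir='gs://some-bucket/model_dir',
    ),
    run_kwargs=dict(
        worker_count=2,  # >0 with non-TPU workers, so the strategy str is 'multi_mirror'
        worker_config=machine_config.COMMON_MACHINE_CONFIGS['T4_1X'],
    ),
)
```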