def train(batch_size):
    import tensorflow as tf
    import numpy as np
    import uuid

    BUFFER_SIZE = 10000

    def make_datasets(batch_size):
        # Download to a unique cache path per process so concurrent workers
        # on the same machine do not clobber each other's copy of MNIST.
        (x_train, y_train), _ = tf.keras.datasets.mnist.load_data(
            path=str(uuid.uuid4()) + 'mnist.npz')
        # The `x` arrays are in uint8 and have values in the [0, 255] range.
        # Convert them to float32 with values in the [0, 1] range and add a
        # channel dimension so each image has shape (28, 28, 1), matching the
        # Conv2D input shape below.
        x_train = x_train[..., np.newaxis] / np.float32(255)
        y_train = y_train.astype(np.int64)
        train_dataset = tf.data.Dataset.from_tensor_slices(
            (x_train, y_train)).shuffle(BUFFER_SIZE).repeat(2).batch(batch_size)
        return train_dataset

    def build_and_compile_cnn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(128, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Conv2D(256, 3, activation='relu'),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Conv2D(512, 3, activation='relu'),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dense(512, activation='relu'),
            # Output raw logits; the loss below is configured with from_logits=True.
            tf.keras.layers.Dense(10)
        ])
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=['accuracy'],
        )
        return model

    train_datasets = make_datasets(batch_size)
    multi_worker_model = build_and_compile_cnn_model()

    # Specify the data auto-shard policy: DATA
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = \
        tf.data.experimental.AutoShardPolicy.DATA
    train_datasets = train_datasets.with_options(options)

    multi_worker_model.fit(x=train_datasets, epochs=3)
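With train() defined, a distributed run can be launched through MirroredStrategyRunner from the spark-tensorflow-distributor package. The following is a minimal launch sketch, not taken from the original notebook: the slot count and batch size are illustrative, and it assumes that keyword arguments passed to run() are forwarded to the training function.

from spark_tensorflow_distributor import MirroredStrategyRunner

# Launch train() as a Spark barrier-mode job across 8 slots (typically GPUs).
# num_slots=8 and batch_size=64 are illustrative values, not from the notebook;
# extra keyword arguments given to run() are assumed to be passed on to train().
MirroredStrategyRunner(num_slots=8).run(train, batch_size=64)

Because train() sets the auto-shard policy to AutoShardPolicy.DATA, each worker reads the full MNIST dataset and trains only on the shard of elements assigned to it, which is why make_datasets() does no explicit per-worker splitting.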
Distributed Training with TensorFlow 2
tf.distribute.Strategy is a TensorFlow API for distributing training across multiple GPUs or multiple machines. The spark-tensorflow-distributor package helps you launch distributed training tasks using a Spark job in barrier mode. Users only need to provide a train() function that runs the single-node training code on a GPU or worker node, and the package handles all of the configuration for you.

This notebook demonstrates how to use MirroredStrategyRunner in the spark-tensorflow-distributor package to do distributed training. It also shows how to use your own custom strategy. The example is adapted from https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras.

Requirements