import os

from tensorflow.keras.datasets.mnist import load_data as mnist_load_data
from tensorflow.keras.utils import to_categorical as keras_to_categorical

def get_dataset(num_classes, rank=0, size=1):
    # Download MNIST into a rank-specific directory so concurrent workers
    # do not overwrite each other's files.
    (x_train, y_train), (x_test, y_test) = mnist_load_data('MNIST-data-%d' % rank)
    # Shard the data: each worker takes every size-th sample, offset by its rank.
    x_train = x_train[rank::size]
    y_train = y_train[rank::size]
    x_test = x_test[rank::size]
    y_test = y_test[rank::size]
    # Add a channel dimension and scale pixel values to [0, 1].
    x_train = x_train.reshape(x_train.shape[0], 28, 28, 1).astype('float32') / 255
    x_test = x_test.reshape(x_test.shape[0], 28, 28, 1).astype('float32') / 255
    # One-hot encode the labels.
    y_train = keras_to_categorical(y_train, num_classes)
    y_test = keras_to_categorical(y_test, num_classes)
    return (x_train, y_train), (x_test, y_test)
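The [rank::size] slicing shards the training set across workers: each of the size processes takes every size-th example starting at its own rank, so together the workers cover the full dataset with no overlap. A quick illustration of the pattern:

samples = list(range(8))
size = 2  # number of Horovod workers
for rank in range(size):
    # rank 0 -> [0, 2, 4, 6], rank 1 -> [1, 3, 5, 7]
    print(rank, samples[rank::size])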
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense

def get_model(num_classes):
    # Small convolutional network for 28x28 grayscale MNIST images.
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3),
                     activation='relu',
                     input_shape=(28, 28, 1)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model
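As a quick local sanity check (this is not part of the distributed run), you can instantiate the model on the driver and inspect its layer stack:

model = get_model(10)
model.summary()  # prints each layer's output shape and parameter count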
# Specify training parameters
epochs = 2
num_classes = 10

def train_hvd(learning_rate, batch_size, checkpoint_dir):
    """
    This function is passed to HorovodRunner and executed on each worker.
    It receives the hyperparameters we tune with Hyperopt.
    """
    # Import TensorFlow modules on each worker
    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    # Initialize Horovod
    hvd.init()

    # Pin each process to one GPU (local rank n uses GPU n).
    # These steps are skipped on a CPU cluster.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    (x_train, y_train), (x_test, y_test) = get_dataset(num_classes, hvd.rank(), hvd.size())
    model = get_model(num_classes)

    # Scale the learning rate by the number of workers so the effective step
    # size keeps pace with the larger global batch size.
    optimizer = keras.optimizers.Adadelta(learning_rate=learning_rate * hvd.size())

    # Wrap the optimizer with Horovod's distributed optimizer, which averages
    # gradients across workers before applying updates.
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Broadcast the initial variable states from rank 0 to all other processes.
    # This ensures consistent initialization whether training starts from
    # random weights or from a restored checkpoint.
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Save checkpoints only on worker 0 to prevent conflicts between workers.
    if hvd.rank() == 0:
        param_str = 'learning_rate_{lr}_batch_size_{bs}'.format(lr=learning_rate, bs=batch_size)
        checkpoint_dir_for_this_trial = os.path.join(checkpoint_dir, param_str)
        local_ckpt_path = os.path.join(checkpoint_dir_for_this_trial, 'checkpoint-{epoch}.ckpt')
        callbacks.append(keras.callbacks.ModelCheckpoint(local_ckpt_path, save_weights_only=True))

    model.fit(x_train, y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))

    # Returns [loss, accuracy] evaluated on this worker's test shard.
    return model.evaluate(x_test, y_test)
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sparkdl import HorovodRunner

def train(params):
    """
    An example train method that calls into HorovodRunner.
    This method is passed to hyperopt.fmin().

    :param params: hyperparameters. Its structure is consistent with how the search space is defined. See below.
    :return: dict with fields 'loss' (scalar loss) and 'status' (success/failure status of run)
    """
    hr = HorovodRunner(np=2)
    loss, acc = hr.run(train_hvd,
                       learning_rate=params['learning_rate'],
                       batch_size=params['batch_size'],
                       checkpoint_dir=checkpoint_dir)
    return {'loss': loss, 'status': STATUS_OK}
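The notebook cells that define checkpoint_dir and the Hyperopt search space are not included in this excerpt. A minimal sketch consistent with the checkpoint directory names in the output below; the exact path and learning-rate bounds are assumptions:

# Assumed definitions; the original cells are not shown in this excerpt.
checkpoint_dir = '/dbfs/ml/mnist_demo/train'  # hypothetical DBFS path

space = {
    # Log-uniform learning rate; the bounds are assumed, chosen to cover the
    # values that appear in the checkpoint directory names below.
    'learning_rate': hp.loguniform('learning_rate', -10, -2),
    # The batch sizes match the directory names in the output below.
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
}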
algo = tpe.suggest

best_param = fmin(
    fn=train,
    space=space,
    algo=algo,
    max_evals=8,
    return_argmin=False,
)
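Because fmin is called with return_argmin=False, it returns the winning hyperparameter values themselves rather than the index positions that hp.choice would otherwise produce:

# best_param maps each hyperparameter name to the best value found,
# e.g. the actual batch size rather than its index in the choice list.
print(best_param)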
  0%|          | 0/8 [00:00<?, ?trial/s, best loss=?]
HorovodRunner will stream all training logs to notebook cell output. If there are too many logs, you
can adjust the log level in your train method. Or you can set driver_log_verbosity to
'log_callback_only' and use a HorovodRunner log callback on the first worker to get concise
progress updates.
The global names read or written to by the pickled function are {'num_classes', 'epochs', 'os', 'get_model', 'get_dataset'}.
The pickled object size is 4118 bytes.
### How to enable Horovod Timeline? ###
HorovodRunner has the ability to record the timeline of its activity with Horovod Timeline. To
record a Horovod Timeline, set the `HOROVOD_TIMELINE` environment variable to the location of the
timeline file to be created. You can then open the timeline file using the chrome://tracing
facility of the Chrome browser.
Start training.
[1,0]<stdout>:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1,1]<stdout>:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1,0]<stdout>:Epoch 1/2
[1,1]<stdout>:Epoch 1/2
[1,1]<stdout>:469/469 - 50s - loss: 1.3476 - accuracy: 0.5864 - val_loss: 0.4570 - val_accuracy: 0.8778
[1,0]<stdout>:469/469 - 52s - loss: 1.4570 - accuracy: 0.5450 - val_loss: 0.4637 - val_accuracy: 0.8728
[1,1]<stdout>:Epoch 2/2
[1,0]<stdout>:Epoch 2/2
[1,1]<stdout>:469/469 - 49s - loss: 0.4792 - accuracy: 0.8567 - val_loss: 0.3240 - val_accuracy: 0.9086
[1,1]<stdout>:157/157 [==============================] - 2s 13ms/step - loss: 0.3240 - accuracy: 0.9086
 12%|█▎        | 1/8 [01:54<13:22, 114.60s/trial, best loss: 0.32925495505332947]
[Interleaved per-batch progress bars, TensorFlow startup messages, and the repeated
HorovodRunner banner for the remaining trials are omitted here.]
*** WARNING: skipped 79780 bytes of output ***
 75%|███████▌  | 6/8 [12:59<03:57, 118.82s/trial, best loss: 0.32247650623321533]
[1,1]<stdout>:235/235 - 46s - loss: 1.0055 - accuracy: 0.7176 - val_loss: 0.5386 - val_accuracy: 0.8678
[1,0]<stdout>:235/235 - 48s - loss: 1.1620 - accuracy: 0.6544 - val_loss: 0.5470 - val_accuracy: 0.8632
 88%|████████▊ | 7/8 [13:51<01:55, 115.92s/trial, best loss: 0.32247650623321533]
[1,1]<stdout>:469/469 - 50s - loss: 0.6956 - accuracy: 0.7900 - val_loss: 0.2689 - val_accuracy: 0.9234
[1,0]<stdout>:469/469 - 52s - loss: 0.8458 - accuracy: 0.7414 - val_loss: 0.2670 - val_accuracy: 0.9224
[1,1]<stdout>:469/469 - 50s - loss: 0.2919 - accuracy: 0.9136 - val_loss: 0.2125 - val_accuracy: 0.9414
[1,0]<stdout>:469/469 - 52s - loss: 0.4469 - accuracy: 0.8601 - val_loss: 0.2095 - val_accuracy: 0.9382
100%|██████████| 8/8 [15:46<00:00, 118.30s/trial, best loss: 0.20950095355510712]
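The log above explains how to enable Horovod Timeline. A minimal sketch, assuming a writable DBFS location of your choosing (the path below is hypothetical):

import os

# Per the tip in the log, point HOROVOD_TIMELINE at the file where Horovod
# should record its activity timeline; open the result in chrome://tracing.
os.environ['HOROVOD_TIMELINE'] = '/dbfs/ml/mnist_demo/horovod_timeline.json'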
# Display the contents of the checkpoint directory
os.listdir(checkpoint_dir)
Out[9]: ['learning_rate_0.0004052647862990721_batch_size_32',
'learning_rate_0.0017289936598946995_batch_size_64',
'learning_rate_0.0017562610422950944_batch_size_32',
'learning_rate_0.002643185458178416_batch_size_64',
'learning_rate_0.005611472135413056_batch_size_128',
'learning_rate_0.008228267590068774_batch_size_64',
'learning_rate_0.012489369904197382_batch_size_128',
'learning_rate_0.025488406267934912_batch_size_64']
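Each trial wrote per-epoch weight checkpoints from worker 0 into its own subdirectory. A sketch of restoring one of them; the directory name is taken from the listing above, and 'checkpoint-2.ckpt' assumes Keras's default 1-based epoch numbering in the filename pattern:

# Rebuild the architecture, then load the weights saved for one trial.
model = get_model(num_classes)
model.load_weights(os.path.join(
    checkpoint_dir,
    'learning_rate_0.0004052647862990721_batch_size_32',
    'checkpoint-2.ckpt'))  # weights after the second (final) epoch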
Distributed training with Hyperopt and HorovodRunner

Databricks Runtime for Machine Learning includes Hyperopt, a library for ML hyperparameter tuning in Python, and HorovodRunner, a general API for running distributed deep learning workloads on Databricks using the Horovod framework.

Use case: distributed deep learning workloads in Python for which you want to tune hyperparameters.

In this example notebook

The demo is adapted, with minor adjustments, from the Hyperopt documentation and the HorovodRunner documentation.

This guide consists of the following sections:

Requirements