From eb5502023134e9f924977f4a486a1deb6cc1b869 Mon Sep 17 00:00:00 2001 From: FrancescoPinto Date: Tue, 2 Feb 2021 16:32:44 +0000 Subject: [PATCH] changed keras --- Upgrading_From_Edward_To_Edward2.md | 6 +- edward2/tensorflow/constraints.py | 24 ++-- edward2/tensorflow/constraints_test.py | 2 +- edward2/tensorflow/initializers.py | 60 +++++----- edward2/tensorflow/initializers_test.py | 6 +- edward2/tensorflow/layers/README.md | 36 +++--- .../layers/bayesian_linear_model.py | 2 +- edward2/tensorflow/layers/convolutional.py | 82 +++++++------- .../tensorflow/layers/convolutional_test.py | 24 ++-- edward2/tensorflow/layers/dense.py | 68 +++++------ edward2/tensorflow/layers/dense_test.py | 32 +++--- edward2/tensorflow/layers/discrete_flows.py | 8 +- edward2/tensorflow/layers/embeddings.py | 4 +- edward2/tensorflow/layers/embeddings_test.py | 8 +- edward2/tensorflow/layers/gaussian_process.py | 22 ++-- .../layers/gaussian_process_test.py | 4 +- edward2/tensorflow/layers/heteroscedastic.py | 86 +++++++------- .../tensorflow/layers/heteroscedastic_test.py | 28 ++--- edward2/tensorflow/layers/made.py | 18 +-- edward2/tensorflow/layers/neural_process.py | 24 ++-- .../tensorflow/layers/neural_process_test.py | 2 +- edward2/tensorflow/layers/noise.py | 36 +++--- edward2/tensorflow/layers/normalization.py | 26 ++--- .../tensorflow/layers/normalization_test.py | 4 +- edward2/tensorflow/layers/random_feature.py | 22 ++-- .../tensorflow/layers/random_feature_test.py | 6 +- edward2/tensorflow/layers/recurrent.py | 84 +++++++------- edward2/tensorflow/layers/recurrent_test.py | 16 +-- .../tensorflow/layers/stochastic_output.py | 12 +- .../layers/stochastic_output_test.py | 4 +- edward2/tensorflow/layers/utils.py | 4 +- edward2/tensorflow/layers/utils_test.py | 4 +- edward2/tensorflow/regularizers.py | 38 +++---- edward2/tensorflow/regularizers_test.py | 2 +- .../transformed_random_variable_test.py | 2 +- examples/notebooks/Companion.ipynb | 46 ++++---- .../attentive_uncertainty/attention.py | 26 ++--- .../colabs/2019_09_11_gnp_1d_train.ipynb | 16 +-- .../generalized_neural_process.py | 4 +- experimental/attentive_uncertainty/layers.py | 14 +-- .../attentive_uncertainty/regressor.py | 4 +- experimental/attentive_uncertainty/utils.py | 6 +- .../auxiliary_sampling/compute_metrics.py | 2 +- .../deterministic_baseline/lenet5.py | 22 ++-- .../run_det_training.py | 8 +- experimental/auxiliary_sampling/lenet5.py | 12 +- experimental/auxiliary_sampling/res_net.py | 8 +- .../auxiliary_sampling/run_training.py | 12 +- .../marginalization_mixup/batchensemble.py | 60 +++++----- .../batchensemble_model.py | 46 ++++---- .../marginalization_mixup/deterministic.py | 86 +++++++------- experimental/marginalization_mixup/dropout.py | 28 ++--- .../marginalization_mixup/ensemble_layers.py | 12 +- .../marginalization_mixup/naive_ensembles.py | 106 +++++++++--------- experimental/marginalization_mixup/sngp.py | 32 +++--- .../temperature_scaling.py | 20 ++-- experimental/mimo/cifar.py | 38 +++---- experimental/mimo/cifar_model.py | 26 ++--- experimental/mimo/cifar_model_reg_path.py | 32 +++--- experimental/mimo/cifar_model_test.py | 2 +- experimental/mimo/cifar_reg_path.py | 40 +++---- experimental/mimo/imagenet.py | 34 +++--- experimental/mimo/imagenet_model.py | 48 ++++---- experimental/mimo/imagenet_model_test.py | 2 +- experimental/mimo/layers.py | 2 +- experimental/rank1_bnns/cifar-refined-vi.py | 44 ++++---- experimental/rank1_bnns/cifar.py | 44 ++++---- experimental/rank1_bnns/cifar_model.py | 20 ++-- 
experimental/rank1_bnns/imagenet.py | 56 ++++----- experimental/rank1_bnns/imagenet_model.py | 24 ++-- experimental/rank1_bnns/resnet_cifar_main.py | 44 ++++---- experimental/rank1_bnns/resnet_cifar_model.py | 20 ++-- .../rank1_bnns/resnet_cifar_model_test.py | 2 +- experimental/rank1_bnns/utils.py | 12 +- experimental/sngp/gaussian_process_test.py | 6 +- experimental/sngp/normalization_test.py | 4 +- 76 files changed, 938 insertions(+), 938 deletions(-) diff --git a/Upgrading_From_Edward_To_Edward2.md b/Upgrading_From_Edward_To_Edward2.md index d7024d30..068b1233 100644 --- a/Upgrading_From_Edward_To_Edward2.md +++ b/Upgrading_From_Edward_To_Edward2.md @@ -185,7 +185,7 @@ approximation—another Edward2 program—and apply tracers to write the evidence lower bound (Hinton & Camp, 1993; Jordan, Ghahramani, Jaakkola, & Saul, 1999; Waterhouse, MacKay, & Robinson, 1996). Note we use factory functions (functions which build other functions) for simplicity, but you can also use -`tf.keras.Models` as stateful classes which automatically manage the variables. +`tf.python.keras.Models` as stateful classes which automatically manage the variables. ```python def build_trainable_positive_pointmass(shape, name=None): @@ -259,7 +259,7 @@ def train_step(bag_of_words, step): with writer.default(): tf.summary.scalar("elbo", elbo, step=step) loss = -elbo - optimizer = tf.keras.optimizers.Adam(1e-3) + optimizer = tf.python.keras.optimizers.Adam(1e-3) gradients = tape.gradient(loss, trainable_variables) optimizer.apply_gradients(zip(gradients, trainable_variables)) return loss @@ -375,7 +375,7 @@ observed_statistics, replicated_statistics = ed.ppc( ``` __Edward2__. Build the metric manually or use TensorFlow -abstractions such as `tf.keras.metrics`. +abstractions such as `tf.python.keras.metrics`. ```python # See posterior_predictive built in Variational Inference section. diff --git a/edward2/tensorflow/constraints.py b/edward2/tensorflow/constraints.py index f55306a2..8097370c 100644 --- a/edward2/tensorflow/constraints.py +++ b/edward2/tensorflow/constraints.py @@ -15,7 +15,7 @@ """Constraints. -One subtlety is how Bayesian Layers uses `tf.keras.constraints`. Typically, +One subtlety is how Bayesian Layers uses `tf.python.keras.constraints`. Typically, Keras constraints are used with projected gradient descent, where one performs unconstrained optimization and then applies a projection (the constraint) after each gradient update. 
To stay in line with probabilistic literature, trainable @@ -27,10 +27,10 @@ import tensorflow as tf -class Exp(tf.keras.constraints.Constraint): +class Exp(tf.python.keras.constraints.Constraint): """Exp constraint.""" - def __init__(self, epsilon=tf.keras.backend.epsilon()): + def __init__(self, epsilon=tf.python.keras.backend.epsilon()): self.epsilon = epsilon def __call__(self, w): @@ -40,10 +40,10 @@ def get_config(self): return {'epsilon': self.epsilon} -class Positive(tf.keras.constraints.Constraint): +class Positive(tf.python.keras.constraints.Constraint): """Positive constraint.""" - def __init__(self, epsilon=tf.keras.backend.epsilon()): + def __init__(self, epsilon=tf.python.keras.backend.epsilon()): self.epsilon = epsilon def __call__(self, w): @@ -53,10 +53,10 @@ def get_config(self): return {'epsilon': self.epsilon} -class Softplus(tf.keras.constraints.Constraint): +class Softplus(tf.python.keras.constraints.Constraint): """Softplus constraint.""" - def __init__(self, epsilon=tf.keras.backend.epsilon()): + def __init__(self, epsilon=tf.python.keras.backend.epsilon()): self.epsilon = epsilon def __call__(self, w): @@ -66,7 +66,7 @@ def get_config(self): return {'epsilon': self.epsilon} -# Compatibility aliases, following tf.keras +# Compatibility aliases, following tf.python.keras # pylint: disable=invalid-name exp = Exp @@ -74,15 +74,15 @@ def get_config(self): softplus = Softplus # pylint: enable=invalid-name -# Utility functions, following tf.keras +# Utility functions, following tf.python.keras def serialize(initializer): - return tf.keras.utils.serialize_keras_object(initializer) + return tf.python.keras.utils.serialize_keras_object(initializer) def deserialize(config, custom_objects=None): - return tf.keras.utils.deserialize_keras_object( + return tf.python.keras.utils.deserialize_keras_object( config, module_objects=globals(), custom_objects=custom_objects, @@ -108,4 +108,4 @@ def get(identifier, value=None): pass elif callable(identifier): return identifier - return tf.keras.constraints.get(value) + return tf.python.keras.constraints.get(value) diff --git a/edward2/tensorflow/constraints_test.py b/edward2/tensorflow/constraints_test.py index b349edff..3d32c282 100644 --- a/edward2/tensorflow/constraints_test.py +++ b/edward2/tensorflow/constraints_test.py @@ -37,7 +37,7 @@ def testConstraintsGet(self): self.assertIsInstance(ed.constraints.get('positive'), ed.constraints.Positive) self.assertIsInstance(ed.constraints.get('non_neg'), - tf.keras.constraints.NonNeg) + tf.python.keras.constraints.NonNeg) if __name__ == '__main__': diff --git a/edward2/tensorflow/initializers.py b/edward2/tensorflow/initializers.py index 4493a93e..08ded906 100644 --- a/edward2/tensorflow/initializers.py +++ b/edward2/tensorflow/initializers.py @@ -15,13 +15,13 @@ """Initializers. -This module extends `tf.keras.initializers` with the notion of "trainable -initializers", where initializers to weights and biases in `tf.keras.layers` may +This module extends `tf.python.keras.initializers` with the notion of "trainable +initializers", where initializers to weights and biases in `tf.python.keras.layers` may themselves carry parameters. For example, consider a weight initializer which returns a variational distribution: this is reified as an `ed.RandomVariable` parameterized by `tf.Variables`. -One subtlety is how `tf.keras.constraints` are used on the parameters of +One subtlety is how `tf.python.keras.constraints` are used on the parameters of trainable initializers. 
Typically, Keras constraints are used with projected gradient descent, where one performs unconstrained optimization and then applies a projection (the constraint) after each gradient update. To stay in line with @@ -110,7 +110,7 @@ def _compute_fans(shape): return fan_in, fan_out -class ScaledNormalStdDev(tf.keras.initializers.VarianceScaling): +class ScaledNormalStdDev(tf.python.keras.initializers.VarianceScaling): """Initializer capable of adapting its scale to the shape of weights tensors. This initializes the standard deviation parameter of a Trainable Normal @@ -173,7 +173,7 @@ def __call__(self, shape, dtype=None): dtype=dtype, seed=self.seed) -class TrainableDeterministic(tf.keras.layers.Layer): +class TrainableDeterministic(tf.python.keras.layers.Layer): """Deterministic point-wise initializer with trainable location.""" def __init__(self, @@ -225,13 +225,13 @@ def get_config(self): } -class TrainableHalfCauchy(tf.keras.layers.Layer): +class TrainableHalfCauchy(tf.python.keras.layers.Layer): """Half-Cauchy distribution initializer with trainable parameters.""" def __init__(self, - loc_initializer=tf.keras.initializers.TruncatedNormal( + loc_initializer=tf.python.keras.initializers.TruncatedNormal( stddev=1e-5), - scale_initializer=tf.keras.initializers.TruncatedNormal( + scale_initializer=tf.python.keras.initializers.TruncatedNormal( mean=-3., stddev=0.1), loc_regularizer=None, scale_regularizer=None, @@ -303,13 +303,13 @@ def get_config(self): } -class TrainableCauchy(tf.keras.layers.Layer): +class TrainableCauchy(tf.python.keras.layers.Layer): """Cauchy distribution initializer with trainable parameters.""" def __init__( self, - loc_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-5), - scale_initializer=tf.keras.initializers.TruncatedNormal( + loc_initializer=tf.python.keras.initializers.TruncatedNormal(stddev=1e-5), + scale_initializer=tf.python.keras.initializers.TruncatedNormal( mean=-3., stddev=0.1), loc_regularizer=None, scale_regularizer=None, @@ -374,13 +374,13 @@ def get_config(self): } -class TrainableLogNormal(tf.keras.layers.Layer): +class TrainableLogNormal(tf.python.keras.layers.Layer): """Random log normal op as an initializer with trainable loc and scale.""" def __init__(self, - loc_initializer=tf.keras.initializers.TruncatedNormal( + loc_initializer=tf.python.keras.initializers.TruncatedNormal( stddev=1e-5), - scale_initializer=tf.keras.initializers.TruncatedNormal( + scale_initializer=tf.python.keras.initializers.TruncatedNormal( mean=-3., stddev=0.1), loc_regularizer=None, scale_regularizer=None, @@ -451,13 +451,13 @@ def get_config(self): } -class TrainableNormal(tf.keras.layers.Layer): +class TrainableNormal(tf.python.keras.layers.Layer): """Random normal op as an initializer with trainable mean and stddev.""" def __init__(self, - mean_initializer=tf.keras.initializers.TruncatedNormal( + mean_initializer=tf.python.keras.initializers.TruncatedNormal( stddev=1e-5), - stddev_initializer=tf.keras.initializers.TruncatedNormal( + stddev_initializer=tf.python.keras.initializers.TruncatedNormal( mean=-3., stddev=0.1), mean_regularizer=None, stddev_regularizer=None, @@ -544,7 +544,7 @@ class TrainableHeNormal(TrainableNormal): def __init__(self, seed=None, **kwargs): super(TrainableHeNormal, self).__init__( - mean_initializer=tf.keras.initializers.he_normal(seed), + mean_initializer=tf.python.keras.initializers.he_normal(seed), seed=seed, **kwargs) @@ -570,7 +570,7 @@ class TrainableGlorotNormal(TrainableNormal): def __init__(self, seed=None, **kwargs): 
super(TrainableGlorotNormal, self).__init__( - mean_initializer=tf.keras.initializers.GlorotNormal(seed), + mean_initializer=tf.python.keras.initializers.GlorotNormal(seed), seed=seed, **kwargs) @@ -611,12 +611,12 @@ def build(self, shape, dtype=None): self.built = True -class TrainableNormalFixedStddev(tf.keras.layers.Layer): +class TrainableNormalFixedStddev(tf.python.keras.layers.Layer): """Random normal op as an initializer with trainable mean and fixed stddev.""" def __init__(self, stddev=1., - mean_initializer=tf.keras.initializers.TruncatedNormal( + mean_initializer=tf.python.keras.initializers.TruncatedNormal( stddev=1e-5), mean_regularizer=None, mean_constraint=None, @@ -664,7 +664,7 @@ def get_config(self): } -class RandomSign(tf.keras.initializers.Initializer): +class RandomSign(tf.python.keras.initializers.Initializer): """Initializer that generates tensors initialized to +/- 1. Attributes: @@ -694,12 +694,12 @@ def get_config(self): } -class TrainableMixtureOfDeltas(tf.keras.layers.Layer): +class TrainableMixtureOfDeltas(tf.python.keras.layers.Layer): """Mixture of deltas as an initializer with trainable locations.""" def __init__(self, num_components=5, - loc_initializer=tf.keras.initializers.he_normal(), + loc_initializer=tf.python.keras.initializers.he_normal(), loc_regularizer=None, loc_constraint=None, seed=None, @@ -756,7 +756,7 @@ def get_config(self): } -class OrthogonalRandomFeatures(tf.keras.initializers.Orthogonal): +class OrthogonalRandomFeatures(tf.python.keras.initializers.Orthogonal): """Generates a orthogonal Gaussian matrix for a random feature Dense layer. Generates a 2D matrix of form W = stddev * Q @ S [1], where Q is a random @@ -829,7 +829,7 @@ def get_config(self): config.update(new_config) return config -# Compatibility aliases, following tf.keras +# Compatibility aliases, following tf.python.keras # pylint: disable=invalid-name scaled_normal_std_dev = ScaledNormalStdDev @@ -847,15 +847,15 @@ def get_config(self): orthogonal_random_features = OrthogonalRandomFeatures # pylint: enable=invalid-name -# Utility functions, following tf.keras +# Utility functions, following tf.python.keras def serialize(initializer): - return tf.keras.utils.serialize_keras_object(initializer) + return tf.python.keras.utils.serialize_keras_object(initializer) def deserialize(config, custom_objects=None): - return tf.keras.utils.deserialize_keras_object( + return tf.python.keras.utils.deserialize_keras_object( config, module_objects=globals(), custom_objects=custom_objects, @@ -881,4 +881,4 @@ def get(identifier, value=None): pass elif callable(identifier): return identifier - return tf.keras.initializers.get(value) + return tf.python.keras.initializers.get(value) diff --git a/edward2/tensorflow/initializers_test.py b/edward2/tensorflow/initializers_test.py index fa1993cd..10b3432b 100644 --- a/edward2/tensorflow/initializers_test.py +++ b/edward2/tensorflow/initializers_test.py @@ -136,11 +136,11 @@ def testInitializersGet(self): self.assertIsInstance(ed.initializers.get('trainable_normal'), ed.initializers.TrainableNormal) # This is working correctly, but the test won't pass currently because TF - # isn't consistent (yet). Specifically, tf.keras.initializers.get('zeros') - # returns a certain class while tf.keras.initializers.zeros (or Zeros) + # isn't consistent (yet). Specifically, tf.python.keras.initializers.get('zeros') + # returns a certain class while tf.python.keras.initializers.zeros (or Zeros) # currently returns v2 of that class. 
# self.assertIsInstance(ed.initializers.get('zeros'), - # tf.keras.initializers.Zeros().__class__) + # tf.python.keras.initializers.Zeros().__class__) if __name__ == '__main__': diff --git a/edward2/tensorflow/layers/README.md b/edward2/tensorflow/layers/README.md index 45b11c1c..89828458 100644 --- a/edward2/tensorflow/layers/README.md +++ b/edward2/tensorflow/layers/README.md @@ -54,13 +54,13 @@ features, labels = ... total_dataset_size = ... # Define the model. -model = tf.keras.Sequential([ +model = tf.python.keras.Sequential([ ed.layers.Conv2DFlipout(32, (3, 3), activation='relu'), - tf.keras.layers.MaxPooling2D((2, 2)), + tf.python.keras.layers.MaxPooling2D((2, 2)), ed.layers.Conv2DFlipout(64, (3, 3), activation='relu'), - tf.keras.layers.MaxPooling2D((2, 2)), + tf.python.keras.layers.MaxPooling2D((2, 2)), ed.layers.Conv2DFlipout(64, (3, 3), activation='relu'), - tf.keras.layers.Flatten(), + tf.python.keras.layers.Flatten(), ed.layers.DenseVariationalDropout(64, activation='relu'), ed.layers.DenseVariationalDropout(10), ]) @@ -113,7 +113,7 @@ Gaussian processes represent distributions over functions by specifying the value of the function at different inputs. GPs have the same design points: * __Computing the integral.__ Each estimator is its own Layer. This includes `ed.layers.GaussianProcess` for exact (albeit expensive) integration and `ed.layers.SparseGaussianProcess` for inducing variable approximations. -* __Type Signature.__ For the equivalent deterministic layer, GPs maintain its typical arguments as well as tensor-shaped inputs and outputs. For example, `units` in a Gaussian process layer determine the GP's output dimensionality, where `ed.layers.GaussianProcess(32)` is the Bayesian nonparametric extension of `tf.keras.layers.Dense(32)`. Instead of an `activation` function argument, GP layers have mean and covariance function arguments which default to the zero function and squared exponential kernel respectively. +* __Type Signature.__ For the equivalent deterministic layer, GPs maintain its typical arguments as well as tensor-shaped inputs and outputs. For example, `units` in a Gaussian process layer determine the GP's output dimensionality, where `ed.layers.GaussianProcess(32)` is the Bayesian nonparametric extension of `tf.python.keras.layers.Dense(32)`. Instead of an `activation` function argument, GP layers have mean and covariance function arguments which default to the zero function and squared exponential kernel respectively. * __Distribution regularizers.__ To specify regularizers such as the KL penalty in variational inference, use Keras' `kernel_regularizer` and `bias_regularizer`. See [`ed.regularizers`](https://github.com/google/edward2/blob/master/edward2/tensorflow/regularizers.py) for Bayesian Layers' built-in additions. Here's a snippet of what typical code looks like. We use a 3-layer deep GP @@ -121,8 +121,8 @@ trained with variational inference. ```python # Define the model. -model = tf.keras.Sequential([ - tf.keras.layers.Flatten(), +model = tf.python.keras.Sequential([ + tf.python.keras.layers.Flatten(), ed.layers.SparseGaussianProcess(256, num_inducing=512), ed.layers.SparseGaussianProcess(256, num_inducing=512), ed.layers.SparseGaussianProcess(3, num_inducing=512), @@ -163,18 +163,18 @@ autoencoder. ```python # Define the model. 
-encoder = tf.keras.Sequential([ - tf.keras.layers.Conv2D(128, 5, 1, padding='same', activation='relu'), - tf.keras.layers.Conv2D(128, 5, 2, padding='same', activation='relu'), - tf.keras.layers.Conv2D(512, 7, 1, padding='valid', activation='relu'), +encoder = tf.python.keras.Sequential([ + tf.python.keras.layers.Conv2D(128, 5, 1, padding='same', activation='relu'), + tf.python.keras.layers.Conv2D(128, 5, 2, padding='same', activation='relu'), + tf.python.keras.layers.Conv2D(512, 7, 1, padding='valid', activation='relu'), ed.layers.Normal(name='latent_code'), ]) -decoder = tf.keras.Sequential([ - tf.keras.layers.Conv2DTranspose(256, 7, 1, padding='valid', activation='relu'), - tf.keras.layers.Conv2DTranspose(128, 5, 2, padding='same', activation='relu'), - tf.keras.layers.Conv2DTranspose(128, 5, 1, padding='same', activation='relu'), - tf.keras.layers.Conv2D(3*256, 5, 1, padding='same', activation=None), - tf.keras.layers.Reshape([256, 256, 3, 256]), +decoder = tf.python.keras.Sequential([ + tf.python.keras.layers.Conv2DTranspose(256, 7, 1, padding='valid', activation='relu'), + tf.python.keras.layers.Conv2DTranspose(128, 5, 2, padding='same', activation='relu'), + tf.python.keras.layers.Conv2DTranspose(128, 5, 1, padding='same', activation='relu'), + tf.python.keras.layers.Conv2D(3*256, 5, 1, padding='same', activation=None), + tf.python.keras.layers.Reshape([256, 256, 3, 256]), ed.layers.Categorical(name='image'), ]) @@ -209,7 +209,7 @@ over 64-dimensional sequences. sequence_length, vocab_size = ... # Define the model. -flow = tf.keras.Sequential([ +flow = tf.python.keras.Sequential([ ed.layers.DiscreteAutoregressiveFlow(ed.layers.MADE(vocab_size, hidden_dims=[256, 256])), ed.layers.DiscreteAutoregressiveFlow(ed.layers.MADE(vocab_size, hidden_dims=[256, 256], order='right-to-left')), ed.layers.DiscreteAutoregressiveFlow(ed.layers.MADE(vocab_size, hidden_dims=[256, 256])), diff --git a/edward2/tensorflow/layers/bayesian_linear_model.py b/edward2/tensorflow/layers/bayesian_linear_model.py index 3ecc5a88..d56fd990 100644 --- a/edward2/tensorflow/layers/bayesian_linear_model.py +++ b/edward2/tensorflow/layers/bayesian_linear_model.py @@ -20,7 +20,7 @@ import tensorflow as tf -class BayesianLinearModel(tf.keras.Model): +class BayesianLinearModel(tf.python.keras.Model): r"""Bayesian linear model with standard normal prior over its coefficients. A forward pass computes the mean of the exact predictive distribution diff --git a/edward2/tensorflow/layers/convolutional.py b/edward2/tensorflow/layers/convolutional.py index bbb8ecb6..0364c8c9 100644 --- a/edward2/tensorflow/layers/convolutional.py +++ b/edward2/tensorflow/layers/convolutional.py @@ -30,7 +30,7 @@ @utils.add_weight -class Conv2DReparameterization(tf.keras.layers.Conv2D): +class Conv2DReparameterization(tf.python.keras.layers.Conv2D): """2D convolution layer (e.g. spatial convolution over images). 
The layer computes a variational Bayesian approximation to the distribution @@ -86,9 +86,9 @@ def __init__(self, def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.kernel_initializer, tf.keras.layers.Layer): + if isinstance(self.kernel_initializer, tf.python.keras.layers.Layer): self.kernel = self.kernel_initializer(self.kernel.shape, self.dtype) - if isinstance(self.bias_initializer, tf.keras.layers.Layer): + if isinstance(self.bias_initializer, tf.python.keras.layers.Layer): self.bias = self.bias_initializer(self.bias.shape, self.dtype) def call(self, *args, **kwargs): @@ -98,7 +98,7 @@ def call(self, *args, **kwargs): @utils.add_weight -class Conv1DReparameterization(tf.keras.layers.Conv1D): +class Conv1DReparameterization(tf.python.keras.layers.Conv1D): """1D convolution layer (e.g. temporal convolution over sequences). The layer computes a variational Bayesian approximation to the distribution @@ -154,9 +154,9 @@ def __init__(self, def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.kernel_initializer, tf.keras.layers.Layer): + if isinstance(self.kernel_initializer, tf.python.keras.layers.Layer): self.kernel = self.kernel_initializer(self.kernel.shape, self.dtype) - if isinstance(self.bias_initializer, tf.keras.layers.Layer): + if isinstance(self.bias_initializer, tf.python.keras.layers.Layer): self.bias = self.bias_initializer(self.bias.shape, self.dtype) def call(self, *args, **kwargs): @@ -430,10 +430,10 @@ def build(self, input_shape): def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.local_scale_initializer, tf.keras.layers.Layer): + if isinstance(self.local_scale_initializer, tf.python.keras.layers.Layer): self.local_scale = self.local_scale_initializer(self.local_scale.shape, self.dtype) - if isinstance(self.global_scale_initializer, tf.keras.layers.Layer): + if isinstance(self.global_scale_initializer, tf.python.keras.layers.Layer): self.global_scale = self.global_scale_initializer(self.global_scale.shape, self.dtype) super().call_weights() @@ -497,7 +497,7 @@ def call(self, inputs, training=None): return super().call(inputs) self.call_weights() if training is None: - training = tf.keras.backend.learning_phase() + training = tf.python.keras.backend.learning_phase() if self._convolution_op is None: padding = self.padding if self.padding == 'causal': @@ -519,15 +519,15 @@ def dropped_inputs(): mean = self.kernel.distribution.mean() log_variance = tf.math.log(self.kernel.distribution.variance()) log_alpha = log_variance - tf.math.log(tf.square(mean) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) log_alpha = tf.clip_by_value(log_alpha, -8., 8.) log_variance = log_alpha + tf.math.log(tf.square(mean) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) means = self._convolution_op(inputs, mean) stddevs = tf.sqrt( self._convolution_op(tf.square(inputs), tf.exp(log_variance)) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) if self.use_bias: if self.data_format == 'channels_first': means = tf.nn.bias_add(means, self.bias, data_format='NCHW') @@ -538,7 +538,7 @@ def dropped_inputs(): outputs = self.activation(outputs) return outputs - # Following tf.keras.Dropout, only apply variational dropout if training + # Following tf.python.keras.Dropout, only apply variational dropout if training # flag is True. 
training_value = utils.smart_constant_value(training) if training_value is not None: @@ -552,7 +552,7 @@ def dropped_inputs(): false_fn=lambda: super(Conv2DVariationalDropout, self).call(inputs)) -class Conv2DBatchEnsemble(tf.keras.layers.Conv2D): +class Conv2DBatchEnsemble(tf.python.keras.layers.Conv2D): """A batch ensemble convolutional layer.""" def __init__(self, @@ -598,7 +598,7 @@ def __init__(self, self.ensemble_bias_initializer = initializers.get(bias_initializer) self.ensemble_bias_regularizer = regularizers.get(bias_regularizer) self.ensemble_bias_constraint = constraints.get(bias_constraint) - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.use_ensemble_bias = use_bias def _build_parent(self, input_shape): @@ -707,7 +707,7 @@ def get_config(self): 'ensemble_bias_constraint': constraints.serialize(self.ensemble_bias_constraint), 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, } @@ -727,7 +727,7 @@ def compute_output_shape(self, input_shape): return tf.TensorShape(output_shape) -class Conv1DBatchEnsemble(tf.keras.layers.Conv1D): +class Conv1DBatchEnsemble(tf.python.keras.layers.Conv1D): """A batch ensemble convolutional layer.""" def __init__(self, @@ -771,7 +771,7 @@ def __init__(self, self.ensemble_bias_initializer = initializers.get(bias_initializer) self.ensemble_bias_regularizer = regularizers.get(bias_regularizer) self.ensemble_bias_constraint = constraints.get(bias_constraint) - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.use_ensemble_bias = use_bias def build(self, input_shape): @@ -845,7 +845,7 @@ def get_config(self): 'ensemble_bias_constraint': constraints.serialize(self.ensemble_bias_constraint), 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, } @@ -918,7 +918,7 @@ def build(self, input_shape): self.built = True -class Conv2DHyperBatchEnsemble(tf.keras.layers.Layer): +class Conv2DHyperBatchEnsemble(tf.python.keras.layers.Layer): """Conv2D Hyper-BatchEnsemble layer that self-tunes hyperparameters. * Image of size (height, width, c) @@ -988,7 +988,7 @@ def __init__(self, self.lambda_key_to_index = lambda_key_to_index self.alpha_initializer = initializers.get(alpha_initializer) self.gamma_initializer = initializers.get(gamma_initializer) - self.activation = tf.keras.activations.get(activation) + self.activation = tf.python.keras.activations.get(activation) self.regularize_fast_weights = regularize_fast_weights self.fast_weights_eq_contraint = fast_weights_eq_contraint @@ -1224,7 +1224,7 @@ def get_config(self): 'data_format': self.data_format, 'activation': - tf.keras.activations.serialize(self.activation), + tf.python.keras.activations.serialize(self.activation), 'use_bias': self.use_bias, 'kernel_initializer': @@ -1286,7 +1286,7 @@ def get_lambda(lambdas, lambda_type, layer_name, lambda_key_to_index): @utils.add_weight -class CondConv2D(tf.keras.layers.Conv2D): +class CondConv2D(tf.python.keras.layers.Conv2D): """2D conditional convolution layer (e.g. spatial convolution over images). 
This layer extends the base 2D convolution layer to compute example-dependent @@ -1438,7 +1438,7 @@ def build(self, input_shape): else: self.bias = None - self.input_spec = tf.keras.layers.InputSpec( + self.input_spec = tf.python.keras.layers.InputSpec( ndim=self.rank + 2, axes={channel_axis: input_dim}) self.built = True @@ -1503,7 +1503,7 @@ def _get_padding_op(self): @utils.add_weight -class DepthwiseCondConv2D(tf.keras.layers.DepthwiseConv2D): +class DepthwiseCondConv2D(tf.python.keras.layers.DepthwiseConv2D): """Depthwise separable 2D conditional convolution layer. This layer extends the base depthwise 2D convolution layer to compute @@ -1656,7 +1656,7 @@ def build(self, input_shape): else: self.bias = None # Set input spec. - self.input_spec = tf.keras.layers.InputSpec( + self.input_spec = tf.python.keras.layers.InputSpec( ndim=4, axes={channel_axis: input_dim}) self.built = True @@ -1711,7 +1711,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) -class DepthwiseConv2DBatchEnsemble(tf.keras.layers.DepthwiseConv2D): +class DepthwiseConv2DBatchEnsemble(tf.python.keras.layers.DepthwiseConv2D): """Batch ensemble of depthwise separable 2D convolutions.""" def __init__(self, @@ -1755,7 +1755,7 @@ def __init__(self, self.ensemble_bias_initializer = initializers.get(bias_initializer) self.ensemble_bias_regularizer = regularizers.get(bias_regularizer) self.ensemble_bias_constraint = constraints.get(bias_constraint) - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.use_ensemble_bias = use_bias def build(self, input_shape): @@ -1835,7 +1835,7 @@ def get_config(self): 'ensemble_bias_constraint': constraints.serialize(self.ensemble_bias_constraint), 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, } @@ -1845,7 +1845,7 @@ def get_config(self): @utils.add_weight -class Conv1DRank1(tf.keras.layers.Conv1D): +class Conv1DRank1(tf.python.keras.layers.Conv1D): """A rank-1 Bayesian NN 1D convolution layer (Dusenberry et al., 2020). The argument ensemble_size selects the number of mixture components over all @@ -1902,7 +1902,7 @@ def __init__(self, activity_regularizer=activity_regularizer, kernel_constraint=kernel_constraint, **kwargs) - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.use_ensemble_bias = use_bias self.alpha_initializer = initializers.get(alpha_initializer) self.gamma_initializer = initializers.get(gamma_initializer) @@ -1967,7 +1967,7 @@ def call(self, inputs): examples_per_model = batch_size // self.ensemble_size # Sample parameters for each example. 
- if isinstance(self.alpha_initializer, tf.keras.layers.Layer): + if isinstance(self.alpha_initializer, tf.python.keras.layers.Layer): alpha = tf.clip_by_value( self.alpha_initializer( self.alpha_shape, @@ -1977,7 +1977,7 @@ def call(self, inputs): alpha = tf.transpose(alpha, [1, 0, 2]) else: alpha = tf.tile(self.alpha, [1, examples_per_model]) - if isinstance(self.gamma_initializer, tf.keras.layers.Layer): + if isinstance(self.gamma_initializer, tf.python.keras.layers.Layer): gamma = tf.clip_by_value( self.gamma_initializer( self.gamma_shape, @@ -1999,7 +1999,7 @@ def call(self, inputs): outputs = super().call(inputs * alpha) * gamma if self.use_ensemble_bias: - if isinstance(self.ensemble_bias_initializer, tf.keras.layers.Layer): + if isinstance(self.ensemble_bias_initializer, tf.python.keras.layers.Layer): bias = self.ensemble_bias_initializer( self.ensemble_bias_shape, self.dtype).distribution.sample(examples_per_model) @@ -2016,7 +2016,7 @@ def call(self, inputs): def get_config(self): config = { 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, 'alpha_initializer': @@ -2048,7 +2048,7 @@ def get_config(self): @utils.add_weight -class Conv2DRank1(tf.keras.layers.Conv2D): +class Conv2DRank1(tf.python.keras.layers.Conv2D): """A rank-1 Bayesian NN 2D convolution layer (Dusenberry et al., 2020). The argument ensemble_size selects the number of mixture components over all @@ -2107,7 +2107,7 @@ def __init__(self, kernel_constraint=kernel_constraint, bias_constraint=None, **kwargs) - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.use_ensemble_bias = use_bias self.alpha_initializer = initializers.get(alpha_initializer) self.gamma_initializer = initializers.get(gamma_initializer) @@ -2172,7 +2172,7 @@ def call(self, inputs): examples_per_model = batch_size // self.ensemble_size # Sample parameters for each example. 
- if isinstance(self.alpha_initializer, tf.keras.layers.Layer): + if isinstance(self.alpha_initializer, tf.python.keras.layers.Layer): alpha = tf.clip_by_value( self.alpha_initializer( self.alpha_shape, @@ -2182,7 +2182,7 @@ def call(self, inputs): alpha = tf.transpose(alpha, [1, 0, 2]) else: alpha = tf.tile(self.alpha, [1, examples_per_model]) - if isinstance(self.gamma_initializer, tf.keras.layers.Layer): + if isinstance(self.gamma_initializer, tf.python.keras.layers.Layer): gamma = tf.clip_by_value( self.gamma_initializer( self.gamma_shape, @@ -2206,7 +2206,7 @@ def call(self, inputs): outputs = super().call(inputs * alpha) * gamma if self.use_ensemble_bias: - if isinstance(self.ensemble_bias_initializer, tf.keras.layers.Layer): + if isinstance(self.ensemble_bias_initializer, tf.python.keras.layers.Layer): bias = self.ensemble_bias_initializer( self.ensemble_bias_shape, self.dtype).distribution.sample(examples_per_model) @@ -2225,7 +2225,7 @@ def call(self, inputs): def get_config(self): config = { 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, 'alpha_initializer': diff --git a/edward2/tensorflow/layers/convolutional_test.py b/edward2/tensorflow/layers/convolutional_test.py index cfca8d29..459a546c 100644 --- a/edward2/tensorflow/layers/convolutional_test.py +++ b/edward2/tensorflow/layers/convolutional_test.py @@ -79,7 +79,7 @@ def testConv2DKernel(self, kernel_initializer, bias_initializer, all_close): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time inputs = np.random.rand(5, 4, 4, 12).astype(np.float32) model = layer(4, kernel_size=2, @@ -105,10 +105,10 @@ def testConv2DKernel(self, ) def testConv2DModel(self, layer): inputs = np.random.rand(3, 4, 4, 1).astype(np.float32) - model = tf.keras.Sequential([ + model = tf.python.keras.Sequential([ layer(3, kernel_size=2, padding="SAME", activation="relu"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(2, activation=None), + tf.python.keras.layers.Flatten(), + tf.python.keras.layers.Dense(2, activation=None), ]) outputs = model(inputs, training=True) self.assertEqual(outputs.shape, (3, 2)) @@ -147,7 +147,7 @@ def testConv2DModel(self, layer): ) def testConv1DKernel(self, layer, kernel_initializer, bias_initializer, all_close): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time inputs = np.random.rand(5, 4, 12).astype(np.float32) model = layer( 4, @@ -172,10 +172,10 @@ def testConv1DKernel(self, layer, kernel_initializer, bias_initializer, ) def testConv1DModel(self, layer): inputs = np.random.rand(3, 4, 1).astype(np.float32) - model = tf.keras.Sequential([ + model = tf.python.keras.Sequential([ layer(3, kernel_size=2, padding="SAME", activation="relu"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(2, activation=None), + tf.python.keras.layers.Flatten(), + tf.python.keras.layers.Dense(2, activation=None), ]) outputs = model(inputs, training=True) self.assertEqual(outputs.shape, (3, 2)) @@ -265,7 +265,7 @@ def testConv1DBatchEnsemble(self): "gamma_initializer": "trainable_deterministic"}, ) def testConv2DRank1BatchEnsemble(self, alpha_initializer, gamma_initializer): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time ensemble_size = 3 examples_per_model = 4 input_dim = 5 @@ 
-467,7 +467,7 @@ def testConv2DRank1AlphaGamma(self, all_close, use_additive_perturbation, ensemble_size): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time inputs = np.random.rand(5*ensemble_size, 4, 4, 12).astype(np.float32) model = ed.layers.Conv2DRank1( 4, @@ -491,7 +491,7 @@ def testConv2DRank1AlphaGamma(self, "gamma_initializer": "trainable_deterministic"}, ) def testConv1DRank1BatchEnsemble(self, alpha_initializer, gamma_initializer): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time ensemble_size = 3 examples_per_model = 4 input_dim = 5 @@ -592,7 +592,7 @@ def testConv1DRank1AlphaGamma(self, all_close, use_additive_perturbation, ensemble_size): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time inputs = np.random.rand(5*ensemble_size, 4, 12).astype(np.float32) model = ed.layers.Conv1DRank1( 4, diff --git a/edward2/tensorflow/layers/dense.py b/edward2/tensorflow/layers/dense.py index c61df373..142d0e70 100644 --- a/edward2/tensorflow/layers/dense.py +++ b/edward2/tensorflow/layers/dense.py @@ -30,7 +30,7 @@ @utils.add_weight -class DenseReparameterization(tf.keras.layers.Dense): +class DenseReparameterization(tf.python.keras.layers.Dense): """Bayesian densely-connected layer estimated via reparameterization. The layer computes a variational Bayesian approximation to the distribution @@ -72,9 +72,9 @@ def __init__(self, def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.kernel_initializer, tf.keras.layers.Layer): + if isinstance(self.kernel_initializer, tf.python.keras.layers.Layer): self.kernel = self.kernel_initializer(self.kernel.shape, self.dtype) - if isinstance(self.bias_initializer, tf.keras.layers.Layer): + if isinstance(self.bias_initializer, tf.python.keras.layers.Layer): self.bias = self.bias_initializer(self.bias.shape, self.dtype) def call(self, *args, **kwargs): @@ -103,7 +103,7 @@ class DenseDVI(DenseReparameterization): respectively. ```python - model = tf.keras.Sequential([ + model = tf.python.keras.Sequential([ ed.layers.DenseDVI(256, activation=tf.nn.relu), ed.layers.DenseDVI(256, activation=tf.nn.relu), ed.layers.DenseDVI(1, activation=None), @@ -162,21 +162,21 @@ def call(self, inputs): covariance = tf.linalg.set_diag( covariance, tf.linalg.diag_part(covariance) + covariance_diag) - if self.activation in (tf.keras.activations.relu, tf.nn.relu): + if self.activation in (tf.python.keras.activations.relu, tf.nn.relu): # Compute activation's moments with variable names from Wu et al. (2018). variance = tf.linalg.diag_part(covariance) scale = tf.sqrt(variance) - mu = mean / (scale + tf.keras.backend.epsilon()) + mu = mean / (scale + tf.python.keras.backend.epsilon()) mean = scale * soft_relu(mu) pairwise_variances = (tf.expand_dims(variance, -1) * tf.expand_dims(variance, -2)) # [..., units, units] rho = covariance / tf.sqrt(pairwise_variances + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) rho = tf.clip_by_value(rho, - -1. / (1. + tf.keras.backend.epsilon()), - 1. / (1. + tf.keras.backend.epsilon())) - s = covariance / (rho + tf.keras.backend.epsilon()) + -1. / (1. + tf.python.keras.backend.epsilon()), + 1. / (1. 
+ tf.python.keras.backend.epsilon())) + s = covariance / (rho + tf.python.keras.backend.epsilon()) mu1 = tf.expand_dims(mu, -1) # [..., units, 1] mu2 = tf.linalg.matrix_transpose(mu1) # [..., 1, units] a = (soft_relu(mu1) * soft_relu(mu2) + @@ -187,13 +187,13 @@ def call(self, inputs): gr = gh + rho / (1. + bar_rho) # Include numerically stable versions of gr and rho when multiplying or # dividing them. The sign of gr*rho and rho/gr is always positive. - safe_gr = tf.abs(gr) + 0.5 * tf.keras.backend.epsilon() - safe_rho = tf.abs(rho) + tf.keras.backend.epsilon() + safe_gr = tf.abs(gr) + 0.5 * tf.python.keras.backend.epsilon() + safe_rho = tf.abs(rho) + tf.python.keras.backend.epsilon() exp_negative_q = gr / (2. * math.pi) * tf.exp( -safe_rho / (2. * safe_gr * (1 + bar_rho)) + (gh - rho) / (safe_gr * safe_rho) * mu1 * mu2) covariance = s * (a + exp_negative_q) - elif self.activation not in (tf.keras.activations.linear, None): + elif self.activation not in (tf.python.keras.activations.linear, None): raise NotImplementedError('Activation is {}. Deterministic variational ' 'inference is only available if activation is ' 'ReLU or None.'.format(self.activation)) @@ -318,7 +318,7 @@ def call(self, inputs, training=None): return super().call(inputs) self.call_weights() if training is None: - training = tf.keras.backend.learning_phase() + training = tf.python.keras.backend.learning_phase() def dropped_inputs(): """Forward pass with dropout.""" @@ -328,21 +328,21 @@ def dropped_inputs(): mean = self.kernel.distribution.mean() log_variance = tf.math.log(self.kernel.distribution.variance()) log_alpha = log_variance - tf.math.log(tf.square(mean) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) log_alpha = tf.clip_by_value(log_alpha, -8., 8.) log_variance = log_alpha + tf.math.log(tf.square(mean) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) if inputs.shape.ndims <= 2: means = tf.matmul(inputs, mean) stddevs = tf.sqrt( tf.matmul(tf.square(inputs), tf.exp(log_variance)) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) else: means = tf.tensordot(inputs, mean, [[-1], [0]]) stddevs = tf.sqrt( tf.tensordot(tf.square(inputs), tf.exp(log_variance), [[-1], [0]]) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) if self.use_bias: means = tf.nn.bias_add(means, self.bias) outputs = generated_random_variables.Normal(loc=means, scale=stddevs) @@ -350,7 +350,7 @@ def dropped_inputs(): outputs = self.activation(outputs) return outputs - # Following tf.keras.Dropout, only apply variational dropout if training + # Following tf.python.keras.Dropout, only apply variational dropout if training # flag is True. 
training_value = utils.smart_constant_value(training) if training_value is not None: @@ -451,10 +451,10 @@ def build(self, input_shape): def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.local_scale_initializer, tf.keras.layers.Layer): + if isinstance(self.local_scale_initializer, tf.python.keras.layers.Layer): self.local_scale = self.local_scale_initializer(self.local_scale.shape, self.dtype) - if isinstance(self.global_scale_initializer, tf.keras.layers.Layer): + if isinstance(self.global_scale_initializer, tf.python.keras.layers.Layer): self.global_scale = self.global_scale_initializer(self.global_scale.shape, self.dtype) super().call_weights() @@ -467,7 +467,7 @@ def call(self, inputs, training=None): return super().call(inputs, training=training) -class DenseBatchEnsemble(tf.keras.layers.Dense): +class DenseBatchEnsemble(tf.python.keras.layers.Dense): """A batch ensemble dense layer.""" def __init__(self, @@ -503,7 +503,7 @@ def __init__(self, self.alpha_initializer = initializers.get(alpha_initializer) self.gamma_initializer = initializers.get(gamma_initializer) self.use_ensemble_bias = use_bias - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.ensemble_bias_initializer = initializers.get(bias_initializer) self.ensemble_bias_regularizer = regularizers.get(bias_regularizer) self.ensemble_bias_constraint = constraints.get(bias_constraint) @@ -585,7 +585,7 @@ def get_config(self): 'ensemble_size': self.ensemble_size, 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, 'alpha_initializer': @@ -659,7 +659,7 @@ def build(self, input_shape): self.built = True -class DenseHyperBatchEnsemble(tf.keras.layers.Layer): +class DenseHyperBatchEnsemble(tf.python.keras.layers.Layer): """Dense Hyper-BatchEnsemble layer that self-tunes hyperparameters. * W, W' of size (d_in, d_out) @@ -724,7 +724,7 @@ def __init__(self, self.lambda_key_to_index = lambda_key_to_index self.alpha_initializer = initializers.get(alpha_initializer) self.gamma_initializer = initializers.get(gamma_initializer) - self.activation = tf.keras.activations.get(activation) + self.activation = tf.python.keras.activations.get(activation) self.regularize_fast_weights = regularize_fast_weights self.fast_weights_eq_contraint = fast_weights_eq_contraint @@ -919,7 +919,7 @@ def get_config(self): 'ensemble_size': self.ensemble_size, 'activation': - tf.keras.activations.serialize(self.activation), + tf.python.keras.activations.serialize(self.activation), 'use_bias': self.use_bias, 'alpha_initializer': @@ -985,7 +985,7 @@ def get_lambda(lambdas, lambda_type, layer_name, lambda_key_to_index): @utils.add_weight -class DenseRank1(tf.keras.layers.Dense): +class DenseRank1(tf.python.keras.layers.Dense): """A rank-1 Bayesian neural net dense layer (Dusenberry et al., 2020). 
The argument ensemble_size selects the number of mixture components over all @@ -1036,7 +1036,7 @@ def __init__(self, bias_constraint=None, **kwargs) self.units = units - self.ensemble_activation = tf.keras.activations.get(activation) + self.ensemble_activation = tf.python.keras.activations.get(activation) self.use_ensemble_bias = use_bias self.alpha_initializer = initializers.get(alpha_initializer) self.gamma_initializer = initializers.get(gamma_initializer) @@ -1099,7 +1099,7 @@ def call(self, inputs): inputs, [self.ensemble_size, examples_per_model, input_dim]) # Sample parameters for each example. - if isinstance(self.alpha_initializer, tf.keras.layers.Layer): + if isinstance(self.alpha_initializer, tf.python.keras.layers.Layer): alpha = tf.clip_by_value( self.alpha_initializer( self.alpha_shape, @@ -1109,7 +1109,7 @@ def call(self, inputs): alpha = tf.transpose(alpha, [1, 0, 2]) else: alpha = tf.expand_dims(self.alpha, 1) - if isinstance(self.gamma_initializer, tf.keras.layers.Layer): + if isinstance(self.gamma_initializer, tf.python.keras.layers.Layer): gamma = tf.clip_by_value( self.gamma_initializer( self.gamma_shape, @@ -1126,7 +1126,7 @@ def call(self, inputs): outputs = super().call(inputs * alpha) * gamma if self.use_ensemble_bias: - if isinstance(self.ensemble_bias_initializer, tf.keras.layers.Layer): + if isinstance(self.ensemble_bias_initializer, tf.python.keras.layers.Layer): bias = self.ensemble_bias_initializer( self.ensemble_bias_shape, self.dtype).distribution.sample(examples_per_model) @@ -1143,7 +1143,7 @@ def call(self, inputs): def get_config(self): config = { 'ensemble_activation': - tf.keras.activations.serialize(self.ensemble_activation), + tf.python.keras.activations.serialize(self.ensemble_activation), 'use_ensemble_bias': self.use_ensemble_bias, 'alpha_initializer': @@ -1173,7 +1173,7 @@ def get_config(self): @utils.add_weight -class CondDense(tf.keras.layers.Dense): +class CondDense(tf.python.keras.layers.Dense): """Conditional dense layer. 
This layer extends the base dense layer to compute example-dependent diff --git a/edward2/tensorflow/layers/dense_test.py b/edward2/tensorflow/layers/dense_test.py index 99727a11..7194b4b4 100644 --- a/edward2/tensorflow/layers/dense_test.py +++ b/edward2/tensorflow/layers/dense_test.py @@ -87,7 +87,7 @@ def testDenseKernel(self, kernel_initializer, bias_initializer, all_close): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time inputs = np.random.rand(5, 3, 12).astype(np.float32) model = layer(4, kernel_initializer=kernel_initializer, @@ -112,7 +112,7 @@ def testDenseKernel(self, ) def testDenseMean(self, layer): """Tests that forward pass can use other values, e.g., posterior mean.""" - tf.keras.backend.set_learning_phase(0) # test time + tf.python.keras.backend.set_learning_phase(0) # test time def take_mean(f, *args, **kwargs): """Sets random variable value to its mean.""" rv = f(*args, **kwargs) @@ -136,7 +136,7 @@ def take_mean(f, *args, **kwargs): {"layer": ed.layers.DenseHierarchical}, ) def testDenseLoss(self, layer): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time features = np.random.rand(5, 12).astype(np.float32) labels = np.random.rand(5, 10).astype(np.float32) model = layer(10) @@ -145,7 +145,7 @@ def testDenseLoss(self, layer): with tf.GradientTape(persistent=True) as tape: predictions = model(features) # first call forces build model(features) # ensure robustness after multiple calls - nll = tf.keras.losses.mean_squared_error(labels, predictions) + nll = tf.python.keras.losses.mean_squared_error(labels, predictions) kl = sum(model.losses) variables = [model.kernel_initializer.mean, model.kernel_initializer.stddev] @@ -166,7 +166,7 @@ def testDenseLoss(self, layer): # Imagine this is the 2nd epoch. 
with tf.GradientTape(persistent=True) as tape: predictions = model(features) # build is not called - nll = tf.keras.losses.mean_squared_error(labels, predictions) + nll = tf.python.keras.losses.mean_squared_error(labels, predictions) kl = sum(model.losses) variables = [model.kernel_initializer.mean, model.kernel_initializer.stddev] @@ -195,12 +195,12 @@ def testDenseLoss(self, layer): ) def testDenseModel(self, layer): inputs = np.random.rand(3, 4, 4, 1).astype(np.float32) - model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(3, + model = tf.python.keras.Sequential([ + tf.python.keras.layers.Conv2D(3, kernel_size=2, padding="SAME", activation=tf.nn.relu), - tf.keras.layers.Flatten(), + tf.python.keras.layers.Flatten(), layer(2, activation=None), ]) outputs = model(inputs, training=True) @@ -224,12 +224,12 @@ class DenseSubclass(layer): pass inputs = np.random.rand(3, 4, 4, 1).astype(np.float32) - model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(3, + model = tf.python.keras.Sequential([ + tf.python.keras.layers.Conv2D(3, kernel_size=2, padding="SAME", activation=tf.nn.relu), - tf.keras.layers.Flatten(), + tf.python.keras.layers.Flatten(), DenseSubclass(2, activation=None), ]) outputs = model(inputs, training=True) @@ -243,7 +243,7 @@ def testDenseDVIIsDeterministic(self): """Tests that DenseDVI network has a deterministic loss function.""" features = np.random.rand(3, 2).astype(np.float32) labels = np.random.rand(3, 1).astype(np.float32) - model = tf.keras.Sequential([ + model = tf.python.keras.Sequential([ ed.layers.DenseDVI(5, activation=tf.nn.relu), ed.layers.DenseDVI(1, activation=None), ]) @@ -288,7 +288,7 @@ def testDenseDVIMoments(self): def testDenseBatchEnsemble(self): """Tests that vectorized implementation is same as for loop.""" - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time ensemble_size = 3 examples_per_model = 4 input_dim = 5 @@ -431,7 +431,7 @@ def testDenseRank1BatchEnsemble(self, alpha_initializer, gamma_initializer, bias_initializer): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time ensemble_size = 3 examples_per_model = 4 input_dim = 5 @@ -532,7 +532,7 @@ def testDenseRank1AlphaGamma(self, all_close, use_additive_perturbation, ensemble_size): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time inputs = np.random.rand(5*ensemble_size, 12).astype(np.float32) model = ed.layers.DenseRank1( 4, @@ -550,7 +550,7 @@ def testDenseRank1AlphaGamma(self, model.get_config() def testCondConv(self): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time features = np.random.rand(5, 12).astype(np.float32) routing_weights = np.random.rand(5, 3).astype(np.float32) model = ed.layers.CondDense(10, num_experts=3) diff --git a/edward2/tensorflow/layers/discrete_flows.py b/edward2/tensorflow/layers/discrete_flows.py index 6deda96e..f06b84bb 100644 --- a/edward2/tensorflow/layers/discrete_flows.py +++ b/edward2/tensorflow/layers/discrete_flows.py @@ -22,7 +22,7 @@ # TODO(trandustin): Move Reverse to another module(?). 
-class Reverse(tf.keras.layers.Layer): +class Reverse(tf.python.keras.layers.Layer): """Swaps the forward and reverse transformations of a layer.""" def __init__(self, reversible_layer, **kwargs): @@ -34,7 +34,7 @@ def __init__(self, reversible_layer, **kwargs): self.reverse = reversible_layer.call -class DiscreteAutoregressiveFlow(tf.keras.layers.Layer): +class DiscreteAutoregressiveFlow(tf.python.keras.layers.Layer): """A discrete reversible layer. The flow takes as input a one-hot Tensor of shape `[..., length, vocab_size]`. @@ -238,7 +238,7 @@ def log_det_jacobian(self, inputs): return tf.cast(0, inputs.dtype) -class DiscreteBipartiteFlow(tf.keras.layers.Layer): +class DiscreteBipartiteFlow(tf.python.keras.layers.Layer): """A discrete reversible layer. The flow takes as input a one-hot Tensor of shape `[..., length, vocab_size]`. @@ -370,7 +370,7 @@ def log_det_jacobian(self, inputs): return tf.cast(0, inputs.dtype) -class SinkhornAutoregressiveFlow(tf.keras.layers.Layer): +class SinkhornAutoregressiveFlow(tf.python.keras.layers.Layer): """A discrete reversible layer using Sinkhorn normalization for permutations. The flow takes as input a one-hot Tensor of shape `[..., length, vocab_size]`. diff --git a/edward2/tensorflow/layers/embeddings.py b/edward2/tensorflow/layers/embeddings.py index 4f66e7b8..44b8e13c 100644 --- a/edward2/tensorflow/layers/embeddings.py +++ b/edward2/tensorflow/layers/embeddings.py @@ -24,7 +24,7 @@ @utils.add_weight -class EmbeddingReparameterization(tf.keras.layers.Embedding): +class EmbeddingReparameterization(tf.python.keras.layers.Embedding): """Bayesian embedding layer estimated via reparameterization. The layer computes a variational Bayesian approximation to the distribution @@ -91,7 +91,7 @@ def __init__(self, def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.embeddings_initializer, tf.keras.layers.Layer): + if isinstance(self.embeddings_initializer, tf.python.keras.layers.Layer): self.embeddings = self.embeddings_initializer(self.embeddings.shape, self.dtype) diff --git a/edward2/tensorflow/layers/embeddings_test.py b/edward2/tensorflow/layers/embeddings_test.py index 48712932..2a0b1a17 100644 --- a/edward2/tensorflow/layers/embeddings_test.py +++ b/edward2/tensorflow/layers/embeddings_test.py @@ -64,15 +64,15 @@ def testEmbedding(self, embeddings_initializer, all_close): def testEmbeddingModel(self, embeddings_initializer, embeddings_regularizer, all_close, num_losses): model_output_dim = 2 - model = tf.keras.Sequential([ + model = tf.python.keras.Sequential([ ed.layers.EmbeddingReparameterization( self.input_dim, output_dim=self.output_dim, embeddings_initializer=embeddings_initializer, embeddings_regularizer=embeddings_regularizer), - tf.keras.layers.RNN(tf.keras.layers.LSTMCell(5)), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(model_output_dim), + tf.python.keras.layers.RNN(tf.python.keras.layers.LSTMCell(5)), + tf.python.keras.layers.Flatten(), + tf.python.keras.layers.Dense(model_output_dim), ]) outputs1 = model(self.inputs, training=True) outputs2 = model(self.inputs, training=True) diff --git a/edward2/tensorflow/layers/gaussian_process.py b/edward2/tensorflow/layers/gaussian_process.py index a2180bf8..fe0deab1 100644 --- a/edward2/tensorflow/layers/gaussian_process.py +++ b/edward2/tensorflow/layers/gaussian_process.py @@ -104,11 +104,11 @@ def get_config(self): return { 'variance': self.variance, 'bias': self.bias, - 'encoder': 
tf.keras.utils.serialize_keras_object(self.encoder), + 'encoder': tf.python.keras.utils.serialize_keras_object(self.encoder), } -class GaussianProcess(tf.keras.layers.Layer): +class GaussianProcess(tf.python.keras.layers.Layer): r"""Gaussian process layer. The layer represents a distribution over functions, where a @@ -170,7 +170,7 @@ def __init__( self.conditional_outputs = conditional_outputs self.supports_masking = True - self.input_spec = tf.keras.layers.InputSpec(min_ndim=2) + self.input_spec = tf.python.keras.layers.InputSpec(min_ndim=2) def build(self, input_shape=None): # Don't track trainable variables such as in the kernel. The user should @@ -190,7 +190,7 @@ def call(self, inputs): knm = self.covariance_fn(inputs, self.conditional_inputs) kmm = self.covariance_fn(self.conditional_inputs, self.conditional_inputs) kmm = tf.linalg.set_diag( - kmm, tf.linalg.diag_part(kmm) + tf.keras.backend.epsilon()) + kmm, tf.linalg.diag_part(kmm) + tf.python.keras.backend.epsilon()) kmm_tril = tf.linalg.cholesky(kmm) kmm_tril_operator = tf.linalg.LinearOperatorLowerTriangular(kmm_tril) knm_operator = tf.linalg.LinearOperatorFullMatrix(knm) @@ -216,7 +216,7 @@ def call(self, inputs): covariance_matrix = tf.linalg.set_diag( covariance_matrix, - tf.linalg.diag_part(covariance_matrix) + tf.keras.backend.epsilon()) + tf.linalg.diag_part(covariance_matrix) + tf.python.keras.backend.epsilon()) # Form a multivariate normal random variable with batch_shape units and # event_shape batch_size. Then make it be independent across the units @@ -250,8 +250,8 @@ def compute_output_shape(self, input_shape): def get_config(self): config = { 'units': self.units, - 'mean_fn': tf.keras.utils.serialize_keras_object(self.mean_fn), - 'covariance_fn': tf.keras.utils.serialize_keras_object( + 'mean_fn': tf.python.keras.utils.serialize_keras_object(self.mean_fn), + 'covariance_fn': tf.python.keras.utils.serialize_keras_object( self.covariance_fn), 'conditional_inputs': None, # don't serialize as it can be large 'conditional_outputs': None, # don't serialize as it can be large @@ -310,8 +310,8 @@ class SparseGaussianProcess(GaussianProcess): dataset_size = 10000 features, labels = load_spatial_data(batch_size) - model = tf.keras.Sequential([ - tf.keras.layers.Flatten(), + model = tf.python.keras.Sequential([ + tf.python.keras.layers.Flatten(), layers.SparseGaussianProcess(256, num_inducing=512), layers.SparseGaussianProcess(256, num_inducing=512), layers.SparseGaussianProcess(10, num_inducing=512), @@ -404,10 +404,10 @@ def build(self, input_shape=None): def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.inducing_inputs_initializer, tf.keras.layers.Layer): + if isinstance(self.inducing_inputs_initializer, tf.python.keras.layers.Layer): self.conditional_inputs = self.inducing_inputs_initializer( self.conditional_inputs.shape, self.dtype) - if isinstance(self.inducing_outputs_initializer, tf.keras.layers.Layer): + if isinstance(self.inducing_outputs_initializer, tf.python.keras.layers.Layer): self.conditional_outputs = self.inducing_outputs_initializer( self.conditional_outputs.shape, self.dtype) diff --git a/edward2/tensorflow/layers/gaussian_process_test.py b/edward2/tensorflow/layers/gaussian_process_test.py index 573e8c09..3f7dd65b 100644 --- a/edward2/tensorflow/layers/gaussian_process_test.py +++ b/edward2/tensorflow/layers/gaussian_process_test.py @@ -48,8 +48,8 @@ def testGaussianProcessPrior(self): output_dim = 5 features = np.random.rand(batch_size, 
input_dim).astype(np.float32) labels = np.random.rand(batch_size, output_dim).astype(np.float32) - model = tf.keras.Sequential([ - tf.keras.layers.Dense(2, activation=None), + model = tf.python.keras.Sequential([ + tf.python.keras.layers.Dense(2, activation=None), ed.layers.GaussianProcess(output_dim), ]) outputs = model(features) diff --git a/edward2/tensorflow/layers/heteroscedastic.py b/edward2/tensorflow/layers/heteroscedastic.py index 2b1b728c..9f4bffc8 100644 --- a/edward2/tensorflow/layers/heteroscedastic.py +++ b/edward2/tensorflow/layers/heteroscedastic.py @@ -22,7 +22,7 @@ MIN_SCALE_MONTE_CARLO = 1e-3 -class MCSoftmaxOutputLayerBase(tf.keras.layers.Layer): +class MCSoftmaxOutputLayerBase(tf.python.keras.layers.Layer): """Base class for MC heteroscesastic output layers. Collier, M., Mustafa, B., Kokiopoulou, E., Jenatton, R., & Berent, J. (2020). @@ -55,7 +55,7 @@ def __init__(self, num_classes, logit_noise=tfp.distributions.Normal, due to dynamic shape inference, setting = True may solve. logits_only: Boolean. If True, only return the logits from the __call__ method. Useful when a single output Tensor is required e.g. - tf.keras.Sequential models require a single output Tensor. + tf.python.keras.Sequential models require a single output Tensor. eps: Float. Clip probabilities into [eps, 1.0] softmax or [eps, 1.0 - eps] sigmoid before applying log (softmax), or inverse sigmoid. @@ -305,10 +305,10 @@ def __init__(self, num_classes, logit_noise=tfp.distributions.Normal, """Creates an instance of MCSoftmaxDense. This is a MC softmax heteroscedastic drop in replacement for a - tf.keras.layers.Dense output layer. e.g. simply change: + tf.python.keras.layers.Dense output layer. e.g. simply change: ```python - logits = tf.keras.layers.Dense(...)(x) + logits = tf.python.keras.layers.Dense(...)(x) ``` to @@ -338,7 +338,7 @@ def __init__(self, num_classes, logit_noise=tfp.distributions.Normal, are shared across batch elements. If encountering XLA compilation errors due to dynamic shape inference setting = True may solve. logits_only: Boolean. If True, only return the logits from the __call__ - method. Set True to serialize tf.keras.Sequential models. + method. Set True to serialize tf.python.keras.Sequential models. eps: Float. Clip probabilities into [eps, 1.0] before applying log. dtype: Tensorflow dtype. The dtype of output Tensor and weights associated with the layer. 
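# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: minimal use of the "drop-in
# replacement" pattern described in the MCSoftmaxDense docstring above, under
# the `tf.python.keras` spelling this patch adopts. Batch size, hidden width,
# and class count are hypothetical; assumes the `edward2` package is
# importable as `ed`. `logits_only=True` makes the layer return a single
# logits tensor, as the docstring recommends for Sequential-style models.
import edward2 as ed
import tensorflow as tf

hidden = tf.random.normal([8, 64])  # stand-in for penultimate activations
output_layer = ed.layers.MCSoftmaxDense(num_classes=10, logits_only=True)
logits = output_layer(hidden, training=True)
# ---------------------------------------------------------------------------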
@@ -360,10 +360,10 @@ def __init__(self, num_classes, logit_noise=tfp.distributions.Normal, share_samples_across_batch=share_samples_across_batch, logits_only=logits_only, eps=eps, name=name) - self._loc_layer = tf.keras.layers.Dense( + self._loc_layer = tf.python.keras.layers.Dense( 1 if num_classes == 2 else num_classes, activation=None, kernel_regularizer=loc_regularizer, name='loc_layer', dtype=dtype) - self._scale_layer = tf.keras.layers.Dense( + self._scale_layer = tf.python.keras.layers.Dense( 1 if num_classes == 2 else num_classes, activation=tf.math.softplus, name='scale_layer', dtype=dtype) @@ -391,8 +391,8 @@ def _compute_scale_param(self, inputs): def get_config(self): config = { - 'loc_layer': tf.keras.layers.serialize(self._loc_layer), - 'scale_layer': tf.keras.layers.serialize(self._scale_layer), + 'loc_layer': tf.python.keras.layers.serialize(self._loc_layer), + 'scale_layer': tf.python.keras.layers.serialize(self._scale_layer), } new_config = super().get_config() new_config.update(config) @@ -426,10 +426,10 @@ def __init__(self, num_classes, num_factors, temperature=1.0, num_factors << num_classes => approx to sampling ~ N(mu(x), sigma(x)) This is a MC softmax heteroscedastic drop in replacement for a - tf.keras.layers.Dense output layer. e.g. simply change: + tf.python.keras.layers.Dense output layer. e.g. simply change: ```python - logits = tf.keras.layers.Dense(...)(x) + logits = tf.python.keras.layers.Dense(...)(x) ``` to @@ -465,7 +465,7 @@ def __init__(self, num_classes, num_factors, temperature=1.0, are shared across batch elements. If encountering XLA compilation errors due to dynamic shape inference setting = True may solve. logits_only: Boolean. If True, only return the logits from the __call__ - method. Set True to serialize tf.keras.Sequential models. + method. Set True to serialize tf.python.keras.Sequential models. eps: Float. Clip probabilities into [eps, 1.0] before applying log. dtype: Tensorflow dtype. The dtype of output Tensor and weights associated with the layer. 
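# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: the factor-analysis variant
# documented above targets large label spaces, approximating the latent
# covariance with num_factors << num_classes (low-rank factors plus a
# diagonal; see the scale/diag sublayers in the next hunk). Sizes are
# hypothetical; `parameter_efficient=True` switches to the homoscedastic/
# heteroscedastic scale sublayers instead of a full
# num_classes * num_factors scale layer.
import edward2 as ed
import tensorflow as tf

fa_layer = ed.layers.MCSoftmaxDenseFA(
    num_classes=1000,           # large output space
    num_factors=15,             # num_factors << num_classes
    parameter_efficient=True,
    logits_only=True)
logits = fa_layer(tf.random.normal([8, 64]), training=True)
# ---------------------------------------------------------------------------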
@@ -490,18 +490,18 @@ def __init__(self, num_classes, num_factors, temperature=1.0, self._parameter_efficient = parameter_efficient if parameter_efficient: - self._scale_layer_homoscedastic = tf.keras.layers.Dense( + self._scale_layer_homoscedastic = tf.python.keras.layers.Dense( num_classes, name='scale_layer_homoscedastic', dtype=dtype) - self._scale_layer_heteroscedastic = tf.keras.layers.Dense( + self._scale_layer_heteroscedastic = tf.python.keras.layers.Dense( num_classes, name='scale_layer_heteroscedastic', dtype=dtype) else: - self._scale_layer = tf.keras.layers.Dense( + self._scale_layer = tf.python.keras.layers.Dense( num_classes * num_factors, name='scale_layer', dtype=dtype) - self._loc_layer = tf.keras.layers.Dense( + self._loc_layer = tf.python.keras.layers.Dense( num_classes, kernel_regularizer=loc_regularizer, name='loc_layer', dtype=dtype) - self._diag_layer = tf.keras.layers.Dense( + self._diag_layer = tf.python.keras.layers.Dense( num_classes, activation=tf.math.softplus, name='diag_layer', dtype=dtype) @@ -652,12 +652,12 @@ def get_config(self): } if self._parameter_efficient: - config['scale_layer_homoscedastic'] = tf.keras.layers.serialize( + config['scale_layer_homoscedastic'] = tf.python.keras.layers.serialize( self._scale_layer_homoscedastic) - config['scale_layer_heteroscedastic'] = tf.keras.layers.serialize( + config['scale_layer_heteroscedastic'] = tf.python.keras.layers.serialize( self._scale_layer_heteroscedastic) else: - config['scale_layer'] = tf.keras.layers.serialize(self._scale_layer) + config['scale_layer'] = tf.python.keras.layers.serialize(self._scale_layer) new_config = super().get_config() new_config.update(config) @@ -691,10 +691,10 @@ def __init__(self, num_outputs, num_factors=0, temperature=1.0, num_factors << num_outputs => approx to sampling ~ N(mu(x), sigma(x)). This is a heteroscedastic drop in replacement for a - tf.keras.layers.Dense output layer. e.g. simply change: + tf.python.keras.layers.Dense output layer. e.g. simply change: ```python - logits = tf.keras.layers.Dense(...)(x) + logits = tf.python.keras.layers.Dense(...)(x) ``` to @@ -731,7 +731,7 @@ def __init__(self, num_outputs, num_factors=0, temperature=1.0, are shared across batch elements. If encountering XLA compilation errors due to dynamic shape inference setting = True may solve. logits_only: Boolean. If True, only return the logits from the __call__ - method. Set True to serialize tf.keras.Sequential models. + method. Set True to serialize tf.python.keras.Sequential models. eps: Float. Clip probabilities into [eps, 1.0 - eps] before applying inverse sigmoid. dtype: Tensorflow dtype. 
The dtype of output Tensor and weights associated @@ -755,21 +755,21 @@ def __init__(self, num_outputs, num_factors=0, temperature=1.0, self._parameter_efficient = parameter_efficient self._num_outputs = num_outputs - self._loc_layer = tf.keras.layers.Dense( + self._loc_layer = tf.python.keras.layers.Dense( num_outputs, kernel_regularizer=loc_regularizer, name='loc_layer', dtype=dtype) if num_factors > 0: if parameter_efficient: - self._scale_layer_homoscedastic = tf.keras.layers.Dense( + self._scale_layer_homoscedastic = tf.python.keras.layers.Dense( num_outputs, name='scale_layer_homoscedastic', dtype=dtype) - self._scale_layer_heteroscedastic = tf.keras.layers.Dense( + self._scale_layer_heteroscedastic = tf.python.keras.layers.Dense( num_outputs, name='scale_layer_heteroscedastic', dtype=dtype) else: - self._scale_layer = tf.keras.layers.Dense( + self._scale_layer = tf.python.keras.layers.Dense( num_outputs * num_factors, name='scale_layer', dtype=dtype) - self._diag_layer = tf.keras.layers.Dense( + self._diag_layer = tf.python.keras.layers.Dense( num_outputs, activation=tf.math.softplus, name='diag_layer', bias_initializer='zeros', dtype=dtype) @@ -923,24 +923,24 @@ def get_config(self): 'num_outputs': self._num_outputs, 'num_factors': self._num_factors, 'parameter_efficient': self._parameter_efficient, - 'loc_layer': tf.keras.layers.serialize(self._loc_layer), - 'diag_layer': tf.keras.layers.serialize(self._diag_layer), + 'loc_layer': tf.python.keras.layers.serialize(self._loc_layer), + 'diag_layer': tf.python.keras.layers.serialize(self._diag_layer), } if self._parameter_efficient: - config['scale_layer_homoscedastic'] = tf.keras.layers.serialize( + config['scale_layer_homoscedastic'] = tf.python.keras.layers.serialize( self._scale_layer_homoscedastic) - config['scale_layer_heteroscedastic'] = tf.keras.layers.serialize( + config['scale_layer_heteroscedastic'] = tf.python.keras.layers.serialize( self._scale_layer_heteroscedastic) else: - config['scale_layer'] = tf.keras.layers.serialize(self._scale_layer) + config['scale_layer'] = tf.python.keras.layers.serialize(self._scale_layer) new_config = super().get_config() new_config.update(config) return new_config -class ExactSigmoidDense(tf.keras.layers.Layer): +class ExactSigmoidDense(tf.python.keras.layers.Layer): """Exact diagonal covariance method for binary/multilabel classification.""" def __init__(self, num_outputs, logit_noise=tfp.distributions.Normal, @@ -953,11 +953,11 @@ def __init__(self, num_outputs, logit_noise=tfp.distributions.Normal, exactly. We do not need to make the softmax/sigmoid approximation and we do not need to use Monte Carlo estimation. - This layer is a drop in replacement for a tf.keras.layers.Dense output + This layer is a drop in replacement for a tf.python.keras.layers.Dense output layer for binary and multilabel classification problems, simply change: ```python - logits = tf.keras.layers.Dense(num_outputs, ...)(x) + logits = tf.python.keras.layers.Dense(num_outputs, ...)(x) ``` to @@ -976,7 +976,7 @@ def __init__(self, num_outputs, logit_noise=tfp.distributions.Normal, latent distribution. If experiencing numerical instability during training, increasing this value may help. logits_only: Boolean. If True, only return the logits from the __call__ - method. Set True to serialize tf.keras.Sequential models. + method. Set True to serialize tf.python.keras.Sequential models. dtype: Tensorflow dtype. The dtype of output Tensor and weights associated with the layer. name: String. 
The name of the layer used for name scoping. @@ -994,10 +994,10 @@ def __init__(self, num_outputs, logit_noise=tfp.distributions.Normal, tfp.distributions.Logistic): raise ValueError('logit_noise must be Normal or Logistic') - self._loc_layer = tf.keras.layers.Dense(num_outputs, name='loc_layer', + self._loc_layer = tf.python.keras.layers.Dense(num_outputs, name='loc_layer', dtype=dtype) - self._diag_layer = tf.keras.layers.Dense( + self._diag_layer = tf.python.keras.layers.Dense( num_outputs, activation=tf.math.softplus, name='diag_layer', dtype=dtype) @@ -1046,8 +1046,8 @@ def __call__(self, inputs, training=True): def get_config(self): config = { - 'loc_layer': tf.keras.layers.serialize(self._loc_layer), - 'diag_layer': tf.keras.layers.serialize(self._diag_layer), + 'loc_layer': tf.python.keras.layers.serialize(self._loc_layer), + 'diag_layer': tf.python.keras.layers.serialize(self._diag_layer), 'num_outputs': self._num_outputs, 'logit_noise': self._logit_noise, 'min_scale': self._min_scale, @@ -1059,7 +1059,7 @@ def get_config(self): return new_config -class EnsembleHeteroscedasticOutputs(tf.keras.layers.Layer): +class EnsembleHeteroscedasticOutputs(tf.python.keras.layers.Layer): """Ensembles multiple heteroscedastic output layers.""" def __init__(self, num_classes, layers, ensemble_weighting, @@ -1080,7 +1080,7 @@ def __init__(self, num_classes, layers, ensemble_weighting, Args: num_classes: Integer. Number of classes for classification task. - layers: Tuple of tf.keras.layers.Layer from heteroscedastic.py. + layers: Tuple of tf.python.keras.layers.Layer from heteroscedastic.py. ensemble_weighting: Tuple of len(layers) representing a probability distribution over layers. averaging: String `ensemble_cross_ent` or `gibbs_cross_ent`. For diff --git a/edward2/tensorflow/layers/heteroscedastic_test.py b/edward2/tensorflow/layers/heteroscedastic_test.py index ee36b633..1b4c67bd 100644 --- a/edward2/tensorflow/layers/heteroscedastic_test.py +++ b/edward2/tensorflow/layers/heteroscedastic_test.py @@ -114,7 +114,7 @@ def test_cases(): },) -class Classifier(tf.keras.Model): +class Classifier(tf.python.keras.Model): """Wrapper for classifiers defined below. Handles different architectures and differences between eager/graph execution. 
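# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: combining two heteroscedastic
# output layers with EnsembleHeteroscedasticOutputs, whose arguments are
# documented in the heteroscedastic.py hunk above. The member layers,
# weighting, and averaging rule are hypothetical choices that mirror the
# binary-classification setup used by the tests further below.
import edward2 as ed

member_a = ed.layers.MCSigmoidDenseFA(1)
member_b = ed.layers.ExactSigmoidDense(1)
ensemble_output_layer = ed.layers.EnsembleHeteroscedasticOutputs(
    num_classes=2,
    layers=(member_a, member_b),
    ensemble_weighting=(0.8, 0.2),
    averaging='ensemble_cross_ent')
# ---------------------------------------------------------------------------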
@@ -161,7 +161,7 @@ def call(self, inputs, **kwargs): return self.classifier(inputs, **kwargs) -class DenseClassifier(tf.keras.Model): +class DenseClassifier(tf.python.keras.Model): """Feedforward neural network with MCSoftmaxDense output layer.""" def __init__(self, num_classes, logit_noise=tfp.distributions.Normal, @@ -191,7 +191,7 @@ def __init__(self, num_classes, logit_noise=tfp.distributions.Normal, """ super(DenseClassifier, self).__init__() - self.hidden_layer = tf.keras.layers.Dense(16) + self.hidden_layer = tf.python.keras.layers.Dense(16) self.output_layer = ed.layers.MCSoftmaxDense( num_classes=num_classes, logit_noise=logit_noise, temperature=temperature, train_mc_samples=train_mc_samples, @@ -213,7 +213,7 @@ def call(self, inputs, training=True, seed=None): return self.output_layer(hidden_x, training=training, seed=seed) -class DenseFAClassifier(tf.keras.Model): +class DenseFAClassifier(tf.python.keras.Model): """Feedforward neural network with MCSoftmaxDenseFA output layer.""" def __init__(self, num_classes, num_factors, @@ -244,7 +244,7 @@ def __init__(self, num_classes, num_factors, """ super(DenseFAClassifier, self).__init__() - self.hidden_layer = tf.keras.layers.Dense(16) + self.hidden_layer = tf.python.keras.layers.Dense(16) self.output_layer = ed.layers.MCSoftmaxDenseFA( num_classes=num_classes, num_factors=num_factors, temperature=temperature, parameter_efficient=parameter_efficient, @@ -267,7 +267,7 @@ def call(self, inputs, training=True, seed=None): return self.output_layer(hidden_x, training=training, seed=seed) -class SigmoidDenseFAClassifier(tf.keras.Model): +class SigmoidDenseFAClassifier(tf.python.keras.Model): """Feedforward neural network with MCSigmoidDenseFA output layer.""" def __init__(self, num_classes, num_factors, @@ -298,7 +298,7 @@ def __init__(self, num_classes, num_factors, """ super(SigmoidDenseFAClassifier, self).__init__() - self.hidden_layer = tf.keras.layers.Dense(16) + self.hidden_layer = tf.python.keras.layers.Dense(16) self.output_layer = ed.layers.MCSigmoidDenseFA( 1 if num_classes == 2 else num_classes, num_factors=num_factors, temperature=temperature, parameter_efficient=parameter_efficient, @@ -321,7 +321,7 @@ def call(self, inputs, training=True, seed=None): return self.output_layer(hidden_x, training=training, seed=seed) -class ExactSigmoidDenseClassifier(tf.keras.Model): +class ExactSigmoidDenseClassifier(tf.python.keras.Model): """Feedforward neural network with ExactSigmoidDense output layer.""" def __init__(self, num_classes, logit_noise): @@ -340,7 +340,7 @@ def __init__(self, num_classes, logit_noise): """ super(ExactSigmoidDenseClassifier, self).__init__() - self.hidden_layer = tf.keras.layers.Dense(16) + self.hidden_layer = tf.python.keras.layers.Dense(16) self.output_layer = ed.layers.ExactSigmoidDense( 1 if num_classes == 2 else num_classes, logit_noise=logit_noise) @@ -359,7 +359,7 @@ def call(self, inputs, training=True, seed=None): return self.output_layer(hidden_x, training=training) -class EnsembleClassifier(tf.keras.Model): +class EnsembleClassifier(tf.python.keras.Model): """Feedforward neural network with Ensemble output layer.""" def __init__(self, num_classes, averaging, ensemble_weighting=(0.8, 0.2)): @@ -384,7 +384,7 @@ def __init__(self, num_classes, averaging, ensemble_weighting=(0.8, 0.2)): """ super(EnsembleClassifier, self).__init__() - self.hidden_layer = tf.keras.layers.Dense(16) + self.hidden_layer = tf.python.keras.layers.Dense(16) if num_classes == 2: layer_1 = ed.layers.MCSigmoidDenseFA(1) layer_2 = 
ed.layers.ExactSigmoidDense(1) @@ -507,12 +507,12 @@ def test_train_step(self, logit_noise, num_classes, model_type): classifier = Classifier(model_type, num_classes, logit_noise) if num_classes == 2: - loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True) + loss_fn = tf.python.keras.losses.BinaryCrossentropy(from_logits=True) else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + loss_fn = tf.python.keras.losses.SparseCategoricalCrossentropy(from_logits=True) if tf.executing_eagerly(): - optimizer = tf.keras.optimizers.Adam() + optimizer = tf.python.keras.optimizers.Adam() def train_step(inputs, labels, model): """Defines a single training step: Update weights based on one batch.""" with tf.GradientTape() as tape: diff --git a/edward2/tensorflow/layers/made.py b/edward2/tensorflow/layers/made.py index a680842d..bed0d76f 100644 --- a/edward2/tensorflow/layers/made.py +++ b/edward2/tensorflow/layers/made.py @@ -19,7 +19,7 @@ import tensorflow as tf -class MADE(tf.keras.Model): +class MADE(tf.python.keras.Model): """Masked autoencoder for distribution estimation (Germain et al., 2015). MADE takes as input a real Tensor of shape [..., length, channels] and returns @@ -66,9 +66,9 @@ def __init__(self, self.hidden_dims = hidden_dims self.input_order = input_order self.hidden_order = hidden_order - self.activation = tf.keras.activations.get(activation) + self.activation = tf.python.keras.activations.get(activation) self.use_bias = use_bias - self.network = tf.keras.Sequential([]) + self.network = tf.python.keras.Sequential([]) def build(self, input_shape): input_shape = tf.TensorShape(input_shape) @@ -83,14 +83,14 @@ def build(self, input_shape): hidden_order=self.hidden_order) # Input-to-hidden layer: [..., length, channels] -> [..., hidden_dims[0]]. - self.network.add(tf.keras.layers.Reshape([length * channels])) + self.network.add(tf.python.keras.layers.Reshape([length * channels])) # Tile the mask so each element repeats contiguously; this is compatible # with the autoregressive contraints unlike naive tiling. mask = masks[0] mask = tf.tile(mask[:, tf.newaxis, :], [1, channels, 1]) mask = tf.reshape(mask, [mask.shape[0] * channels, mask.shape[-1]]) if self.hidden_dims: - layer = tf.keras.layers.Dense( + layer = tf.python.keras.layers.Dense( self.hidden_dims[0], kernel_initializer=make_masked_initializer(mask), kernel_constraint=make_masked_constraint(mask), @@ -100,7 +100,7 @@ def build(self, input_shape): # Hidden-to-hidden layers: [..., hidden_dims[l-1]] -> [..., hidden_dims[l]]. 
for l in range(1, len(self.hidden_dims)): - layer = tf.keras.layers.Dense( + layer = tf.python.keras.layers.Dense( self.hidden_dims[l], kernel_initializer=make_masked_initializer(masks[l]), kernel_constraint=make_masked_constraint(masks[l]), @@ -115,14 +115,14 @@ def build(self, input_shape): mask = masks[-1] mask = tf.tile(mask[..., tf.newaxis], [1, 1, self.units]) mask = tf.reshape(mask, [mask.shape[0], mask.shape[1] * self.units]) - layer = tf.keras.layers.Dense( + layer = tf.python.keras.layers.Dense( length * self.units, kernel_initializer=make_masked_initializer(mask), kernel_constraint=make_masked_constraint(mask), activation=None, use_bias=self.use_bias) self.network.add(layer) - self.network.add(tf.keras.layers.Reshape([length, self.units])) + self.network.add(tf.python.keras.layers.Reshape([length, self.units])) self.built = True def call(self, inputs): @@ -217,7 +217,7 @@ def create_masks(input_dim, def make_masked_initializer(mask): - initializer = tf.keras.initializers.GlorotUniform() + initializer = tf.python.keras.initializers.GlorotUniform() def masked_initializer(shape, dtype=None): return mask * initializer(shape, dtype) return masked_initializer diff --git a/edward2/tensorflow/layers/neural_process.py b/edward2/tensorflow/layers/neural_process.py index f606627f..31e0950f 100644 --- a/edward2/tensorflow/layers/neural_process.py +++ b/edward2/tensorflow/layers/neural_process.py @@ -36,9 +36,9 @@ def batch_mlp(inputs, hidden_sizes): hidden = tf.reshape(inputs, (-1, filter_size)) for size in hidden_sizes[:-1]: - hidden = tf.keras.layers.Dense(size, activation=tf.nn.relu)(hidden) + hidden = tf.python.keras.layers.Dense(size, activation=tf.nn.relu)(hidden) - output = tf.keras.layers.Dense(hidden_sizes[-1], activation=None)(hidden) + output = tf.python.keras.layers.Dense(hidden_sizes[-1], activation=None)(hidden) output = tf.reshape(output, (batch_size, -1, hidden_sizes[-1])) return output @@ -125,22 +125,22 @@ def multihead_attention(q, k, v, num_heads=8): d_k = q.shape.as_list()[-1] d_v = v.shape.as_list()[-1] head_size = int(d_v / num_heads) - key_initializer = tf.keras.initializers.RandomNormal(stddev=d_k**-0.5) - value_initializer = tf.keras.initializers.RandomNormal(stddev=d_v**-0.5) + key_initializer = tf.python.keras.initializers.RandomNormal(stddev=d_k**-0.5) + value_initializer = tf.python.keras.initializers.RandomNormal(stddev=d_v**-0.5) rep = tf.constant(0.0) for h in range(num_heads): o = dot_product_attention( - tf.keras.layers.Conv1D( + tf.python.keras.layers.Conv1D( head_size, 1, kernel_initializer=key_initializer, name='wq%d' % h, use_bias=False, padding='VALID')(q), - tf.keras.layers.Conv1D( + tf.python.keras.layers.Conv1D( head_size, 1, kernel_initializer=key_initializer, name='wk%d' % h, use_bias=False, padding='VALID')(k), - tf.keras.layers.Conv1D( + tf.python.keras.layers.Conv1D( head_size, 1, kernel_initializer=key_initializer, name='wv%d' % h, use_bias=False, padding='VALID')(v), normalise=True) - rep += tf.keras.layers.Conv1D(d_v, 1, kernel_initializer=value_initializer, + rep += tf.python.keras.layers.Conv1D(d_v, 1, kernel_initializer=value_initializer, name='wo%d' % h, use_bias=False, padding='VALID')(o) return rep @@ -217,7 +217,7 @@ def __call__(self, x1, x2, r): # TODO(adityagrover): Make the encoder and decoder configurable. 
-class NeuralProcess(tf.keras.Model): +class NeuralProcess(tf.python.keras.Model): """Attentive Neural Process (Kim et al., 2019; Garnelo et al., 2018).""" def __init__(self, @@ -264,11 +264,11 @@ def latent_encoder(self, x, y): per_example_embedding = batch_mlp( encoder_input, self._latent_encoder_sizes) dataset_embedding = tf.reduce_mean(per_example_embedding, axis=1) - hidden = tf.keras.layers.Dense( + hidden = tf.python.keras.layers.Dense( (self._latent_encoder_sizes[-1] + self._num_latents)//2, activation=tf.nn.relu)(dataset_embedding) - loc = tf.keras.layers.Dense(self._num_latents, activation=None)(hidden) - untransformed_scale = tf.keras.layers.Dense(self._num_latents, + loc = tf.python.keras.layers.Dense(self._num_latents, activation=None)(hidden) + untransformed_scale = tf.python.keras.layers.Dense(self._num_latents, activation=None)(hidden) # Constraint scale following Garnelo et al. (2018). scale_diag = 0.1 + 0.9 * tf.sigmoid(untransformed_scale) diff --git a/edward2/tensorflow/layers/neural_process_test.py b/edward2/tensorflow/layers/neural_process_test.py index 4ad8a5d1..db0f3aba 100644 --- a/edward2/tensorflow/layers/neural_process_test.py +++ b/edward2/tensorflow/layers/neural_process_test.py @@ -41,7 +41,7 @@ def train_neural_process(model, Returns: best_loss: (float) Average validation loss of best early-stopped model. """ - optimizer = tf.keras.optimizers.Adam(learning_rate) + optimizer = tf.python.keras.optimizers.Adam(learning_rate) context_x, context_y, target_x, target_y = train_data valid_context_x, valid_context_y, valid_target_x, valid_target_y = valid_data train_data_size = target_x.shape[0] diff --git a/edward2/tensorflow/layers/noise.py b/edward2/tensorflow/layers/noise.py index a9597af0..a172ce9a 100644 --- a/edward2/tensorflow/layers/noise.py +++ b/edward2/tensorflow/layers/noise.py @@ -22,7 +22,7 @@ import tensorflow_probability as tfp -class NCPNormalPerturb(tf.keras.layers.Layer): +class NCPNormalPerturb(tf.python.keras.layers.Layer): """Noise contrastive prior for continuous inputs (Hafner et al., 2018). The layer doubles the inputs' batch size and adds a random normal perturbation @@ -43,14 +43,14 @@ class NCPNormalPerturb(tf.keras.layers.Layer): inputs = keras.Input(shape=(25,)) x = ed.layers.NCPNormalPerturb()(inputs) # double input batch - x = tf.keras.layers.Dense(64, activation='relu')(x) - x = tf.keras.layers.Dense(64, activation='relu')(x) + x = tf.python.keras.layers.Dense(64, activation='relu')(x) + x = tf.python.keras.layers.Dense(64, activation='relu')(x) means = ed.layers.DenseVariationalDropout(1, activation=None)(x) # get mean means = ed.layers.NCPNormalOutput(labels)(means) # halve input batch - stddevs = tf.keras.layers.Dense(1, activation='softplus')(x[:batch_size]) - outputs = tf.keras.layers.Lambda(lambda x: ed.Normal(x[0], x[1]))([means, + stddevs = tf.python.keras.layers.Dense(1, activation='softplus')(x[:batch_size]) + outputs = tf.python.keras.layers.Lambda(lambda x: ed.Normal(x[0], x[1]))([means, stddevs]) - model = tf.keras.Model(inputs=inputs, outputs=outputs) + model = tf.python.keras.Model(inputs=inputs, outputs=outputs) # Run training loop. num_steps = 1000 @@ -89,7 +89,7 @@ def call(self, inputs): return tf.concat([inputs, perturbed_inputs], 0) -class NCPCategoricalPerturb(tf.keras.layers.Layer): +class NCPCategoricalPerturb(tf.python.keras.layers.Layer): """Noise contrastive prior for discrete inputs (Hafner et al., 2018). 
The layer doubles the inputs' batch size and randomly flips categories @@ -111,14 +111,14 @@ class NCPCategoricalPerturb(tf.keras.layers.Layer): inputs = keras.Input(shape=(25,)) x = ed.layers.NCPCategoricalPerturb(10)(inputs) # double input batch - x = tf.keras.layers.Dense(64, activation='relu')(x) - x = tf.keras.layers.Dense(64, activation='relu')(x) + x = tf.python.keras.layers.Dense(64, activation='relu')(x) + x = tf.python.keras.layers.Dense(64, activation='relu')(x) means = ed.layers.DenseVariationalDropout(1, activation=None)(x) # get mean means = ed.layers.NCPNormalOutput(labels)(means) # halve input batch - stddevs = tf.keras.layers.Dense(1, activation='softplus')(x[:batch_size]) - outputs = tf.keras.layers.Lambda(lambda x: ed.Normal(x[0], x[1]))([means, + stddevs = tf.python.keras.layers.Dense(1, activation='softplus')(x[:batch_size]) + outputs = tf.python.keras.layers.Lambda(lambda x: ed.Normal(x[0], x[1]))([means, stddevs]) - model = tf.keras.Model(inputs=inputs, outputs=outputs) + model = tf.python.keras.Model(inputs=inputs, outputs=outputs) # Run training loop. num_steps = 1000 @@ -162,7 +162,7 @@ def call(self, inputs): return tf.concat([inputs, flipped_inputs], 0) -class NCPNormalOutput(tf.keras.layers.Layer): +class NCPNormalOutput(tf.python.keras.layers.Layer): """Noise contrastive prior for continuous outputs (Hafner et al., 2018). The layer returns the first half of the inputs' batch. It computes a KL @@ -188,14 +188,14 @@ class NCPNormalOutput(tf.keras.layers.Layer): inputs = keras.Input(shape=(25,)) x = ed.layers.NCPNormalPerturb()(inputs) # double input batch - x = tf.keras.layers.Dense(64, activation='relu')(x) - x = tf.keras.layers.Dense(64, activation='relu')(x) + x = tf.python.keras.layers.Dense(64, activation='relu')(x) + x = tf.python.keras.layers.Dense(64, activation='relu')(x) means = ed.layers.DenseVariationalDropout(1, activation=None)(x) # get mean means = ed.layers.NCPNormalOutput(labels)(means) # halve input batch - stddevs = tf.keras.layers.Dense(1, activation='softplus')(x[:batch_size]) - outputs = tf.keras.layers.Lambda(lambda x: ed.Normal(x[0], x[1]))([means, + stddevs = tf.python.keras.layers.Dense(1, activation='softplus')(x[:batch_size]) + outputs = tf.python.keras.layers.Lambda(lambda x: ed.Normal(x[0], x[1]))([means, stddevs]) - model = tf.keras.Model(inputs=inputs, outputs=outputs) + model = tf.python.keras.Model(inputs=inputs, outputs=outputs) # Run training loop. num_steps = 1000 diff --git a/edward2/tensorflow/layers/normalization.py b/edward2/tensorflow/layers/normalization.py index 672a75f8..21ef4bf8 100644 --- a/edward2/tensorflow/layers/normalization.py +++ b/edward2/tensorflow/layers/normalization.py @@ -37,7 +37,7 @@ import tensorflow.compat.v1 as tf1 -class ActNorm(tf.keras.layers.Layer): +class ActNorm(tf.python.keras.layers.Layer): """Actnorm, an affine reversible layer (Prafulla and Kingma, 2018). Weights use data-dependent initialization in which outputs have zero mean @@ -45,7 +45,7 @@ class ActNorm(tf.keras.layers.Layer): are computed from the first batch of inputs. """ - def __init__(self, epsilon=tf.keras.backend.epsilon(), **kwargs): + def __init__(self, epsilon=tf.python.keras.backend.epsilon(), **kwargs): super(ActNorm, self).__init__(**kwargs) self.epsilon = epsilon @@ -114,18 +114,18 @@ def ensemble_batchnorm(x, ensemble_size=1, use_tpu=True, **kwargs): # BatchNormalization layer for all ensemble member. This is not correct in # math but works in practice. 
if ensemble_size == 1 or use_tpu: - return tf.keras.layers.BatchNormalization(**kwargs)(x) + return tf.python.keras.layers.BatchNormalization(**kwargs)(x) name = kwargs.get('name') split_inputs = tf.split(x, ensemble_size, axis=0) for i in range(ensemble_size): if name is not None: kwargs['name'] = name + '_{}'.format(i) - split_inputs[i] = tf.keras.layers.BatchNormalization(**kwargs)( + split_inputs[i] = tf.python.keras.layers.BatchNormalization(**kwargs)( split_inputs[i]) return tf.concat(split_inputs, axis=0) -class EnsembleSyncBatchNorm(tf.keras.layers.Layer): +class EnsembleSyncBatchNorm(tf.python.keras.layers.Layer): """BatchNorm that averages over ALL replicas. Only works for `NHWC` inputs.""" def __init__(self, axis=3, ensemble_size=1, momentum=0.99, epsilon=0.001, @@ -281,7 +281,7 @@ def call(self, inputs, training=None): return x -class SpectralNormalization(tf.keras.layers.Wrapper): +class SpectralNormalization(tf.python.keras.layers.Wrapper): """Implements spectral normalization for Dense layer.""" def __init__(self, @@ -295,7 +295,7 @@ def __init__(self, """Initializer. Args: - layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to. + layer: (tf.python.keras.layers.Layer) A TF Keras layer to apply normalization to. iteration: (int) The number of power iteration to perform to estimate weight matrix's singular value. norm_multiplier: (float) Multiplicative constant to threshold the @@ -319,8 +319,8 @@ def __init__(self, if inhere_layer_name: wrapper_name = layer.name - if not isinstance(layer, tf.keras.layers.Layer): - raise ValueError('`layer` must be a `tf.keras.layer.Layer`. ' + if not isinstance(layer, tf.python.keras.layers.Layer): + raise ValueError('`layer` must be a `tf.python.keras.layer.Layer`. ' 'Observed `{}`'.format(layer)) super(SpectralNormalization, self).__init__( layer, name=wrapper_name, **kwargs) @@ -394,7 +394,7 @@ def restore_weights(self): return self.layer.kernel.assign(self.w) -class SpectralNormalizationConv2D(tf.keras.layers.Wrapper): +class SpectralNormalizationConv2D(tf.python.keras.layers.Wrapper): """Implements spectral normalization for Conv2D layer based on [3].""" def __init__(self, @@ -408,7 +408,7 @@ def __init__(self, """Initializer. Args: - layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to. + layer: (tf.python.keras.layers.Layer) A TF Keras layer to apply normalization to. iteration: (int) The number of power iteration to perform to estimate weight matrix's singular value. norm_multiplier: (float) Multiplicative constant to threshold the @@ -433,9 +433,9 @@ def __init__(self, # Set layer attributes. layer._name += '_spec_norm' - if not isinstance(layer, tf.keras.layers.Conv2D): + if not isinstance(layer, tf.python.keras.layers.Conv2D): raise ValueError( - 'layer must be a `tf.keras.layer.Conv2D` instance. You passed: {input}' + 'layer must be a `tf.python.keras.layer.Conv2D` instance. 
You passed: {input}' .format(input=layer)) super(SpectralNormalizationConv2D, self).__init__(layer, **kwargs) diff --git a/edward2/tensorflow/layers/normalization_test.py b/edward2/tensorflow/layers/normalization_test.py index 1387a50a..ed831ca8 100644 --- a/edward2/tensorflow/layers/normalization_test.py +++ b/edward2/tensorflow/layers/normalization_test.py @@ -27,8 +27,8 @@ import numpy as np import tensorflow as tf -DenseLayer = tf.keras.layers.Dense(10) -Conv2DLayer = tf.keras.layers.Conv2D(filters=64, kernel_size=3, padding='valid') +DenseLayer = tf.python.keras.layers.Dense(10) +Conv2DLayer = tf.python.keras.layers.Conv2D(filters=64, kernel_size=3, padding='valid') def _compute_spectral_norm(weight): diff --git a/edward2/tensorflow/layers/random_feature.py b/edward2/tensorflow/layers/random_feature.py index cdabf1ca..8fbdd0f6 100644 --- a/edward2/tensorflow/layers/random_feature.py +++ b/edward2/tensorflow/layers/random_feature.py @@ -31,7 +31,7 @@ _SUPPORTED_LIKELIHOOD = ('binary_logistic', 'poisson', 'gaussian') -class RandomFeatureGaussianProcess(tf.keras.layers.Layer): +class RandomFeatureGaussianProcess(tf.python.keras.layers.Layer): """Gaussian process layer with random feature approximation. During training, the model updates the maximum a posteriori (MAP) logits @@ -103,7 +103,7 @@ def __init__(self, by sqrt(2. / num_inducing). return_random_features: (bool) Whether to also return random features. use_custom_random_features: (bool) Whether to use custom random - features implemented using tf.keras.layers.Dense. + features implemented using tf.python.keras.layers.Dense. custom_random_features_initializer: (str or callable) Initializer for the random features. Default to random normal which approximates a RBF kernel function if activation function is cos. @@ -156,8 +156,8 @@ def __init__(self, if self.custom_random_features_activation is None: self.custom_random_features_activation = tf.math.cos - self.dense_layer = tf.keras.layers.Dense - self.input_normalization_layer = tf.keras.layers.LayerNormalization + self.dense_layer = tf.python.keras.layers.Dense + self.input_normalization_layer = tf.python.keras.layers.LayerNormalization def build(self, input_shape): self._build_sublayer_classes() @@ -176,7 +176,7 @@ def build(self, input_shape): trainable=False, name='gp_random_feature') else: - self._random_feature = tf.keras.layers.experimental.RandomFourierFeatures( + self._random_feature = tf.python.keras.layers.experimental.RandomFourierFeatures( output_dim=self.num_inducing, kernel_initializer=self.gp_kernel_type, scale=self.gp_kernel_scale, @@ -193,7 +193,7 @@ def build(self, input_shape): self._gp_output_layer = self.dense_layer( units=self.units, use_bias=False, - kernel_regularizer=tf.keras.regularizers.l2(self.l2_regularization), + kernel_regularizer=tf.python.keras.regularizers.l2(self.l2_regularization), dtype=self.dtype, name='gp_output_weights', **self.gp_output_kwargs) @@ -206,9 +206,9 @@ def build(self, input_shape): def _build_sublayer_classes(self): """Defines sublayer classes.""" self.bias_layer = tf.Variable - self.dense_layer = tf.keras.layers.Dense + self.dense_layer = tf.python.keras.layers.Dense self.covariance_layer = LaplaceRandomFeatureCovariance - self.input_normalization_layer = tf.keras.layers.LayerNormalization + self.input_normalization_layer = tf.python.keras.layers.LayerNormalization def reset_covariance_matrix(self): """Resets covariance matrix of the GP layer. 
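# ---------------------------------------------------------------------------
# Editorial sketch, not part of this patch: basic use of the random-feature
# Gaussian process layer defined above. The layer returns GP logits together
# with a posterior covariance over the batch, and reset_covariance_matrix()
# restarts the Laplace covariance estimate (for example, before re-estimating
# it over a fresh epoch). Shapes and the reset policy are hypothetical.
import edward2 as ed
import tensorflow as tf

gp_layer = ed.layers.RandomFeatureGaussianProcess(units=10)
features = tf.random.normal([8, 128])     # stand-in penultimate features
gp_logits, gp_covmat = gp_layer(features)
gp_layer.reset_covariance_matrix()
# ---------------------------------------------------------------------------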
@@ -248,7 +248,7 @@ def call(self, inputs, global_step=None, training=None): return gp_output, gp_covmat -class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer): +class LaplaceRandomFeatureCovariance(tf.python.keras.layers.Layer): """Computes the Gaussian Process covariance using Laplace method. At training time, this layer updates the Gaussian process posterior using @@ -301,7 +301,7 @@ def build(self, input_shape): name='gp_precision_matrix', shape=(gp_feature_dim, gp_feature_dim), dtype=self.dtype, - initializer=tf.keras.initializers.Identity(self.ridge_penalty), + initializer=tf.python.keras.initializers.Identity(self.ridge_penalty), trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)) @@ -395,7 +395,7 @@ def compute_predictive_covariance(self, gp_feature): def _get_training_value(self, training=None): if training is None: - training = tf.keras.backend.learning_phase() + training = tf.python.keras.backend.learning_phase() if isinstance(training, int): training = bool(training) diff --git a/edward2/tensorflow/layers/random_feature_test.py b/edward2/tensorflow/layers/random_feature_test.py index cb8456d4..ec7e4a2e 100644 --- a/edward2/tensorflow/layers/random_feature_test.py +++ b/edward2/tensorflow/layers/random_feature_test.py @@ -172,9 +172,9 @@ def test_state_saving_and_loading(self): input_data = np.random.random((1, 2)) rfgp_model = ed.layers.RandomFeatureGaussianProcess(units=1) - inputs = tf.keras.Input((2,), batch_size=1) + inputs = tf.python.keras.Input((2,), batch_size=1) outputs = rfgp_model(inputs) - model = tf.keras.Model(inputs, outputs) + model = tf.python.keras.Model(inputs, outputs) gp_output, gp_covmat = model.predict(input_data) # Save and then load the model. @@ -182,7 +182,7 @@ def test_state_saving_and_loading(self): self.addCleanup(shutil.rmtree, temp_dir) saved_model_dir = os.path.join(temp_dir, 'rfgp_model') model.save(saved_model_dir) - new_model = tf.keras.models.load_model(saved_model_dir) + new_model = tf.python.keras.models.load_model(saved_model_dir) gp_output_new, gp_covmat_new = new_model.predict(input_data) self.assertAllClose(gp_output, gp_output_new, atol=1e-4) diff --git a/edward2/tensorflow/layers/recurrent.py b/edward2/tensorflow/layers/recurrent.py index a0ad9b35..25dc7e98 100644 --- a/edward2/tensorflow/layers/recurrent.py +++ b/edward2/tensorflow/layers/recurrent.py @@ -28,7 +28,7 @@ @utils.add_weight -class LSTMCellReparameterization(tf.keras.layers.LSTMCell): +class LSTMCellReparameterization(tf.python.keras.layers.LSTMCell): """Bayesian LSTM cell class estimated via reparameterization. 
The layer computes a variational Bayesian approximation to the distribution @@ -114,11 +114,11 @@ def build( if self.use_bias: if (self.unit_forget_bias and not isinstance(self.bias_initializer, - tf.keras.layers.Layer)): + tf.python.keras.layers.Layer)): def bias_initializer(_, *args, **kwargs): - return tf.keras.backend.concatenate([ + return tf.python.keras.backend.concatenate([ self.bias_initializer((self.units,), *args, **kwargs), - tf.keras.initializers.Ones()((self.units,), *args, **kwargs), + tf.python.keras.initializers.Ones()((self.units,), *args, **kwargs), self.bias_initializer((self.units * 2,), *args, **kwargs), ]) else: @@ -146,12 +146,12 @@ def call(self, *args, **kwargs): def call_weights(self): """Calls any weights if the initializer is itself a layer.""" - if isinstance(self.kernel_initializer, tf.keras.layers.Layer): + if isinstance(self.kernel_initializer, tf.python.keras.layers.Layer): self.kernel = self.kernel_initializer(self.kernel.shape, self.dtype) - if isinstance(self.recurrent_initializer, tf.keras.layers.Layer): + if isinstance(self.recurrent_initializer, tf.python.keras.layers.Layer): self.recurrent_kernel = self.recurrent_initializer( self.recurrent_kernel.shape, self.dtype) - if isinstance(self.bias_initializer, tf.keras.layers.Layer): + if isinstance(self.bias_initializer, tf.python.keras.layers.Layer): self.bias = self.bias_initializer(self.bias.shape, self.dtype) self.called_weights = True @@ -170,13 +170,13 @@ def _compute_carry_and_output(self, x, h_tm1, c_tm1): # of distributions. recurrent_kernel = tf.convert_to_tensor(self.recurrent_kernel) i = self.recurrent_activation( - x_i + tf.keras.backend.dot(h_tm1_i, recurrent_kernel[:, :self.units])) - f = self.recurrent_activation(x_f + tf.keras.backend.dot( + x_i + tf.python.keras.backend.dot(h_tm1_i, recurrent_kernel[:, :self.units])) + f = self.recurrent_activation(x_f + tf.python.keras.backend.dot( h_tm1_f, recurrent_kernel[:, self.units:self.units * 2])) - c = f * c_tm1 + i * self.activation(x_c + tf.keras.backend.dot( + c = f * c_tm1 + i * self.activation(x_c + tf.python.keras.backend.dot( h_tm1_c, recurrent_kernel[:, self.units * 2:self.units * 3])) o = self.recurrent_activation( - x_o + tf.keras.backend.dot( + x_o + tf.python.keras.backend.dot( h_tm1_o, recurrent_kernel[:, self.units * 3:])) return c, o @@ -253,14 +253,14 @@ def _compute_carry_and_output(self, x, h_tm1, c_tm1): p_i, p_f, p_c, p_o = tf.split(perturbation, num_or_size_splits=4, axis=1) so_i, so_f, so_c, so_o = tf.split(self.recurrent_sign_output, num_or_size_splits=4, axis=1) - z0 = (x_i + tf.keras.backend.dot(h_tm1_i, k_i) + - tf.keras.backend.dot(h_tm1_i * self.recurrent_sign_input, p_i) * so_i) - z1 = (x_f + tf.keras.backend.dot(h_tm1_f, k_f) + - tf.keras.backend.dot(h_tm1_f * self.recurrent_sign_input, p_f) * so_f) - z2 = (x_c + tf.keras.backend.dot(h_tm1_c, k_c) + - tf.keras.backend.dot(h_tm1_c * self.recurrent_sign_input, p_c) * so_c) - z3 = (x_o + tf.keras.backend.dot(h_tm1_o, k_o) + - tf.keras.backend.dot(h_tm1_o * self.recurrent_sign_input, p_o) * so_o) + z0 = (x_i + tf.python.keras.backend.dot(h_tm1_i, k_i) + + tf.python.keras.backend.dot(h_tm1_i * self.recurrent_sign_input, p_i) * so_i) + z1 = (x_f + tf.python.keras.backend.dot(h_tm1_f, k_f) + + tf.python.keras.backend.dot(h_tm1_f * self.recurrent_sign_input, p_f) * so_f) + z2 = (x_c + tf.python.keras.backend.dot(h_tm1_c, k_c) + + tf.python.keras.backend.dot(h_tm1_c * self.recurrent_sign_input, p_c) * so_c) + z3 = (x_o + tf.python.keras.backend.dot(h_tm1_o, k_o) + + 
tf.python.keras.backend.dot(h_tm1_o * self.recurrent_sign_input, p_o) * so_o) i = self.recurrent_activation(z0) f = self.recurrent_activation(z1) c = f * c_tm1 + i * self.activation(z2) @@ -301,21 +301,21 @@ def call(self, inputs, states, training=None): p_i, p_f, p_c, p_o = tf.split(perturbation, num_or_size_splits=4, axis=1) so_i, so_f, so_c, so_o = tf.split(self.sign_output, num_or_size_splits=4, axis=1) - x_i = (tf.keras.backend.dot(inputs_i, k_i) + - tf.keras.backend.dot(inputs_i * self.sign_input, p_i) * so_i) - x_f = (tf.keras.backend.dot(inputs_f, k_f) + - tf.keras.backend.dot(inputs_f * self.sign_input, p_f) * so_f) - x_c = (tf.keras.backend.dot(inputs_c, k_c) + - tf.keras.backend.dot(inputs_c * self.sign_input, p_c) * so_c) - x_o = (tf.keras.backend.dot(inputs_o, k_o) + - tf.keras.backend.dot(inputs_o * self.sign_input, p_o) * so_o) + x_i = (tf.python.keras.backend.dot(inputs_i, k_i) + + tf.python.keras.backend.dot(inputs_i * self.sign_input, p_i) * so_i) + x_f = (tf.python.keras.backend.dot(inputs_f, k_f) + + tf.python.keras.backend.dot(inputs_f * self.sign_input, p_f) * so_f) + x_c = (tf.python.keras.backend.dot(inputs_c, k_c) + + tf.python.keras.backend.dot(inputs_c * self.sign_input, p_c) * so_c) + x_o = (tf.python.keras.backend.dot(inputs_o, k_o) + + tf.python.keras.backend.dot(inputs_o * self.sign_input, p_o) * so_o) if self.use_bias: b_i, b_f, b_c, b_o = tf.split( self.bias, num_or_size_splits=4, axis=0) - x_i = tf.keras.backend.bias_add(x_i, b_i) - x_f = tf.keras.backend.bias_add(x_f, b_f) - x_c = tf.keras.backend.bias_add(x_c, b_c) - x_o = tf.keras.backend.bias_add(x_o, b_o) + x_i = tf.python.keras.backend.bias_add(x_i, b_i) + x_f = tf.python.keras.backend.bias_add(x_f, b_f) + x_c = tf.python.keras.backend.bias_add(x_c, b_c) + x_o = tf.python.keras.backend.bias_add(x_o, b_o) if 0 < self.recurrent_dropout < 1.: h_tm1_i = h_tm1 * rec_dp_mask[0] @@ -335,18 +335,18 @@ def call(self, inputs, states, training=None): inputs = inputs * dp_mask[0] kernel_mean = self.kernel.distribution.mean() perturbation = self.kernel - kernel_mean - z = tf.keras.backend.dot(inputs, kernel_mean) - z += tf.keras.backend.dot(inputs * self.sign_input, + z = tf.python.keras.backend.dot(inputs, kernel_mean) + z += tf.python.keras.backend.dot(inputs * self.sign_input, perturbation) * self.sign_output if 0. < self.recurrent_dropout < 1.: h_tm1 = h_tm1 * rec_dp_mask[0] recurrent_kernel_mean = self.recurrent_kernel.distribution.mean() perturbation = self.recurrent_kernel - recurrent_kernel_mean - z += tf.keras.backend.dot(h_tm1, recurrent_kernel_mean) - z += tf.keras.backend.dot(h_tm1 * self.recurrent_sign_input, + z += tf.python.keras.backend.dot(h_tm1, recurrent_kernel_mean) + z += tf.python.keras.backend.dot(h_tm1 * self.recurrent_sign_input, perturbation) * self.recurrent_sign_output if self.use_bias: - z = tf.keras.backend.bias_add(z, self.bias) + z = tf.python.keras.backend.bias_add(z, self.bias) z = tf.split(z, num_or_size_splits=4, axis=1) c, o = self._compute_carry_and_output_fused(z, c_tm1) @@ -356,7 +356,7 @@ def call(self, inputs, states, training=None): @utils.add_weight -class LSTMCellRank1(tf.keras.layers.LSTMCell): +class LSTMCellRank1(tf.python.keras.layers.LSTMCell): """A rank-1 Bayesian neural net LSTM cell layer (Dusenberry et al., 2020). 
The layer computes a variational Bayesian approximation to the distribution @@ -588,12 +588,12 @@ def build(self, input_shape): if self.use_bias: if (self.unit_forget_bias and not isinstance(self.bias_initializer, - tf.keras.layers.Layer)): + tf.python.keras.layers.Layer)): def bias_initializer(_, *args, **kwargs): return tf.concat([ self.bias_initializer([self.ensemble_size, self.units], *args, **kwargs), - tf.keras.initializers.Ones()([self.ensemble_size, self.units], + tf.python.keras.initializers.Ones()([self.ensemble_size, self.units], *args, **kwargs), self.bias_initializer([self.ensemble_size, self.units * 2], *args, **kwargs), @@ -645,7 +645,7 @@ def _sample_weights(self, inputs=None, batch_size=None, dtype=None): # Sample parameters for each input example. def sample(weight_variable, weight_initializer, shape): - if isinstance(weight_initializer, tf.keras.layers.Layer): + if isinstance(weight_initializer, tf.python.keras.layers.Layer): weights = weight_initializer( shape, self.dtype).distribution.sample(examples_per_model) weights = tf.transpose(weights, [1, 0, 2]) @@ -759,8 +759,8 @@ def get_config(self): """Returns the configuration for the layer.""" config = { 'units': self.units, - 'activation': tf.keras.activations.serialize(self.activation), - 'recurrent_activation': tf.keras.activations.serialize( + 'activation': tf.python.keras.activations.serialize(self.activation), + 'recurrent_activation': tf.python.keras.activations.serialize( self.recurrent_activation), 'use_bias': self.use_bias, 'alpha_initializer': initializers.serialize(self.alpha_initializer), diff --git a/edward2/tensorflow/layers/recurrent_test.py b/edward2/tensorflow/layers/recurrent_test.py index 47075a04..fe250f3a 100644 --- a/edward2/tensorflow/layers/recurrent_test.py +++ b/edward2/tensorflow/layers/recurrent_test.py @@ -151,7 +151,7 @@ def testLSTMCellLoss(self, lstm_cell): cell(features[:, 0, :], state) # ensure robustness after multiple calls cell.get_initial_state(features[:, 0, :]) cell(features[:, 0, :], state) # ensure robustness after multiple calls - nll = tf.keras.losses.mean_squared_error(labels, predictions) + nll = tf.python.keras.losses.mean_squared_error(labels, predictions) kl = sum(cell.losses) variables = [ @@ -176,7 +176,7 @@ def testLSTMCellLoss(self, lstm_cell): with tf.GradientTape(persistent=True) as tape: cell.get_initial_state(features[:, 0, :]) predictions, _ = cell(features[:, 0, :], state) # build is not called - nll = tf.keras.losses.mean_squared_error(labels, predictions) + nll = tf.python.keras.losses.mean_squared_error(labels, predictions) kl = sum(cell.losses) variables = [ @@ -207,8 +207,8 @@ def testLSTMCellModel(self, lstm_cell): hidden_size = 10 inputs = np.random.rand(batch_size, timesteps, dim).astype(np.float32) cell = lstm_cell(hidden_size) - model = tf.keras.Sequential([ - tf.keras.layers.RNN(cell, return_sequences=True) + model = tf.python.keras.Sequential([ + tf.python.keras.layers.RNN(cell, return_sequences=True) ]) outputs1 = model(inputs) outputs2 = model(inputs) @@ -246,7 +246,7 @@ def testLSTMCellRank1BatchEnsemble(self, alpha_initializer, gamma_initializer, recurrent_gamma_initializer, bias_initializer, use_bias, implementation, use_additive_perturbation): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time ensemble_size = 4 examples_per_model = 4 input_dim = 5 @@ -323,7 +323,7 @@ def testLSTMCellRank1AlphaGamma(self, alpha_initializer, gamma_initializer, recurrent_alpha_initializer, 
recurrent_gamma_initializer, implementation, use_additive_perturbation): - tf.keras.backend.set_learning_phase(1) # training time + tf.python.keras.backend.set_learning_phase(1) # training time ensemble_size = 4 batch_size = 5 * ensemble_size output_dim = 4 @@ -363,8 +363,8 @@ def testLSTMCellRank1Model(self, ensemble_size, implementation, hidden_size, use_bias=use_bias, implementation=implementation, use_additive_perturbation=use_additive_perturbation, ensemble_size=ensemble_size) - model = tf.keras.Sequential([ - tf.keras.layers.RNN(cell, return_sequences=True) + model = tf.python.keras.Sequential([ + tf.python.keras.layers.RNN(cell, return_sequences=True) ]) outputs1 = model(inputs) diff --git a/edward2/tensorflow/layers/stochastic_output.py b/edward2/tensorflow/layers/stochastic_output.py index 9258235e..700ba5b3 100644 --- a/edward2/tensorflow/layers/stochastic_output.py +++ b/edward2/tensorflow/layers/stochastic_output.py @@ -37,12 +37,12 @@ dataset = tf.data.Dataset.from_tensor_slices((numpy_features, numpy_labels)) dataset = dataset.repeat().batch(batch_size) -model = tf.keras.Sequential([ - tf.keras.layers.Dense(num_classes), - tf.keras.layers.Lambda(lambda inputs: ed.Categorical(logits=inputs)), +model = tf.python.keras.Sequential([ + tf.python.keras.layers.Dense(num_classes), + tf.python.keras.layers.Lambda(lambda inputs: ed.Categorical(logits=inputs)), ]) -model.compile(tf.keras.optimizers.Adam(0.1), +model.compile(tf.python.keras.optimizers.Adam(0.1), loss=lambda y_true, y_pred: -y_pred.distribution.log_prob(y_true)) model.fit(dataset, steps_per_epoch=dataset_size // batch_size, @@ -56,7 +56,7 @@ import tensorflow as tf -class MixtureLogistic(tf.keras.layers.Layer): +class MixtureLogistic(tf.python.keras.layers.Layer): """Stochastic output layer, distributed as a mixture of logistics. 
Given an input tensor of shape [..., input_dim], the output layer returns @@ -74,7 +74,7 @@ def __init__(self, self.logits_constraint = constraints.get(logits_constraint) self.loc_constraint = constraints.get(loc_constraint) self.scale_constraint = constraints.get(scale_constraint) - self.layer = tf.keras.layers.Dense(num_components * 3) + self.layer = tf.python.keras.layers.Dense(num_components * 3) def build(self, input_shape=None): self.layer.build(input_shape) diff --git a/edward2/tensorflow/layers/stochastic_output_test.py b/edward2/tensorflow/layers/stochastic_output_test.py index 4bd2d174..c7d0b1b4 100644 --- a/edward2/tensorflow/layers/stochastic_output_test.py +++ b/edward2/tensorflow/layers/stochastic_output_test.py @@ -27,8 +27,8 @@ def testMixtureLogistic(self): batch_size = 3 features = np.random.rand(batch_size, 4).astype(np.float32) labels = np.random.rand(batch_size).astype(np.float32) - model = tf.keras.Sequential([ - tf.keras.layers.Dense(2, activation=None), + model = tf.python.keras.Sequential([ + tf.python.keras.layers.Dense(2, activation=None), ed.layers.MixtureLogistic(5), ]) outputs = model(features) diff --git a/edward2/tensorflow/layers/utils.py b/edward2/tensorflow/layers/utils.py index 70b5be7a..fd50de16 100644 --- a/edward2/tensorflow/layers/utils.py +++ b/edward2/tensorflow/layers/utils.py @@ -52,10 +52,10 @@ def _add_weight(self, self.tracked_add_weight_dependencies = [] self.tracked_add_weight_dependencies.append((regularizer, initializer)) - if isinstance(regularizer, tf.keras.layers.Layer): + if isinstance(regularizer, tf.python.keras.layers.Layer): if not regularizer.built: regularizer.build(shape) - if isinstance(initializer, tf.keras.layers.Layer): + if isinstance(initializer, tf.python.keras.layers.Layer): with tf.name_scope(name): weight = initializer(shape, dtype) if regularizer is not None: diff --git a/edward2/tensorflow/layers/utils_test.py b/edward2/tensorflow/layers/utils_test.py index f31efac1..0bb1de9e 100644 --- a/edward2/tensorflow/layers/utils_test.py +++ b/edward2/tensorflow/layers/utils_test.py @@ -24,7 +24,7 @@ class UtilsTest(parameterized.TestCase, tf.test.TestCase): def testAddWeightWithTrainableInitializer(self): - dense_wrapped = ed.layers.utils.add_weight(tf.keras.layers.Dense) + dense_wrapped = ed.layers.utils.add_weight(tf.python.keras.layers.Dense) initializer = ed.initializers.get('trainable_normal') layer = dense_wrapped(2, kernel_initializer=initializer, name='dense') inputs = tf.random.normal([1, 3]) @@ -39,7 +39,7 @@ def testAddWeightWithTrainableInitializer(self): self.assertEqual(layer_weights_names[2], 'dense/kernel/stddev:0') def testAddWeightWithTrainableRegularizer(self): - dense_wrapped = ed.layers.utils.add_weight(tf.keras.layers.Dense) + dense_wrapped = ed.layers.utils.add_weight(tf.python.keras.layers.Dense) regularizer = ed.regularizers.get('trainable_normal_kl_divergence_stddev') layer = dense_wrapped(2, kernel_regularizer=regularizer) inputs = tf.random.normal([1, 3]) diff --git a/edward2/tensorflow/regularizers.py b/edward2/tensorflow/regularizers.py index 6ec9a80d..7283030d 100644 --- a/edward2/tensorflow/regularizers.py +++ b/edward2/tensorflow/regularizers.py @@ -15,7 +15,7 @@ """Regularizers. -This module extends `tf.keras.regularizers` with two features: +This module extends `tf.python.keras.regularizers` with two features: 1. Regularizers which compute using any weight random variables' distribution. For example, consider a regularizer which computes an analytic KL @@ -24,7 +24,7 @@ parameters. 
For example, consider a weight regularizer which computes a KL divergence from the weights towards a learnable prior. -One subtlety is how `tf.keras.constraints` are used on the parameters of +One subtlety is how `tf.python.keras.constraints` are used on the parameters of trainable regularizers. Typically, Keras constraints are used with projected gradient descent, where one performs unconstrained optimization and then applies a projection (the constraint) after each gradient update. To stay in line with @@ -39,7 +39,7 @@ import tensorflow as tf -class CauchyKLDivergence(tf.keras.regularizers.Regularizer): +class CauchyKLDivergence(tf.python.keras.regularizers.Regularizer): """KL divergence regularizer from an input to the Cauchy distribution.""" def __init__(self, loc=0., scale=1., scale_factor=1.): @@ -68,7 +68,7 @@ def get_config(self): } -class HalfCauchyKLDivergence(tf.keras.regularizers.Regularizer): +class HalfCauchyKLDivergence(tf.python.keras.regularizers.Regularizer): """KL divergence regularizer from an input to the half-Cauchy distribution.""" def __init__(self, loc=0., scale=1., scale_factor=1.): @@ -97,7 +97,7 @@ def get_config(self): } -class LogUniformKLDivergence(tf.keras.regularizers.Regularizer): +class LogUniformKLDivergence(tf.python.keras.regularizers.Regularizer): """KL divergence regularizer from an input to the log-uniform distribution.""" def __init__(self, scale_factor=1.): @@ -115,7 +115,7 @@ def __call__(self, x): mean = x.distribution.mean() log_variance = tf.math.log(x.distribution.variance()) log_alpha = log_variance - tf.math.log(tf.square(mean) + - tf.keras.backend.epsilon()) + tf.python.keras.backend.epsilon()) log_alpha = tf.clip_by_value(log_alpha, -8., 8.) # Set magic numbers for cubic polynomial approx. (Molchanov et al., 2017). @@ -133,7 +133,7 @@ def get_config(self): } -class LogNormalKLDivergence(tf.keras.regularizers.Regularizer): +class LogNormalKLDivergence(tf.python.keras.regularizers.Regularizer): """KL divergence regularizer from an input to the log normal distribution.""" def __init__(self, loc=0., scale=1., scale_factor=1.): @@ -163,7 +163,7 @@ def get_config(self): } -class NormalKLDivergence(tf.keras.regularizers.Regularizer): +class NormalKLDivergence(tf.python.keras.regularizers.Regularizer): """KL divergence regularizer from an input to the normal distribution.""" def __init__(self, mean=0., stddev=1., scale_factor=1.): @@ -264,7 +264,7 @@ def get_config(self): } -class NormalKLDivergenceWithTiedMean(tf.keras.regularizers.Regularizer): +class NormalKLDivergenceWithTiedMean(tf.python.keras.regularizers.Regularizer): """KL with normal prior whose mean is fixed at the variational posterior's.""" def __init__(self, stddev=1., scale_factor=1.): @@ -290,12 +290,12 @@ def get_config(self): } -class TrainableNormalKLDivergenceStdDev(tf.keras.layers.Layer): +class TrainableNormalKLDivergenceStdDev(tf.python.keras.layers.Layer): """Normal KL divergence with trainable stddev parameter.""" def __init__(self, mean=0., - stddev_initializer=tf.keras.initializers.TruncatedNormal( + stddev_initializer=tf.python.keras.initializers.TruncatedNormal( mean=0.5413248, stddev=0.1), # mean=softplus_inverse(1.) 
stddev_regularizer=None, stddev_constraint='softplus', @@ -304,7 +304,7 @@ def __init__(self, **kwargs): super(TrainableNormalKLDivergenceStdDev, self).__init__(**kwargs) self.mean = mean - self.stddev_initializer = tf.keras.initializers.get(stddev_initializer) + self.stddev_initializer = tf.python.keras.initializers.get(stddev_initializer) self.stddev_regularizer = get(stddev_regularizer) self.stddev_constraint = constraints.get(stddev_constraint) self.scale_factor = scale_factor @@ -338,7 +338,7 @@ def get_config(self): return { 'loc': self.loc, 'stddev_initializer': - tf.keras.initializers.serialize(self.stddev_initializer), + tf.python.keras.initializers.serialize(self.stddev_initializer), 'stddev_regularizer': serialize(self.stddev_regularizer), 'stddev_constraint': constraints.serialize(self.stddev_constraint), 'scale_factor': self.scale_factor, @@ -346,7 +346,7 @@ def get_config(self): } -class UniformKLDivergence(tf.keras.regularizers.Regularizer): +class UniformKLDivergence(tf.python.keras.regularizers.Regularizer): """KL divergence regularizer from an input to a uniform distribution. This regularizer computes the negative entropy of the input variable, which @@ -371,7 +371,7 @@ def get_config(self): } -# Compatibility aliases, following tf.keras +# Compatibility aliases, following tf.python.keras # pylint: disable=invalid-name cauchy_kl_divergence = CauchyKLDivergence @@ -384,15 +384,15 @@ def get_config(self): uniform_kl_divergence = UniformKLDivergence # pylint: enable=invalid-name -# Utility functions, following tf.keras +# Utility functions, following tf.python.keras def serialize(initializer): - return tf.keras.utils.serialize_keras_object(initializer) + return tf.python.keras.utils.serialize_keras_object(initializer) def deserialize(config, custom_objects=None): - return tf.keras.utils.deserialize_keras_object( + return tf.python.keras.utils.deserialize_keras_object( config, module_objects=globals(), custom_objects=custom_objects, @@ -418,4 +418,4 @@ def get(identifier, value=None): pass elif callable(identifier): return identifier - return tf.keras.regularizers.get(value) + return tf.python.keras.regularizers.get(value) diff --git a/edward2/tensorflow/regularizers_test.py b/edward2/tensorflow/regularizers_test.py index 642fbfdb..412deac6 100644 --- a/edward2/tensorflow/regularizers_test.py +++ b/edward2/tensorflow/regularizers_test.py @@ -179,7 +179,7 @@ def testUniformKLDivergence(self): def testRegularizersGet(self): self.assertIsInstance(ed.regularizers.get('normal_kl_divergence'), ed.regularizers.NormalKLDivergence) - self.assertIsInstance(ed.regularizers.get('l2'), tf.keras.regularizers.L2) + self.assertIsInstance(ed.regularizers.get('l2'), tf.python.keras.regularizers.L2) self.assertIsNone(ed.regularizers.get('')) if __name__ == '__main__': diff --git a/edward2/tensorflow/transformed_random_variable_test.py b/edward2/tensorflow/transformed_random_variable_test.py index 4c51c8b4..01d8e9b2 100644 --- a/edward2/tensorflow/transformed_random_variable_test.py +++ b/edward2/tensorflow/transformed_random_variable_test.py @@ -23,7 +23,7 @@ class TransformedRandomVariableTest(tf.test.TestCase): def testTransformedRandomVariable(self): - class Exp(tf.keras.layers.Layer): + class Exp(tf.python.keras.layers.Layer): """Exponential activation function for reversible networks.""" def __call__(self, inputs, *args, **kwargs): diff --git a/examples/notebooks/Companion.ipynb b/examples/notebooks/Companion.ipynb index 00c7b60d..cb154e10 100644 --- a/examples/notebooks/Companion.ipynb 
+++ b/examples/notebooks/Companion.ipynb @@ -1066,7 +1066,7 @@ " mask_values = tf.reshape(mask_values, [1, len(self.production_rules)])\n", " return mask_values\n", "\n", - "class ProbabilisticGrammar(tf.keras.Model):\n", + "class ProbabilisticGrammar(tf.python.keras.Model):\n", " \"\"\"Deep generative model over productions which follow a grammar.\"\"\"\n", "\n", " def __init__(self, grammar, latent_size, num_units):\n", @@ -1075,7 +1075,7 @@ " self.grammar = grammar\n", " self.latent_size = latent_size\n", " self.lstm = tf.nn.rnn_cell.LSTMCell(num_units)\n", - " self.output_layer = tf.keras.layers.Dense(len(grammar.production_rules))\n", + " self.output_layer = tf.python.keras.layers.Dense(len(grammar.production_rules))\n", "\n", " def call(self, inputs):\n", " \"\"\"Runs the model forward to generate a sequence of productions.\"\"\"\n", @@ -1101,23 +1101,23 @@ " t += 1\n", " return tf.stack(productions, axis=1)\n", "\n", - "class ProbabilisticGrammarVariational(tf.keras.Model):\n", + "class ProbabilisticGrammarVariational(tf.python.keras.Model):\n", " \"\"\"Amortized variational posterior for a probabilistic grammar.\"\"\"\n", "\n", " def __init__(self, latent_size):\n", " \"\"\"Constructs a variational posterior for a probabilistic grammar.\"\"\"\n", " super(ProbabilisticGrammarVariational, self).__init__()\n", " self.latent_size = latent_size\n", - " self.encoder_net = tf.keras.Sequential([\n", - " tf.keras.layers.Conv1D(64, 3, padding=\"SAME\"),\n", - " tf.keras.layers.BatchNormalization(),\n", - " tf.keras.layers.Activation(tf.nn.elu),\n", - " tf.keras.layers.Conv1D(128, 3, padding=\"SAME\"),\n", - " tf.keras.layers.BatchNormalization(),\n", - " tf.keras.layers.Activation(tf.nn.elu),\n", - " tf.keras.layers.Dropout(0.1),\n", - " tf.keras.layers.GlobalAveragePooling1D(),\n", - " tf.keras.layers.Dense(latent_size * 2, activation=None),\n", + " self.encoder_net = tf.python.keras.Sequential([\n", + " tf.python.keras.layers.Conv1D(64, 3, padding=\"SAME\"),\n", + " tf.python.keras.layers.BatchNormalization(),\n", + " tf.python.keras.layers.Activation(tf.nn.elu),\n", + " tf.python.keras.layers.Conv1D(128, 3, padding=\"SAME\"),\n", + " tf.python.keras.layers.BatchNormalization(),\n", + " tf.python.keras.layers.Activation(tf.nn.elu),\n", + " tf.python.keras.layers.Dropout(0.1),\n", + " tf.python.keras.layers.GlobalAveragePooling1D(),\n", + " tf.python.keras.layers.Dense(latent_size * 2, activation=None),\n", " ])\n", "\n", " def call(self, inputs):\n", @@ -1234,17 +1234,17 @@ }, "outputs": [], "source": [ - "class DeepLatentGaussianModel(tf.keras.Model):\n", + "class DeepLatentGaussianModel(tf.python.keras.Model):\n", " \"\"\"Deep generative model.\"\"\"\n", " def __init__(self, latent_size, data_shape, batch_size):\n", " super(DeepLatentGaussianModel, self).__init__()\n", " self.latent_size = latent_size\n", " self.data_shape = data_shape\n", " self.batch_size = batch_size\n", - " self.decoder_net = tf.keras.Sequential([\n", - " tf.keras.layers.Dense(512, activation=tf.nn.relu),\n", - " tf.keras.layers.Dense(np.prod(data_shape), activation=None),\n", - " tf.keras.layers.Reshape(data_shape),\n", + " self.decoder_net = tf.python.keras.Sequential([\n", + " tf.python.keras.layers.Dense(512, activation=tf.nn.relu),\n", + " tf.python.keras.layers.Dense(np.prod(data_shape), activation=None),\n", + " tf.python.keras.layers.Reshape(data_shape),\n", " ])\n", "\n", " def call(self, inputs):\n", @@ -1256,7 +1256,7 @@ " data = ed.Categorical(logits=self.decoder_net(latent_code), name=\"data\")\n", " 
return data\n", "\n", - "class DeepLatentGaussianModelVariational(tf.keras.Model):\n", + "class DeepLatentGaussianModelVariational(tf.python.keras.Model):\n", " \"\"\"Amortized variational posterior.\"\"\"\n", " def __init__(self,\n", " latent_size,\n", @@ -1270,10 +1270,10 @@ " self.num_transitions = num_transitions\n", " self.target_log_prob_fn = target_log_prob_fn\n", " self.step_size = step_size\n", - " self.encoder_net = tf.keras.Sequential([\n", - " tf.keras.layers.Reshape(np.prod(data_shape)),\n", - " tf.keras.layers.Dense(512, activation=tf.nn.relu),\n", - " tf.keras.layers.Dense(latent_size * 2, activation=None),\n", + " self.encoder_net = tf.python.keras.Sequential([\n", + " tf.python.keras.layers.Reshape(np.prod(data_shape)),\n", + " tf.python.keras.layers.Dense(512, activation=tf.nn.relu),\n", + " tf.python.keras.layers.Dense(latent_size * 2, activation=None),\n", " ])\n", " \n", " def call(self, inputs):\n", diff --git a/experimental/attentive_uncertainty/attention.py b/experimental/attentive_uncertainty/attention.py index 3bc5e9a6..e94d9003 100644 --- a/experimental/attentive_uncertainty/attention.py +++ b/experimental/attentive_uncertainty/attention.py @@ -194,7 +194,7 @@ def multihead_attention(projection_nets, return rep -class AttentionLayer(tf.keras.layers.Layer): +class AttentionLayer(tf.python.keras.layers.Layer): """The Attention module.""" def __init__(self, @@ -235,27 +235,27 @@ def build(self, input_shape): self.multihead_nets = [] for h in range(num_heads): - query_net = tf.keras.Sequential( - [tf.keras.layers.InputLayer([None, d_k]), - tf.keras.layers.Conv1D(head_size, 1, + query_net = tf.python.keras.Sequential( + [tf.python.keras.layers.InputLayer([None, d_k]), + tf.python.keras.layers.Conv1D(head_size, 1, kernel_initializer=key_initializer, name='wq%d' % h, use_bias=False, padding='VALID')]) - key_net = tf.keras.Sequential( - [tf.keras.layers.InputLayer([None, d_k]), - tf.keras.layers.Conv1D(head_size, 1, + key_net = tf.python.keras.Sequential( + [tf.python.keras.layers.InputLayer([None, d_k]), + tf.python.keras.layers.Conv1D(head_size, 1, kernel_initializer=key_initializer, name='wk%d' % h, use_bias=False, padding='VALID')]) - value_net = tf.keras.Sequential( - [tf.keras.layers.InputLayer([None, d_v]), - tf.keras.layers.Conv1D(head_size, 1, + value_net = tf.python.keras.Sequential( + [tf.python.keras.layers.InputLayer([None, d_v]), + tf.python.keras.layers.Conv1D(head_size, 1, kernel_initializer=key_initializer, name='wv%d' % h, use_bias=False, padding='VALID')]) - rep_net = tf.keras.Sequential( - [tf.keras.layers.InputLayer([None, head_size]), - tf.keras.layers.Conv1D(d_v, 1, + rep_net = tf.python.keras.Sequential( + [tf.python.keras.layers.InputLayer([None, head_size]), + tf.python.keras.layers.Conv1D(d_v, 1, kernel_initializer=value_initializer, name='wo%d' % h, use_bias=False, padding='VALID')]) diff --git a/experimental/attentive_uncertainty/colabs/2019_09_11_gnp_1d_train.ipynb b/experimental/attentive_uncertainty/colabs/2019_09_11_gnp_1d_train.ipynb index e9ba733e..4f3ca0ca 100644 --- a/experimental/attentive_uncertainty/colabs/2019_09_11_gnp_1d_train.ipynb +++ b/experimental/attentive_uncertainty/colabs/2019_09_11_gnp_1d_train.ipynb @@ -35,7 +35,7 @@ "from experimental.attentive_uncertainty import utils\n # local file import", "\n", "tf.enable_eager_execution()\n", - "tf.keras.backend.clear_session()" + "tf.python.keras.backend.clear_session()" ] }, { @@ -120,7 +120,7 @@ "\n", "def training_loop(model,\n", " dataset_train,\n", - " 
optimizer=tf.keras.optimizers.Adam,\n", + " optimizer=tf.python.keras.optimizers.Adam,\n", " learning_rate=1e-3,\n", " savepath=None):\n", "\n", @@ -171,7 +171,7 @@ "input_dim = 1\n", "output_dim = 1\n", "\n", - "optimizer = tf.keras.optimizers.Adam\n", + "optimizer = tf.python.keras.optimizers.Adam\n", "learning_rate = 1e-4" ] }, @@ -21859,7 +21859,7 @@ " model_type=model_type,\n", " data_uncertainty=data_uncertainty)\n", "\n", - "optimizer = tf.keras.optimizers.Adam\n", + "optimizer = tf.python.keras.optimizers.Adam\n", "learning_rate = 1e-4\n", "training_loop(model,\n", " dataset_train,\n", @@ -22613,7 +22613,7 @@ " model_type=model_type,\n", " data_uncertainty=data_uncertainty)\n", "\n", - "optimizer = tf.keras.optimizers.Adam\n", + "optimizer = tf.python.keras.optimizers.Adam\n", "learning_rate = 1e-4\n", "training_loop(model,\n", " dataset_train,\n", @@ -23260,7 +23260,7 @@ " model_type=model_type,\n", " data_uncertainty=data_uncertainty)\n", "\n", - "optimizer = tf.keras.optimizers.Adam\n", + "optimizer = tf.python.keras.optimizers.Adam\n", "learning_rate = 1e-4\n", "training_loop(model,\n", " dataset_train,\n", @@ -23907,7 +23907,7 @@ " model_type=model_type,\n", " data_uncertainty=data_uncertainty)\n", "\n", - "optimizer = tf.keras.optimizers.Adam\n", + "optimizer = tf.python.keras.optimizers.Adam\n", "learning_rate = 1e-4\n", "training_loop(model,\n", " dataset_train,\n", @@ -24554,7 +24554,7 @@ " model_type=model_type,\n", " data_uncertainty=data_uncertainty)\n", "\n", - "optimizer = tf.keras.optimizers.Adam\n", + "optimizer = tf.python.keras.optimizers.Adam\n", "learning_rate = 1e-4\n", "training_loop(model,\n", " dataset_train,\n", diff --git a/experimental/attentive_uncertainty/generalized_neural_process.py b/experimental/attentive_uncertainty/generalized_neural_process.py index ab236443..df47daea 100644 --- a/experimental/attentive_uncertainty/generalized_neural_process.py +++ b/experimental/attentive_uncertainty/generalized_neural_process.py @@ -24,10 +24,10 @@ import tensorflow.compat.v1 as tf -eps = tf.keras.backend.epsilon() +eps = tf.python.keras.backend.epsilon() -class Regressor(tf.keras.Model): +class Regressor(tf.python.keras.Model): r"""Generalized neural process regressor. 
A generalized neural process (GNP) expresses the following generative process diff --git a/experimental/attentive_uncertainty/layers.py b/experimental/attentive_uncertainty/layers.py index 193b907c..e28e77c4 100644 --- a/experimental/attentive_uncertainty/layers.py +++ b/experimental/attentive_uncertainty/layers.py @@ -19,10 +19,10 @@ import edward2 as ed import tensorflow.compat.v1 as tf -eps = tf.keras.backend.epsilon() +eps = tf.python.keras.backend.epsilon() -class DataNoise(tf.keras.layers.Layer): +class DataNoise(tf.python.keras.layers.Layer): """Creates a variable for modeling homoskedastic noise.""" def build(self, input_shape=None): @@ -36,7 +36,7 @@ def call(self, inputs): return self.untransformed_data_var -class DatasetEncodingLayer(tf.keras.layers.Layer): +class DatasetEncodingLayer(tf.python.keras.layers.Layer): """Encodes a dataset of (x, y) pairs into embeddings via a shared network.""" def __init__(self, @@ -57,7 +57,7 @@ def call(self, x, y): return x_y_encodings -class GlobalLatentLayer(tf.keras.layers.Layer): +class GlobalLatentLayer(tf.python.keras.layers.Layer): """Maps embedded (x, y) points to a single stochastic embedding.""" def __init__(self, net): @@ -71,7 +71,7 @@ def call(self, avg_dataset_encodings): return ed.Normal(loc=mean, scale=std) -class LocalLatentLayer(tf.keras.layers.Layer): +class LocalLatentLayer(tf.python.keras.layers.Layer): """Maps conditioning inputs to a per-point stochastic embedding.""" def __init__(self, net): @@ -91,7 +91,7 @@ def call(self, return ed.Normal(loc=mean, scale=std) -class DecoderLayer(tf.keras.layers.Layer): +class DecoderLayer(tf.python.keras.layers.Layer): """Maps conditioning inputs to a per-point predictive distribution.""" def __init__(self, @@ -126,7 +126,7 @@ def call(self, return ed.Normal(loc=mean, scale=std) -class SNPLocalLatentLayer(tf.keras.layers.Layer): +class SNPLocalLatentLayer(tf.python.keras.layers.Layer): """Maps each datapoint (and global conditioning) to stochastic embedding.""" def __init__(self, diff --git a/experimental/attentive_uncertainty/regressor.py b/experimental/attentive_uncertainty/regressor.py index 672cc2e1..84569980 100644 --- a/experimental/attentive_uncertainty/regressor.py +++ b/experimental/attentive_uncertainty/regressor.py @@ -23,10 +23,10 @@ import tensorflow.compat.v1 as tf -eps = tf.keras.backend.epsilon() +eps = tf.python.keras.backend.epsilon() -class Regressor(tf.keras.Model): +class Regressor(tf.python.keras.Model): r"""Structured neural process regressor. 
A structured neural process (SNP) expresses the following generative process diff --git a/experimental/attentive_uncertainty/utils.py b/experimental/attentive_uncertainty/utils.py index b439b8ba..418ccb14 100644 --- a/experimental/attentive_uncertainty/utils.py +++ b/experimental/attentive_uncertainty/utils.py @@ -64,10 +64,10 @@ def mlp_block(in_dim, hidden_sizes, activation=tf.nn.relu): tensor of shape [B, n, d_out] where d_out = hidden_sizes[-1] """ - net = tf.keras.Sequential([tf.keras.layers.InputLayer(in_dim)]) + net = tf.python.keras.Sequential([tf.python.keras.layers.InputLayer(in_dim)]) for size in hidden_sizes[:-1]: - net.add(tf.keras.layers.Dense(size, activation=activation)) - net.add(tf.keras.layers.Dense(hidden_sizes[-1], activation=None)) + net.add(tf.python.keras.layers.Dense(size, activation=activation)) + net.add(tf.python.keras.layers.Dense(hidden_sizes[-1], activation=None)) return net diff --git a/experimental/auxiliary_sampling/compute_metrics.py b/experimental/auxiliary_sampling/compute_metrics.py index 27617f60..4ace301b 100644 --- a/experimental/auxiliary_sampling/compute_metrics.py +++ b/experimental/auxiliary_sampling/compute_metrics.py @@ -106,7 +106,7 @@ def ensemble_metrics(x, Args: x: numpy array of inputs y: numpy array of labels - model: tf.keras.Model. + model: tf.python.keras.Model. log_likelihood_fn: keras function of log likelihood. For classification tasks, log_likelihood_fn(...)[1] should return the logits n_samples: number of Monte Carlo samples to draw per ensemble member (each diff --git a/experimental/auxiliary_sampling/deterministic_baseline/lenet5.py b/experimental/auxiliary_sampling/deterministic_baseline/lenet5.py index e0db0d72..9f7f6e52 100644 --- a/experimental/auxiliary_sampling/deterministic_baseline/lenet5.py +++ b/experimental/auxiliary_sampling/deterministic_baseline/lenet5.py @@ -21,27 +21,27 @@ def lenet5(input_shape, num_classes): """Builds LeNet5.""" - inputs = tf.keras.layers.Input(shape=input_shape) - conv1 = tf.keras.layers.Conv2D(6, + inputs = tf.python.keras.layers.Input(shape=input_shape) + conv1 = tf.python.keras.layers.Conv2D(6, kernel_size=5, padding='SAME', activation='relu')(inputs) - pool1 = tf.keras.layers.MaxPooling2D(pool_size=[2, 2], + pool1 = tf.python.keras.layers.MaxPooling2D(pool_size=[2, 2], strides=[2, 2], padding='SAME')(conv1) - conv2 = tf.keras.layers.Conv2D(16, + conv2 = tf.python.keras.layers.Conv2D(16, kernel_size=5, padding='SAME', activation='relu')(pool1) - pool2 = tf.keras.layers.MaxPooling2D(pool_size=[2, 2], + pool2 = tf.python.keras.layers.MaxPooling2D(pool_size=[2, 2], strides=[2, 2], padding='SAME')(conv2) - conv3 = tf.keras.layers.Conv2D(120, + conv3 = tf.python.keras.layers.Conv2D(120, kernel_size=5, padding='SAME', activation=tf.nn.relu)(pool2) - flatten = tf.keras.layers.Flatten()(conv3) - dense1 = tf.keras.layers.Dense(84, activation=tf.nn.relu)(flatten) - logits = tf.keras.layers.Dense(num_classes)(dense1) - outputs = tf.keras.layers.Lambda(lambda x: ed.Categorical(logits=x))(logits) - return tf.keras.Model(inputs=inputs, outputs=outputs) + flatten = tf.python.keras.layers.Flatten()(conv3) + dense1 = tf.python.keras.layers.Dense(84, activation=tf.nn.relu)(flatten) + logits = tf.python.keras.layers.Dense(num_classes)(dense1) + outputs = tf.python.keras.layers.Lambda(lambda x: ed.Categorical(logits=x))(logits) + return tf.python.keras.Model(inputs=inputs, outputs=outputs) diff --git a/experimental/auxiliary_sampling/deterministic_baseline/run_det_training.py 
b/experimental/auxiliary_sampling/deterministic_baseline/run_det_training.py index caf2635e..433d8bd9 100644 --- a/experimental/auxiliary_sampling/deterministic_baseline/run_det_training.py +++ b/experimental/auxiliary_sampling/deterministic_baseline/run_det_training.py @@ -90,7 +90,7 @@ def schedule_fn(epoch): rate *= 1e-1 return rate - lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule_fn) + lr_callback = tf.python.keras.callbacks.LearningRateScheduler(schedule_fn) def negative_log_likelihood(y, rv_y): del rv_y # unused arg @@ -107,7 +107,7 @@ def log_likelihood(y_true, y_sample): return model.output.distribution.log_prob(tf.squeeze(y_true)) # pylint: disable=cell-var-from-loop model.compile( - optimizer=tf.keras.optimizers.Adam(lr=FLAGS.learning_rate), + optimizer=tf.python.keras.optimizers.Adam(lr=FLAGS.learning_rate), loss=negative_log_likelihood, metrics=[log_likelihood, accuracy]) member_dir = os.path.join(FLAGS.output_dir, 'member_' + str(i)) @@ -136,8 +136,8 @@ def log_likelihood(y_true, y_sample): ensemble_filenames.append(member_filename) model.save_weights(member_filename) - labels = tf.keras.layers.Input(shape=y_train.shape[1:]) - ll = tf.keras.backend.function([model.input, labels], [ + labels = tf.python.keras.layers.Input(shape=y_train.shape[1:]) + ll = tf.python.keras.backend.function([model.input, labels], [ model.output.distribution.log_prob(tf.squeeze(labels)), model.output.distribution.logits, ]) diff --git a/experimental/auxiliary_sampling/lenet5.py b/experimental/auxiliary_sampling/lenet5.py index ae03bd53..43396330 100644 --- a/experimental/auxiliary_sampling/lenet5.py +++ b/experimental/auxiliary_sampling/lenet5.py @@ -28,7 +28,7 @@ def lenet5(n_examples, input_shape, num_classes): def normalized_kl_fn(q, p, _): return q.kl_divergence(p) / tf.cast(n_examples, tf.float32) - inputs = tf.keras.layers.Input(shape=input_shape) + inputs = tf.python.keras.layers.Input(shape=input_shape) conv1 = tfp.layers.Convolution2DFlipout( 6, kernel_size=5, @@ -40,7 +40,7 @@ def normalized_kl_fn(q, p, _): bias_posterior_fn=q_fn, kernel_divergence_fn=normalized_kl_fn, bias_divergence_fn=normalized_kl_fn)(inputs) - pool1 = tf.keras.layers.MaxPooling2D(pool_size=[2, 2], + pool1 = tf.python.keras.layers.MaxPooling2D(pool_size=[2, 2], strides=[2, 2], padding='SAME')(conv1) conv2 = tfp.layers.Convolution2DFlipout( @@ -54,7 +54,7 @@ def normalized_kl_fn(q, p, _): bias_posterior_fn=q_fn, kernel_divergence_fn=normalized_kl_fn, bias_divergence_fn=normalized_kl_fn)(pool1) - pool2 = tf.keras.layers.MaxPooling2D(pool_size=[2, 2], + pool2 = tf.python.keras.layers.MaxPooling2D(pool_size=[2, 2], strides=[2, 2], padding='SAME')(conv2) conv3 = tfp.layers.Convolution2DFlipout( @@ -68,7 +68,7 @@ def normalized_kl_fn(q, p, _): bias_posterior_fn=q_fn, kernel_divergence_fn=normalized_kl_fn, bias_divergence_fn=normalized_kl_fn)(pool2) - flatten = tf.keras.layers.Flatten()(conv3) + flatten = tf.python.keras.layers.Flatten()(conv3) dense1 = tfp.layers.DenseLocalReparameterization( 84, activation=tf.nn.relu, @@ -86,5 +86,5 @@ def normalized_kl_fn(q, p, _): bias_posterior_fn=q_fn, kernel_divergence_fn=normalized_kl_fn, bias_divergence_fn=normalized_kl_fn)(dense1) - outputs = tf.keras.layers.Lambda(lambda x: ed.Categorical(logits=x))(dense2) - return tf.keras.models.Model(inputs=inputs, outputs=outputs) + outputs = tf.python.keras.layers.Lambda(lambda x: ed.Categorical(logits=x))(dense2) + return tf.python.keras.models.Model(inputs=inputs, outputs=outputs) diff --git 
a/experimental/auxiliary_sampling/res_net.py b/experimental/auxiliary_sampling/res_net.py index 9995217a..3b6bfba3 100644 --- a/experimental/auxiliary_sampling/res_net.py +++ b/experimental/auxiliary_sampling/res_net.py @@ -24,7 +24,7 @@ import tensorflow.compat.v1 as tf import tensorflow_probability as tfp -keras = tf.keras +keras = tf.python.keras def _resnet_layer(inputs, @@ -204,7 +204,7 @@ def res_net(n_examples, model (Model): Keras model instance whose output is a tfp.distributions.Categorical distribution. """ - inputs = tf.keras.layers.Input(shape=input_shape) + inputs = tf.python.keras.layers.Input(shape=input_shape) x = build_resnet_v1( inputs, depth=20, @@ -225,5 +225,5 @@ def normalized_kl_fn(q, p, _): bias_posterior_fn=q_fn, kernel_divergence_fn=normalized_kl_fn, bias_divergence_fn=normalized_kl_fn)(x) - outputs = tf.keras.layers.Lambda(lambda x: ed.Categorical(logits=x))(logits) - return tf.keras.models.Model(inputs=inputs, outputs=outputs) + outputs = tf.python.keras.layers.Lambda(lambda x: ed.Categorical(logits=x))(logits) + return tf.python.keras.models.Model(inputs=inputs, outputs=outputs) diff --git a/experimental/auxiliary_sampling/run_training.py b/experimental/auxiliary_sampling/run_training.py index 277b7841..f627f1df 100644 --- a/experimental/auxiliary_sampling/run_training.py +++ b/experimental/auxiliary_sampling/run_training.py @@ -107,7 +107,7 @@ def main(argv): if not FLAGS.resnet: model = lenet5(n_train, x_train.shape[1:], num_classes) else: - datagen = tf.keras.preprocessing.image.ImageDataGenerator( + datagen = tf.python.keras.preprocessing.image.ImageDataGenerator( rotation_range=90, width_shift_range=0.1, height_shift_range=0.1, @@ -132,7 +132,7 @@ def schedule_fn(epoch): rate *= 1e-1 return float(rate) - lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule_fn) + lr_callback = tf.python.keras.callbacks.LearningRateScheduler(schedule_fn) for l in model.layers: l.kl_cost_weight = l.add_weight( @@ -197,7 +197,7 @@ def fit_fn(model, callbacks=callbacks if with_lr_schedule else [tensorboard]) model.compile( - optimizer=tf.keras.optimizers.Adam(lr=float(FLAGS.learning_rate)), + optimizer=tf.python.keras.optimizers.Adam(lr=float(FLAGS.learning_rate)), loss=negative_log_likelihood, metrics=metrics) session.run(tf1.initialize_all_variables()) @@ -205,8 +205,8 @@ def fit_fn(model, train_epochs = (FLAGS.training_steps * FLAGS.batch_size) // n_train fit_fn(model, FLAGS.training_steps) - labels = tf.keras.layers.Input(shape=y_train.shape[1:]) - ll = tf.keras.backend.function([model.input, labels], [ + labels = tf.python.keras.layers.Input(shape=y_train.shape[1:]) + ll = tf.python.keras.backend.function([model.input, labels], [ model.output.distribution.log_prob(tf.squeeze(labels)), model.output.distribution.logits ]) @@ -259,7 +259,7 @@ def fit_fn(model, for j in range(FLAGS.n_auxiliary_variables): session.run(sample_op) model.compile( - optimizer=tf.keras.optimizers.Adam( + optimizer=tf.python.keras.optimizers.Adam( # The learning rate is proportional to the scale of the prior. lr=float(FLAGS.learning_rate_for_sampling * np.sqrt(1. 
- FLAGS.auxiliary_variance_ratio)**j)), diff --git a/experimental/marginalization_mixup/batchensemble.py b/experimental/marginalization_mixup/batchensemble.py index 7a54e819..977f1f11 100644 --- a/experimental/marginalization_mixup/batchensemble.py +++ b/experimental/marginalization_mixup/batchensemble.py @@ -204,8 +204,8 @@ def main(argv): num_classes = ds_info.features['label'].num_classes if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -234,33 +234,33 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) - diversity_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + diversity_schedule = tf.python.keras.optimizers.schedules.ExponentialDecay( FLAGS.diversity_coeff, FLAGS.diversity_decay_epoch * steps_per_epoch, decay_rate=0.97, staircase=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), - 'train/similarity': tf.keras.metrics.Mean(), - 'train/l2': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), + 'train/similarity': tf.python.keras.metrics.Mean(), + 'train/l2': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/member_accuracy_mean': ( - tf.keras.metrics.SparseCategoricalAccuracy()), + tf.python.keras.metrics.SparseCategoricalAccuracy()), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/member_ece_mean': um.ExpectedCalibrationError( num_bins=FLAGS.num_bins) } for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) metrics['test/ece_member_{}'.format(i)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) @@ -269,14 +269,14 @@ def main(argv): corrupt_diversity = {} if FLAGS.ensemble_size > 1: test_diversity = { - 'test/disagreement': tf.keras.metrics.Mean(), - 'test/average_kl': tf.keras.metrics.Mean(), - 'test/cosine_similarity': tf.keras.metrics.Mean(), + 'test/disagreement': tf.python.keras.metrics.Mean(), + 'test/average_kl': tf.python.keras.metrics.Mean(), + 'test/cosine_similarity': tf.python.keras.metrics.Mean(), } training_diversity = { - 'train/disagreement': tf.keras.metrics.Mean(), - 'train/average_kl': tf.keras.metrics.Mean(), - 'train/cosine_similarity': tf.keras.metrics.Mean(), + 'train/disagreement': 
tf.python.keras.metrics.Mean(), + 'train/average_kl': tf.python.keras.metrics.Mean(), + 'train/cosine_similarity': tf.python.keras.metrics.Mean(), } if FLAGS.corruptions_interval > 0: @@ -285,21 +285,21 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) corrupt_metrics['test/member_acc_mean_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/member_ece_mean_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) corrupt_diversity['corrupt_diversity/average_kl_{}'.format( - dataset_name)] = tf.keras.metrics.Mean() + dataset_name)] = tf.python.keras.metrics.Mean() corrupt_diversity['corrupt_diversity/cosine_similarity_{}'.format( - dataset_name)] = tf.keras.metrics.Mean() + dataset_name)] = tf.python.keras.metrics.Mean() corrupt_diversity['corrupt_diversity/disagreement_{}'.format( - dataset_name)] = tf.keras.metrics.Mean() + dataset_name)] = tf.python.keras.metrics.Mean() checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir) @@ -362,12 +362,12 @@ def pairwise_cosine_distance(x): logits = tf.cast(logits, tf.float32) if FLAGS.mixup_alpha > 0 or FLAGS.label_smoothing > 0 or FLAGS.cutmix: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.categorical_crossentropy(labels, + tf.python.keras.losses.categorical_crossentropy(labels, logits, from_logits=True)) else: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) @@ -456,7 +456,7 @@ def step_fn(inputs): for i in range(FLAGS.ensemble_size): member_probs = per_probs[i] if dataset_name == 'clean': - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( @@ -487,7 +487,7 @@ def step_fn(inputs): probs = tf.reduce_mean(per_probs, axis=0) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, probs)) + tf.python.keras.losses.sparse_categorical_crossentropy(labels, probs)) if dataset_name == 'clean': metrics['test/negative_log_likelihood'].update_state( negative_log_likelihood) diff --git a/experimental/marginalization_mixup/batchensemble_model.py b/experimental/marginalization_mixup/batchensemble_model.py index 61f880f2..f62dee0f 100644 --- a/experimental/marginalization_mixup/batchensemble_model.py +++ b/experimental/marginalization_mixup/batchensemble_model.py @@ -21,7 +21,7 @@ BatchNormalization = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.BatchNormalization, + tf.python.keras.layers.BatchNormalization, epsilon=1e-5, # using epsilon and momentum defaults from Torch momentum=0.9) EnsembleBatchNormalization = functools.partial( # pylint: disable=invalid-name @@ -40,7 
+40,7 @@ def make_sign_initializer(random_sign_init): if random_sign_init > 0: return ed.initializers.RandomSign(random_sign_init) else: - return tf.keras.initializers.RandomNormal(mean=1.0, + return tf.python.keras.initializers.RandomNormal(mean=1.0, stddev=-random_sign_init) @@ -65,28 +65,28 @@ def basic_block(inputs, filters, strides, ensemble_size, if use_ensemble_bn: y = EnsembleBatchNormalization(ensemble_size=ensemble_size)(y) else: - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) - y = tf.keras.layers.Activation('relu')(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2DBatchEnsemble( filters, strides=strides, alpha_initializer=make_sign_initializer(random_sign_init), gamma_initializer=make_sign_initializer(random_sign_init), - kernel_regularizer=tf.keras.regularizers.l2(l2), + kernel_regularizer=tf.python.keras.regularizers.l2(l2), ensemble_size=ensemble_size)(y) if use_ensemble_bn: y = EnsembleBatchNormalization(ensemble_size=ensemble_size)(y) else: - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) - y = tf.keras.layers.Activation('relu')(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2DBatchEnsemble( filters, strides=1, alpha_initializer=make_sign_initializer(random_sign_init), gamma_initializer=make_sign_initializer(random_sign_init), - kernel_regularizer=tf.keras.regularizers.l2(l2), + kernel_regularizer=tf.python.keras.regularizers.l2(l2), ensemble_size=ensemble_size)(y) if not x.shape.is_compatible_with(y.shape): x = Conv2DBatchEnsemble( @@ -95,9 +95,9 @@ def basic_block(inputs, filters, strides, ensemble_size, strides=strides, alpha_initializer=make_sign_initializer(random_sign_init), gamma_initializer=make_sign_initializer(random_sign_init), - kernel_regularizer=tf.keras.regularizers.l2(l2), + kernel_regularizer=tf.python.keras.regularizers.l2(l2), ensemble_size=ensemble_size)(x) - x = tf.keras.layers.add([x, y]) + x = tf.python.keras.layers.add([x, y]) return x @@ -131,18 +131,18 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, use_ensemble_bn: Bool, whether to use ensemble batch norm. Returns: - tf.keras.Model. + tf.python.keras.Model. 
""" if (depth - 4) % 6 != 0: raise ValueError('depth should be 6n+4 (e.g., 16, 22, 28, 40).') num_blocks = (depth - 4) // 6 - inputs = tf.keras.layers.Input(shape=input_shape) + inputs = tf.python.keras.layers.Input(shape=input_shape) x = Conv2DBatchEnsemble( 16, strides=1, alpha_initializer=make_sign_initializer(random_sign_init), gamma_initializer=make_sign_initializer(random_sign_init), - kernel_regularizer=tf.keras.regularizers.l2(l2), + kernel_regularizer=tf.python.keras.regularizers.l2(l2), ensemble_size=ensemble_size)(inputs) for strides, filters in zip([1, 2, 2], [16, 32, 64]): x = group(x, @@ -157,18 +157,18 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, if use_ensemble_bn: x = EnsembleBatchNormalization(ensemble_size=ensemble_size)(x) else: - x = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(pool_size=8)(x) - x = tf.keras.layers.Flatten()(x) + x = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x) + x = tf.python.keras.layers.Flatten()(x) x = ed.layers.DenseBatchEnsemble( num_classes, alpha_initializer=make_sign_initializer(random_sign_init), gamma_initializer=make_sign_initializer(random_sign_init), activation=None, kernel_initializer='he_normal', - kernel_regularizer=tf.keras.regularizers.l2(l2), - bias_regularizer=tf.keras.regularizers.l2(l2), + kernel_regularizer=tf.python.keras.regularizers.l2(l2), + bias_regularizer=tf.python.keras.regularizers.l2(l2), ensemble_size=ensemble_size)(x) - return tf.keras.Model(inputs=inputs, outputs=x) + return tf.python.keras.Model(inputs=inputs, outputs=x) diff --git a/experimental/marginalization_mixup/deterministic.py b/experimental/marginalization_mixup/deterministic.py index 6ae18990..708cafc3 100644 --- a/experimental/marginalization_mixup/deterministic.py +++ b/experimental/marginalization_mixup/deterministic.py @@ -84,18 +84,18 @@ FLAGS = flags.FLAGS BatchNormalization = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.BatchNormalization, + tf.python.keras.layers.BatchNormalization, epsilon=1e-5, # using epsilon and momentum defaults from Torch momentum=0.9) Conv2D = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.Conv2D, + tf.python.keras.layers.Conv2D, kernel_size=3, padding='same', use_bias=False, kernel_initializer='he_normal') -class DistanceMax(tf.keras.layers.Layer): +class DistanceMax(tf.python.keras.layers.Layer): r"""Implements the output layer of model for Distinction Maximization Loss. 
In Distinction Maximization loss, the logits produced by the output layer of @@ -138,29 +138,29 @@ def basic_block(inputs, filters, strides, l2, version): x = inputs y = inputs if version == 2: - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) - y = tf.keras.layers.Activation('relu')(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=strides, - kernel_regularizer=tf.keras.regularizers.l2(l2))(y) - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) - y = tf.keras.layers.Activation('relu')(y) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=1, - kernel_regularizer=tf.keras.regularizers.l2(l2))(y) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(y) if version == 1: - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) if not x.shape.is_compatible_with(y.shape): x = Conv2D(filters, kernel_size=1, strides=strides, - kernel_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.add([x, y]) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.add([x, y]) if version == 1: - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.Activation('relu')(x) return x @@ -195,19 +195,19 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, distance_logits: Bool, whether to use distance logits. Returns: - tf.keras.Model. + tf.python.keras.Model. 
""" if (depth - 4) % 6 != 0: raise ValueError('depth should be 6n+4 (e.g., 16, 22, 28, 40).') num_blocks = (depth - 4) // 6 - inputs = tf.keras.layers.Input(shape=input_shape) + inputs = tf.python.keras.layers.Input(shape=input_shape) x = Conv2D(16, strides=1, - kernel_regularizer=tf.keras.regularizers.l2(l2))(inputs) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(inputs) if version == 1: - x = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.Activation('relu')(x) + x = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.Activation('relu')(x) x = group(x, filters=16 * width_multiplier, strides=1, @@ -227,20 +227,20 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, l2=l2, version=version) if version == 2: - x = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(pool_size=8)(x) - x = tf.keras.layers.Flatten()(x) + x = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x) + x = tf.python.keras.layers.Flatten()(x) if distance_logits: x = DistanceMax(num_classes=num_classes)(x) else: - x = tf.keras.layers.Dense( + x = tf.python.keras.layers.Dense( num_classes, kernel_initializer='he_normal', - kernel_regularizer=tf.keras.regularizers.l2(l2), - bias_regularizer=tf.keras.regularizers.l2(l2))(x) - return tf.keras.Model(inputs=inputs, outputs=x) + kernel_regularizer=tf.python.keras.regularizers.l2(l2), + bias_regularizer=tf.python.keras.regularizers.l2(l2))(x) + return tf.python.keras.Model(inputs=inputs, outputs=x) def one_vs_rest_dm_loss(labels, logits, dm_alpha=1.): @@ -348,8 +348,8 @@ def main(argv): strategy.experimental_distribute_dataset(dataset)) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -376,16 +376,16 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': 
um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } if FLAGS.corruptions_interval > 0: @@ -394,9 +394,9 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) @@ -428,7 +428,7 @@ def step_fn(inputs): if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) l2_loss = sum(model.losses) @@ -465,7 +465,7 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) probs = tf.nn.softmax(logits) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, probs)) + tf.python.keras.losses.sparse_categorical_crossentropy(labels, probs)) if dataset_name == 'clean': metrics['test/negative_log_likelihood'].update_state( diff --git a/experimental/marginalization_mixup/dropout.py b/experimental/marginalization_mixup/dropout.py index 1e290f6b..96722878 100644 --- a/experimental/marginalization_mixup/dropout.py +++ b/experimental/marginalization_mixup/dropout.py @@ -209,8 +209,8 @@ def main(argv): num_classes = ds_info.features['label'].num_classes if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -239,16 +239,16 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } if FLAGS.corruptions_interval > 0: @@ -257,9 +257,9 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) 
corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) @@ -301,12 +301,12 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) if FLAGS.mixup_alpha > 0: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.categorical_crossentropy(labels, + tf.python.keras.losses.categorical_crossentropy(labels, logits, from_logits=True)) else: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) l2_loss = sum(model.losses) @@ -360,7 +360,7 @@ def step_fn(inputs): labels_broadcasted = tf.broadcast_to( labels, [FLAGS.num_dropout_samples, labels.shape[0]]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_broadcasted, logits_list, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[0]) + @@ -387,7 +387,7 @@ def step_fn(inputs): else: strategy.run(step_fn, args=(next(iterator),)) - metrics.update({'test/ms_per_example': tf.keras.metrics.Mean()}) + metrics.update({'test/ms_per_example': tf.python.keras.metrics.Mean()}) train_iterator = iter(train_dataset) forget_counts_history = [] diff --git a/experimental/marginalization_mixup/ensemble_layers.py b/experimental/marginalization_mixup/ensemble_layers.py index 4f852cc1..2ef6359d 100644 --- a/experimental/marginalization_mixup/ensemble_layers.py +++ b/experimental/marginalization_mixup/ensemble_layers.py @@ -19,15 +19,15 @@ import tensorflow as tf -class BatchEnsembleDEConv2D(tf.keras.layers.Layer): +class BatchEnsembleDEConv2D(tf.python.keras.layers.Layer): """A batch ensemble convolutional transpose layer.""" def __init__(self, filters, kernel_size, num_models=4, - alpha_initializer=tf.keras.initializers.Ones(), - gamma_initializer=tf.keras.initializers.Ones(), + alpha_initializer=tf.python.keras.initializers.Ones(), + gamma_initializer=tf.python.keras.initializers.Ones(), strides=(1, 1), padding="valid", data_format="channels_last", @@ -49,8 +49,8 @@ def __init__(self, self.alpha_initializer = alpha_initializer self.gamma_initializer = gamma_initializer self.use_bias = use_bias - self.activation = tf.keras.activations.get(activation) - self.deconv2d = tf.keras.layers.Conv2DTranspose( + self.activation = tf.python.keras.activations.get(activation) + self.deconv2d = tf.python.keras.layers.Conv2DTranspose( filters=filters, kernel_size=kernel_size, strides=strides, @@ -89,7 +89,7 @@ def build(self, input_shape): self.bias = self.add_weight( name="bias", shape=[self.num_models, self.filters], - initializer=tf.keras.initializers.Zeros(), + initializer=tf.python.keras.initializers.Zeros(), trainable=True, dtype=self.dtype) else: diff --git a/experimental/marginalization_mixup/naive_ensembles.py b/experimental/marginalization_mixup/naive_ensembles.py index 080cf694..e81caec1 100644 --- a/experimental/marginalization_mixup/naive_ensembles.py +++ b/experimental/marginalization_mixup/naive_ensembles.py @@ -89,18 +89,18 @@ FLAGS = flags.FLAGS BatchNormalization = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.BatchNormalization, + tf.python.keras.layers.BatchNormalization, epsilon=1e-5, # using epsilon and momentum defaults from Torch momentum=0.9) Conv2D = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.Conv2D, + tf.python.keras.layers.Conv2D, kernel_size=3, 
padding='same', use_bias=False, kernel_initializer='he_normal') -class DistanceMax(tf.keras.layers.Layer): +class DistanceMax(tf.python.keras.layers.Layer): r"""Implements the output layer of model for Distinction Maximization Loss. In Distinction Maximization loss, the logits produced by the output layer of @@ -126,7 +126,7 @@ def call(self, inputs): return -1.0 * distances -class NaiveEnsemble(tf.keras.Model): +class NaiveEnsemble(tf.python.keras.Model): """A keras model wrapper for naive ensembles.""" def __init__(self, models, output_dirs=None): @@ -173,29 +173,29 @@ def basic_block(inputs, filters, strides, l2, version): x = inputs y = inputs if version == 2: - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) - y = tf.keras.layers.Activation('relu')(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=strides, - kernel_regularizer=tf.keras.regularizers.l2(l2))(y) - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) - y = tf.keras.layers.Activation('relu')(y) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=1, - kernel_regularizer=tf.keras.regularizers.l2(l2))(y) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(y) if version == 1: - y = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(y) + y = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(y) if not x.shape.is_compatible_with(y.shape): x = Conv2D(filters, kernel_size=1, strides=strides, - kernel_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.add([x, y]) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.add([x, y]) if version == 1: - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.Activation('relu')(x) return x @@ -230,19 +230,19 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, distance_logits: Bool, whether to use distance logits. Returns: - tf.keras.Model. + tf.python.keras.Model. 
""" if (depth - 4) % 6 != 0: raise ValueError('depth should be 6n+4 (e.g., 16, 22, 28, 40).') num_blocks = (depth - 4) // 6 - inputs = tf.keras.layers.Input(shape=input_shape) + inputs = tf.python.keras.layers.Input(shape=input_shape) x = Conv2D(16, strides=1, - kernel_regularizer=tf.keras.regularizers.l2(l2))(inputs) + kernel_regularizer=tf.python.keras.regularizers.l2(l2))(inputs) if version == 1: - x = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.Activation('relu')(x) + x = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.Activation('relu')(x) x = group(x, filters=16 * width_multiplier, strides=1, @@ -262,20 +262,20 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, l2=l2, version=version) if version == 2: - x = BatchNormalization(beta_regularizer=tf.keras.regularizers.l2(l2), - gamma_regularizer=tf.keras.regularizers.l2(l2))(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(pool_size=8)(x) - x = tf.keras.layers.Flatten()(x) + x = BatchNormalization(beta_regularizer=tf.python.keras.regularizers.l2(l2), + gamma_regularizer=tf.python.keras.regularizers.l2(l2))(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x) + x = tf.python.keras.layers.Flatten()(x) if distance_logits: x = DistanceMax(num_classes=num_classes)(x) else: - x = tf.keras.layers.Dense( + x = tf.python.keras.layers.Dense( num_classes, kernel_initializer='he_normal', - kernel_regularizer=tf.keras.regularizers.l2(l2), - bias_regularizer=tf.keras.regularizers.l2(l2))(x) - return tf.keras.Model(inputs=inputs, outputs=x) + kernel_regularizer=tf.python.keras.regularizers.l2(l2), + bias_regularizer=tf.python.keras.regularizers.l2(l2))(x) + return tf.python.keras.Model(inputs=inputs, outputs=x) def one_vs_rest_dm_loss(labels, logits, dm_alpha=1.): @@ -402,8 +402,8 @@ def main(argv): num_classes = 2 if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -433,19 +433,19 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 
'test/member_accuracy_mean': ( - tf.keras.metrics.SparseCategoricalAccuracy()), + tf.python.keras.metrics.SparseCategoricalAccuracy()), 'test/member_ece_mean': um.ExpectedCalibrationError( num_bins=FLAGS.num_bins) } @@ -454,9 +454,9 @@ def main(argv): corrupt_diversity = {} if FLAGS.ensemble_size > 1: test_diversity = { - 'test/disagreement': tf.keras.metrics.Mean(), - 'test/average_kl': tf.keras.metrics.Mean(), - 'test/cosine_similarity': tf.keras.metrics.Mean(), + 'test/disagreement': tf.python.keras.metrics.Mean(), + 'test/average_kl': tf.python.keras.metrics.Mean(), + 'test/cosine_similarity': tf.python.keras.metrics.Mean(), } if FLAGS.corruptions_interval > 0: @@ -465,21 +465,21 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) corrupt_metrics['test/member_acc_mean_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/member_ece_mean_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) corrupt_diversity['corrupt_diversity/average_kl_{}'.format( - dataset_name)] = tf.keras.metrics.Mean() + dataset_name)] = tf.python.keras.metrics.Mean() corrupt_diversity['corrupt_diversity/cosine_similarity_{}'.format( - dataset_name)] = tf.keras.metrics.Mean() + dataset_name)] = tf.python.keras.metrics.Mean() corrupt_diversity['corrupt_diversity/disagreement_{}'.format( - dataset_name)] = tf.keras.metrics.Mean() + dataset_name)] = tf.python.keras.metrics.Mean() checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir) @@ -525,12 +525,12 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) if FLAGS.mixup_alpha > 0: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.categorical_crossentropy(labels, + tf.python.keras.losses.categorical_crossentropy(labels, logits, from_logits=True)) else: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) l2_loss = sum(model.losses) @@ -587,7 +587,7 @@ def step_fn(inputs): probs = tf.reduce_mean(per_probs, axis=0) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, probs)) + tf.python.keras.losses.sparse_categorical_crossentropy(labels, probs)) tiled_labels = tf.tile(labels, [FLAGS.ensemble_size]) tiled_probs = tf.concat(per_probs, axis=0) if dataset_name == 'clean': diff --git a/experimental/marginalization_mixup/sngp.py b/experimental/marginalization_mixup/sngp.py index 48ac4286..d97a4dd3 100644 --- a/experimental/marginalization_mixup/sngp.py +++ b/experimental/marginalization_mixup/sngp.py @@ -282,8 +282,8 @@ def main(argv): num_classes = ds_info.features['label'].num_classes if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + 
tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -329,18 +329,18 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/stddev': tf.keras.metrics.Mean(), + 'test/stddev': tf.python.keras.metrics.Mean(), } if FLAGS.corruptions_interval > 0: corrupt_metrics = {} @@ -348,13 +348,13 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) corrupt_metrics['test/stddev_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir) @@ -394,12 +394,12 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) if FLAGS.mixup_alpha > 0: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.categorical_crossentropy(labels, + tf.python.keras.losses.categorical_crossentropy(labels, logits, from_logits=True)) else: negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) @@ -463,7 +463,7 @@ def step_fn(inputs): labels_broadcasted = tf.broadcast_to( labels, [FLAGS.num_dropout_samples, labels.shape[0]]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_broadcasted, logits_list, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[0]) + @@ -493,7 +493,7 @@ def step_fn(inputs): else: strategy.run(step_fn, args=(next(iterator),)) - metrics.update({'test/ms_per_example': tf.keras.metrics.Mean()}) + metrics.update({'test/ms_per_example': tf.python.keras.metrics.Mean()}) train_iterator = iter(train_dataset) forget_counts_history = [] diff --git a/experimental/marginalization_mixup/temperature_scaling.py b/experimental/marginalization_mixup/temperature_scaling.py index 1b2e2f1f..9e1a8da3 100644 --- a/experimental/marginalization_mixup/temperature_scaling.py +++ 
b/experimental/marginalization_mixup/temperature_scaling.py @@ -121,15 +121,15 @@ def main(argv): steps_per_eval = ds_info.splits['test'].num_examples // batch_size if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) with strategy.scope(): logging.info('Building Keras model') - model = tf.keras.models.load_model(FLAGS.model_dir) + model = tf.python.keras.models.load_model(FLAGS.model_dir) logging.info('Model input shape: %s', model.input_shape) logging.info('Model output shape: %s', model.output_shape) logging.info('Model number of weights: %s', model.count_params()) @@ -140,11 +140,11 @@ def main(argv): temperature_corrupt_metrics = [] for _ in temperatures: metrics = { - 'val/negative_log_likelihood': tf.keras.metrics.Mean(), - 'val/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'val/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'val/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'val/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } temperature_metrics.append(metrics) @@ -154,9 +154,9 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) temperature_corrupt_metrics.append(corrupt_metrics) @@ -188,7 +188,7 @@ def step_fn(inputs): if FLAGS.ensemble_then_calibrate: probs = tf.nn.softmax(tf.math.log(probs) / temperature) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, probs)) + tf.python.keras.losses.sparse_categorical_crossentropy(labels, probs)) if dataset_name == 'validation': metrics['val/negative_log_likelihood'].update_state( negative_log_likelihood) diff --git a/experimental/mimo/cifar.py b/experimental/mimo/cifar.py index 34b919bc..c34dd0ba 100644 --- a/experimental/mimo/cifar.py +++ b/experimental/mimo/cifar.py @@ -137,8 +137,8 @@ def main(argv): strategy.experimental_distribute_dataset(dataset)) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -163,15 +163,15 @@ def main(argv): FLAGS.lr_decay_ratio, lr_decay_epochs, FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD( + optimizer = tf.python.keras.optimizers.SGD( lr_schedule, 
momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } if FLAGS.corruptions_interval > 0: @@ -180,20 +180,20 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) test_diversity = { - 'test/disagreement': tf.keras.metrics.Mean(), - 'test/average_kl': tf.keras.metrics.Mean(), - 'test/cosine_similarity': tf.keras.metrics.Mean(), + 'test/disagreement': tf.python.keras.metrics.Mean(), + 'test/average_kl': tf.python.keras.metrics.Mean(), + 'test/cosine_similarity': tf.python.keras.metrics.Mean(), } checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) @@ -235,7 +235,7 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) negative_log_likelihood = tf.reduce_mean(tf.reduce_sum( - tf.keras.losses.sparse_categorical_crossentropy( + tf.python.keras.losses.sparse_categorical_crossentropy( labels, logits, from_logits=True), axis=1)) filtered_variables = [] for var in model.trainable_variables: @@ -287,7 +287,7 @@ def step_fn(inputs): for i in range(FLAGS.ensemble_size): member_probs = probs[:, i] - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( @@ -296,7 +296,7 @@ def step_fn(inputs): # Negative log marginal likelihood computed in a numerically-stable way. 
labels_tiled = tf.tile( tf.expand_dims(labels, 1), [1, FLAGS.ensemble_size]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_tiled, logits, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[1]) + @@ -318,7 +318,7 @@ def step_fn(inputs): strategy.run(step_fn, args=(next(iterator),)) - metrics.update({'test/ms_per_example': tf.keras.metrics.Mean()}) + metrics.update({'test/ms_per_example': tf.python.keras.metrics.Mean()}) train_iterator = iter(train_dataset) start_time = time.time() diff --git a/experimental/mimo/cifar_model.py b/experimental/mimo/cifar_model.py index cbded307..34ca351a 100644 --- a/experimental/mimo/cifar_model.py +++ b/experimental/mimo/cifar_model.py @@ -20,11 +20,11 @@ import tensorflow as tf BatchNormalization = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.BatchNormalization, + tf.python.keras.layers.BatchNormalization, epsilon=1e-5, # using epsilon and momentum defaults from Torch momentum=0.9) Conv2D = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.Conv2D, + tf.python.keras.layers.Conv2D, kernel_size=3, padding='same', use_bias=False, @@ -37,15 +37,15 @@ def basic_block(inputs, filters, strides): x = inputs y = inputs y = BatchNormalization()(y) - y = tf.keras.layers.Activation('relu')(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=strides)(y) y = BatchNormalization()(y) - y = tf.keras.layers.Activation('relu')(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=1)(y) if not x.shape.is_compatible_with(y.shape): x = Conv2D(filters, kernel_size=1, strides=strides)(x) - x = tf.keras.layers.add([x, y]) + x = tf.python.keras.layers.add([x, y]) return x @@ -77,17 +77,17 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, ensemble_size: Number of ensemble members. Returns: - tf.keras.Model. + tf.python.keras.Model. 
""" if (depth - 4) % 6 != 0: raise ValueError('depth should be 6n+4 (e.g., 16, 22, 28, 40).') num_blocks = (depth - 4) // 6 input_shape = list(input_shape) - inputs = tf.keras.layers.Input(shape=input_shape) - x = tf.keras.layers.Permute([2, 3, 4, 1])(inputs) + inputs = tf.python.keras.layers.Input(shape=input_shape) + x = tf.python.keras.layers.Permute([2, 3, 4, 1])(inputs) if ensemble_size != input_shape[0]: raise ValueError('the first dimension of input_shape must be ensemble_size') - x = tf.keras.layers.Reshape(input_shape[1:-1] + + x = tf.python.keras.layers.Reshape(input_shape[1:-1] + [input_shape[-1] * ensemble_size])(x) x = Conv2D(16, strides=1)(x) for strides, filters in zip([1, 2, 2], [16, 32, 64]): @@ -98,12 +98,12 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, num_blocks=num_blocks) x = BatchNormalization()(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(pool_size=8)(x) - x = tf.keras.layers.Flatten()(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x) + x = tf.python.keras.layers.Flatten()(x) x = layers.DenseMultihead( num_classes, kernel_initializer='he_normal', activation=None, ensemble_size=ensemble_size)(x) - return tf.keras.Model(inputs=inputs, outputs=x) + return tf.python.keras.Model(inputs=inputs, outputs=x) diff --git a/experimental/mimo/cifar_model_reg_path.py b/experimental/mimo/cifar_model_reg_path.py index 2b299c9b..901214a1 100644 --- a/experimental/mimo/cifar_model_reg_path.py +++ b/experimental/mimo/cifar_model_reg_path.py @@ -22,19 +22,19 @@ BATCHNORM_L2 = 3e-4 BatchNormalization = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.BatchNormalization, + tf.python.keras.layers.BatchNormalization, epsilon=1e-5, # using epsilon and momentum defaults from Torch momentum=0.9, - beta_regularizer=tf.keras.regularizers.l2(BATCHNORM_L2), - gamma_regularizer=tf.keras.regularizers.l2(BATCHNORM_L2)) + beta_regularizer=tf.python.keras.regularizers.l2(BATCHNORM_L2), + gamma_regularizer=tf.python.keras.regularizers.l2(BATCHNORM_L2)) Conv2D = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.Conv2D, + tf.python.keras.layers.Conv2D, kernel_size=3, padding='same', use_bias=False, kernel_initializer='he_normal') -l1_l2 = tf.keras.regularizers.l1_l2 +l1_l2 = tf.python.keras.regularizers.l1_l2 def basic_block(inputs, filters, strides, l2=0., l1=0.): @@ -43,18 +43,18 @@ def basic_block(inputs, filters, strides, l2=0., l1=0.): x = inputs y = inputs y = BatchNormalization()(y) - y = tf.keras.layers.Activation('relu')(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=strides, kernel_regularizer=l1_l2(l1=l1, l2=l2))(y) y = BatchNormalization()(y) - y = tf.keras.layers.Activation('relu')(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2D(filters, strides=1, kernel_regularizer=l1_l2(l1=l1, l2=l2))(y) if not x.shape.is_compatible_with(y.shape): x = Conv2D(filters, kernel_size=1, strides=strides, kernel_regularizer=l1_l2(l1=l1, l2=l2))(x) - x = tf.keras.layers.add([x, y]) + x = tf.python.keras.layers.add([x, y]) return x @@ -89,17 +89,17 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, l1: L1 regularization value. Returns: - tf.keras.Model. + tf.python.keras.Model. 
""" if (depth - 4) % 6 != 0: raise ValueError('depth should be 6n+4 (e.g., 16, 22, 28, 40).') num_blocks = (depth - 4) // 6 input_shape = list(input_shape) - inputs = tf.keras.layers.Input(shape=input_shape) - x = tf.keras.layers.Permute([2, 3, 4, 1])(inputs) + inputs = tf.python.keras.layers.Input(shape=input_shape) + x = tf.python.keras.layers.Permute([2, 3, 4, 1])(inputs) if ensemble_size != input_shape[0]: raise ValueError('the first dimension of input_shape must be ensemble_size') - x = tf.keras.layers.Reshape(input_shape[1:-1] + + x = tf.python.keras.layers.Reshape(input_shape[1:-1] + [input_shape[-1] * ensemble_size])(x) # since the first conv layer and the last dense layer have ensemble_size more # weights, we multiply the regularization coefficients by that amount @@ -117,9 +117,9 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, l1=l1) x = BatchNormalization()(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(pool_size=8)(x) - x = tf.keras.layers.Flatten()(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x) + x = tf.python.keras.layers.Flatten()(x) x = layers.DenseMultihead( num_classes, kernel_initializer='he_normal', @@ -127,4 +127,4 @@ def wide_resnet(input_shape, depth, width_multiplier, num_classes, ensemble_size=ensemble_size, kernel_regularizer=l1_l2(l1=scaled_l1, l2=scaled_l2), bias_regularizer=l1_l2(l1=scaled_l1, l2=scaled_l2))(x) - return tf.keras.Model(inputs=inputs, outputs=x) + return tf.python.keras.Model(inputs=inputs, outputs=x) diff --git a/experimental/mimo/cifar_model_test.py b/experimental/mimo/cifar_model_test.py index 0129e5a3..223a4fd0 100644 --- a/experimental/mimo/cifar_model_test.py +++ b/experimental/mimo/cifar_model_test.py @@ -45,7 +45,7 @@ def testCifarModel(self): ensemble_size=ensemble_size) model.compile( 'adam', - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)) + loss=tf.python.keras.losses.SparseCategoricalCrossentropy(from_logits=True)) history = model.fit(dataset, steps_per_epoch=dataset_size // batch_size, epochs=2) diff --git a/experimental/mimo/cifar_reg_path.py b/experimental/mimo/cifar_reg_path.py index ab0353ab..b8568cd5 100644 --- a/experimental/mimo/cifar_reg_path.py +++ b/experimental/mimo/cifar_reg_path.py @@ -149,8 +149,8 @@ def main(argv): strategy.experimental_distribute_dataset(dataset)) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -176,20 +176,20 @@ def main(argv): FLAGS.lr_decay_ratio, lr_decay_epochs, FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD( + optimizer = tf.python.keras.optimizers.SGD( lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 
'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } for log10_threshold in NNZ_LOG10_THRESHOLDS: - metrics['train/nnz{}'.format(log10_threshold)] = tf.keras.metrics.Mean() + metrics['train/nnz{}'.format(log10_threshold)] = tf.python.keras.metrics.Mean() if FLAGS.corruptions_interval > 0: corrupt_metrics = {} @@ -197,20 +197,20 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) test_diversity = { - 'test/disagreement': tf.keras.metrics.Mean(), - 'test/average_kl': tf.keras.metrics.Mean(), - 'test/cosine_similarity': tf.keras.metrics.Mean(), + 'test/disagreement': tf.python.keras.metrics.Mean(), + 'test/average_kl': tf.python.keras.metrics.Mean(), + 'test/cosine_similarity': tf.python.keras.metrics.Mean(), } checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) @@ -252,7 +252,7 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) negative_log_likelihood = tf.reduce_mean(tf.reduce_sum( - tf.keras.losses.sparse_categorical_crossentropy( + tf.python.keras.losses.sparse_categorical_crossentropy( labels, logits, from_logits=True), axis=1)) regularization = sum(model.losses) @@ -301,7 +301,7 @@ def step_fn(inputs): for i in range(FLAGS.ensemble_size): member_probs = probs[:, i] - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( @@ -310,7 +310,7 @@ def step_fn(inputs): # Negative log marginal likelihood computed in a numerically-stable way. 
labels_tiled = tf.tile( tf.expand_dims(labels, 1), [1, FLAGS.ensemble_size]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_tiled, logits, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[1]) + @@ -332,7 +332,7 @@ def step_fn(inputs): strategy.run(step_fn, args=(next(iterator),)) - metrics.update({'test/ms_per_example': tf.keras.metrics.Mean()}) + metrics.update({'test/ms_per_example': tf.python.keras.metrics.Mean()}) train_iterator = iter(train_dataset) start_time = time.time() diff --git a/experimental/mimo/imagenet.py b/experimental/mimo/imagenet.py index d5a8fa28..5ae180b8 100644 --- a/experimental/mimo/imagenet.py +++ b/experimental/mimo/imagenet.py @@ -106,8 +106,8 @@ def main(argv): test_dataset = strategy.experimental_distribute_dataset(test_dataset) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) with strategy.scope(): logging.info('Building Keras ResNet-50 model') @@ -131,27 +131,27 @@ def main(argv): base_lr, FLAGS.train_epochs, lr_schedule) - optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, + optimizer = tf.python.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), - 'train/loss': tf.keras.metrics.Mean(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), + 'train/loss': tf.python.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) test_diversity = { - 'test/disagreement': tf.keras.metrics.Mean(), - 'test/average_kl': tf.keras.metrics.Mean(), - 'test/cosine_similarity': tf.keras.metrics.Mean(), + 'test/disagreement': tf.python.keras.metrics.Mean(), + 'test/average_kl': tf.python.keras.metrics.Mean(), + 'test/cosine_similarity': tf.python.keras.metrics.Mean(), } logging.info('Finished building Keras ResNet-50 model') @@ -195,7 +195,7 @@ def step_fn(inputs): logits = tf.cast(logits, tf.float32) negative_log_likelihood = tf.reduce_mean(tf.reduce_sum( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True), axis=1)) @@ -246,7 +246,7 @@ def step_fn(inputs): for i in range(FLAGS.ensemble_size): member_probs = probs[:, i] - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = 
tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( @@ -255,7 +255,7 @@ def step_fn(inputs): # Negative log marginal likelihood computed in a numerically-stable way. labels_tiled = tf.tile( tf.expand_dims(labels, 1), [1, FLAGS.ensemble_size]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_tiled, logits, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[1]) + @@ -269,7 +269,7 @@ def step_fn(inputs): strategy.run(step_fn, args=(next(iterator),)) - metrics.update({'test/ms_per_example': tf.keras.metrics.Mean()}) + metrics.update({'test/ms_per_example': tf.python.keras.metrics.Mean()}) train_iterator = iter(train_dataset) start_time = time.time() diff --git a/experimental/mimo/imagenet_model.py b/experimental/mimo/imagenet_model.py index e9ca9ca3..09b55a15 100644 --- a/experimental/mimo/imagenet_model.py +++ b/experimental/mimo/imagenet_model.py @@ -49,19 +49,19 @@ def bottleneck_block(inputs, conv_name_base = 'res' + str(stage) + block + '_branch' bn_name_base = 'bn' + str(stage) + block + '_branch' - x = tf.keras.layers.Conv2D( + x = tf.python.keras.layers.Conv2D( filters1, kernel_size=1, use_bias=False, kernel_initializer='he_normal', name=conv_name_base + '2a')(inputs) - x = tf.keras.layers.BatchNormalization( + x = tf.python.keras.layers.BatchNormalization( momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name=bn_name_base + '2a')(x) - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.Activation('relu')(x) - x = tf.keras.layers.Conv2D( + x = tf.python.keras.layers.Conv2D( filters2, kernel_size=3, strides=strides, @@ -69,39 +69,39 @@ def bottleneck_block(inputs, use_bias=False, kernel_initializer='he_normal', name=conv_name_base + '2b')(x) - x = tf.keras.layers.BatchNormalization( + x = tf.python.keras.layers.BatchNormalization( momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name=bn_name_base + '2b')(x) - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.Activation('relu')(x) - x = tf.keras.layers.Conv2D( + x = tf.python.keras.layers.Conv2D( filters3, kernel_size=1, use_bias=False, kernel_initializer='he_normal', name=conv_name_base + '2c')(x) - x = tf.keras.layers.BatchNormalization( + x = tf.python.keras.layers.BatchNormalization( momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name=bn_name_base + '2c')(x) shortcut = inputs if not x.shape.is_compatible_with(shortcut.shape): - shortcut = tf.keras.layers.Conv2D( + shortcut = tf.python.keras.layers.Conv2D( filters3, kernel_size=1, use_bias=False, strides=strides, kernel_initializer='he_normal', name=conv_name_base + '1')(shortcut) - shortcut = tf.keras.layers.BatchNormalization( + shortcut = tf.python.keras.layers.BatchNormalization( momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name=bn_name_base + '1')(shortcut) - x = tf.keras.layers.add([x, shortcut]) - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.add([x, shortcut]) + x = tf.python.keras.layers.Activation('relu')(x) return x @@ -127,17 +127,17 @@ def resnet50(input_shape, num_classes, ensemble_size, width_multiplier=1): width_multiplier: Multiply the number of filters for wide ResNet. Returns: - tf.keras.Model. + tf.python.keras.Model. 
""" input_shape = list(input_shape) - inputs = tf.keras.layers.Input(shape=input_shape) - x = tf.keras.layers.Permute([2, 3, 4, 1])(inputs) + inputs = tf.python.keras.layers.Input(shape=input_shape) + x = tf.python.keras.layers.Permute([2, 3, 4, 1])(inputs) assert ensemble_size == input_shape[0] - x = tf.keras.layers.Reshape(list(input_shape[1:-1]) + + x = tf.python.keras.layers.Reshape(list(input_shape[1:-1]) + [input_shape[-1] * ensemble_size])( x) - x = tf.keras.layers.ZeroPadding2D(padding=3, name='conv1_pad')(x) - x = tf.keras.layers.Conv2D( + x = tf.python.keras.layers.ZeroPadding2D(padding=3, name='conv1_pad')(x) + x = tf.python.keras.layers.Conv2D( width_multiplier * 64, kernel_size=7, strides=2, @@ -145,12 +145,12 @@ def resnet50(input_shape, num_classes, ensemble_size, width_multiplier=1): use_bias=False, kernel_initializer='he_normal', name='conv1')(x) - x = tf.keras.layers.BatchNormalization( + x = tf.python.keras.layers.BatchNormalization( momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name='bn_conv1')(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.MaxPooling2D(3, strides=2, padding='same')(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.MaxPooling2D(3, strides=2, padding='same')(x) x = group(x, [width_multiplier * 64, width_multiplier * 64, width_multiplier * 256], stage=2, num_blocks=3, strides=1) @@ -163,11 +163,11 @@ def resnet50(input_shape, num_classes, ensemble_size, width_multiplier=1): x = group(x, [width_multiplier * 512, width_multiplier * 512, width_multiplier * 2048], stage=5, num_blocks=3, strides=2) - x = tf.keras.layers.GlobalAveragePooling2D(name='avg_pool')(x) + x = tf.python.keras.layers.GlobalAveragePooling2D(name='avg_pool')(x) x = layers.DenseMultihead( num_classes, activation=None, - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), + kernel_initializer=tf.python.keras.initializers.RandomNormal(stddev=0.01), ensemble_size=ensemble_size, name='fc1000')(x) - return tf.keras.Model(inputs=inputs, outputs=x, name='resnet50') + return tf.python.keras.Model(inputs=inputs, outputs=x, name='resnet50') diff --git a/experimental/mimo/imagenet_model_test.py b/experimental/mimo/imagenet_model_test.py index 4dce6c3f..de8cb7e4 100644 --- a/experimental/mimo/imagenet_model_test.py +++ b/experimental/mimo/imagenet_model_test.py @@ -44,7 +44,7 @@ def testImageNetModel(self): ensemble_size=ensemble_size) model.compile( 'adam', - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)) + loss=tf.python.keras.losses.SparseCategoricalCrossentropy(from_logits=True)) history = model.fit(dataset, steps_per_epoch=dataset_size // batch_size, epochs=2) diff --git a/experimental/mimo/layers.py b/experimental/mimo/layers.py index 888104cb..116cf59b 100644 --- a/experimental/mimo/layers.py +++ b/experimental/mimo/layers.py @@ -19,7 +19,7 @@ # TODO(trandustin): Move into ed.layers. 
-class DenseMultihead(tf.keras.layers.Dense): +class DenseMultihead(tf.python.keras.layers.Dense): """Multiheaded output layer.""" def __init__(self, diff --git a/experimental/rank1_bnns/cifar-refined-vi.py b/experimental/rank1_bnns/cifar-refined-vi.py index 930db3ac..c4526146 100644 --- a/experimental/rank1_bnns/cifar-refined-vi.py +++ b/experimental/rank1_bnns/cifar-refined-vi.py @@ -179,8 +179,8 @@ def main(argv): strategy.experimental_distribute_dataset(dataset)) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -217,41 +217,41 @@ def main(argv): warmup_epochs=FLAGS.lr_warmup_epochs, train_epochs=FLAGS.train_epochs, refining_learning_rate=FLAGS.refining_learning_rate) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/kl': tf.keras.metrics.Mean(), - 'train/kl_scale': tf.keras.metrics.Mean(), - 'train/elbo': tf.keras.metrics.Mean(), - 'train/loss': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/kl': tf.python.keras.metrics.Mean(), + 'train/kl_scale': tf.python.keras.metrics.Mean(), + 'train/elbo': tf.python.keras.metrics.Mean(), + 'train/loss': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/kl': tf.keras.metrics.Mean(), - 'test/elbo': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/kl': tf.python.keras.metrics.Mean(), + 'test/elbo': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } if FLAGS.ensemble_size > 1: for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) if FLAGS.corruptions_interval > 0: corrupt_metrics = {} for intensity in range(1, max_intensity + 1): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/kl_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/elbo_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) @@ 
-294,7 +294,7 @@ def step_fn(inputs): if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) l2_loss = compute_l2_loss(model) @@ -360,7 +360,7 @@ def step_fn(inputs): per_probs = tf.reduce_mean(probs, axis=0) # marginalize samples for i in range(FLAGS.ensemble_size): member_probs = per_probs[i] - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( @@ -370,7 +370,7 @@ def step_fn(inputs): labels_broadcasted = tf.broadcast_to( labels, [FLAGS.num_eval_samples, FLAGS.ensemble_size, labels.shape[0]]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_broadcasted, logits, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[0, 1]) + diff --git a/experimental/rank1_bnns/cifar.py b/experimental/rank1_bnns/cifar.py index 43d12acc..891b3f5b 100644 --- a/experimental/rank1_bnns/cifar.py +++ b/experimental/rank1_bnns/cifar.py @@ -159,8 +159,8 @@ def main(argv): strategy.experimental_distribute_dataset(dataset)) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -195,41 +195,41 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD(lr_schedule, + optimizer = tf.python.keras.optimizers.SGD(lr_schedule, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/kl': tf.keras.metrics.Mean(), - 'train/kl_scale': tf.keras.metrics.Mean(), - 'train/elbo': tf.keras.metrics.Mean(), - 'train/loss': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/kl': tf.python.keras.metrics.Mean(), + 'train/kl_scale': tf.python.keras.metrics.Mean(), + 'train/elbo': tf.python.keras.metrics.Mean(), + 'train/loss': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/kl': tf.keras.metrics.Mean(), - 'test/elbo': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/kl': tf.python.keras.metrics.Mean(), + 'test/elbo': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } if FLAGS.ensemble_size > 1: for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = 
tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) if FLAGS.corruptions_interval > 0: corrupt_metrics = {} for intensity in range(1, max_intensity + 1): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/kl_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/elbo_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) @@ -272,7 +272,7 @@ def step_fn(inputs): if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) l2_loss = compute_l2_loss(model) @@ -338,7 +338,7 @@ def step_fn(inputs): per_probs = tf.reduce_mean(probs, axis=0) # marginalize samples for i in range(FLAGS.ensemble_size): member_probs = per_probs[i] - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( @@ -348,7 +348,7 @@ def step_fn(inputs): labels_broadcasted = tf.broadcast_to( labels, [FLAGS.num_eval_samples, FLAGS.ensemble_size, labels.shape[0]]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_broadcasted, logits, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[0, 1]) + diff --git a/experimental/rank1_bnns/cifar_model.py b/experimental/rank1_bnns/cifar_model.py index 6d9ad57b..105fc185 100644 --- a/experimental/rank1_bnns/cifar_model.py +++ b/experimental/rank1_bnns/cifar_model.py @@ -21,7 +21,7 @@ import tensorflow as tf BatchNormalization = functools.partial( # pylint: disable=invalid-name - tf.keras.layers.BatchNormalization, + tf.python.keras.layers.BatchNormalization, epsilon=1e-5, # using epsilon and momentum defaults from Torch momentum=0.9) Conv2DRank1 = functools.partial( # pylint: disable=invalid-name @@ -74,7 +74,7 @@ def basic_block(inputs, x = inputs y = inputs y = BatchNormalization()(y) - y = tf.keras.layers.Activation('relu')(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2DRank1( filters, strides=strides, @@ -91,7 +91,7 @@ def basic_block(inputs, use_additive_perturbation=use_additive_perturbation, ensemble_size=ensemble_size)(y) y = BatchNormalization()(y) - y = tf.keras.layers.Activation('relu')(y) + y = tf.python.keras.layers.Activation('relu')(y) y = Conv2DRank1( filters, strides=1, @@ -124,7 +124,7 @@ def basic_block(inputs, gamma_regularizer, prior_mean, prior_stddev), use_additive_perturbation=use_additive_perturbation, ensemble_size=ensemble_size)(x) - x = tf.keras.layers.add([x, y]) + x = tf.python.keras.layers.add([x, y]) return x @@ 
-182,12 +182,12 @@ def wide_resnet(input_shape, prior_stddev: Standard deviation of the prior. Returns: - tf.keras.Model. + tf.python.keras.Model. """ if (depth - 4) % 6 != 0: raise ValueError('depth should be 6n+4 (e.g., 16, 22, 28, 40).') num_blocks = (depth - 4) // 6 - inputs = tf.keras.layers.Input(shape=input_shape) + inputs = tf.python.keras.layers.Input(shape=input_shape) x = Conv2DRank1( 16, strides=1, @@ -220,9 +220,9 @@ def wide_resnet(input_shape, prior_stddev=prior_stddev) x = BatchNormalization()(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(pool_size=8)(x) - x = tf.keras.layers.Flatten()(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x) + x = tf.python.keras.layers.Flatten()(x) x = ed.layers.DenseRank1( num_classes, alpha_initializer=utils.make_initializer(alpha_initializer, @@ -239,4 +239,4 @@ def wide_resnet(input_shape, gamma_regularizer, prior_mean, prior_stddev), use_additive_perturbation=use_additive_perturbation, ensemble_size=ensemble_size)(x) - return tf.keras.Model(inputs=inputs, outputs=x) + return tf.python.keras.Model(inputs=inputs, outputs=x) diff --git a/experimental/rank1_bnns/imagenet.py b/experimental/rank1_bnns/imagenet.py index e01c1848..089c77ea 100644 --- a/experimental/rank1_bnns/imagenet.py +++ b/experimental/rank1_bnns/imagenet.py @@ -143,8 +143,8 @@ def main(argv): strategy.experimental_distribute_dataset(dataset)) if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -173,21 +173,21 @@ def main(argv): base_lr, FLAGS.train_epochs, _LR_SCHEDULE) - optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, + optimizer = tf.python.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True) metrics = { - 'train/negative_log_likelihood': tf.keras.metrics.Mean(), - 'train/kl': tf.keras.metrics.Mean(), - 'train/kl_scale': tf.keras.metrics.Mean(), - 'train/elbo': tf.keras.metrics.Mean(), - 'train/loss': tf.keras.metrics.Mean(), - 'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'train/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'train/kl': tf.python.keras.metrics.Mean(), + 'train/kl_scale': tf.python.keras.metrics.Mean(), + 'train/elbo': tf.python.keras.metrics.Mean(), + 'train/loss': tf.python.keras.metrics.Mean(), + 'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), - 'test/negative_log_likelihood': tf.keras.metrics.Mean(), - 'test/kl': tf.keras.metrics.Mean(), - 'test/elbo': tf.keras.metrics.Mean(), - 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), + 'test/negative_log_likelihood': tf.python.keras.metrics.Mean(), + 'test/kl': tf.python.keras.metrics.Mean(), + 'test/elbo': tf.python.keras.metrics.Mean(), + 'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } if FLAGS.corruptions_interval > 0: @@ -196,13 +196,13 @@ def main(argv): for corruption in corruption_types: dataset_name = '{0}_{1}'.format(corruption, intensity) corrupt_metrics['test/nll_{}'.format(dataset_name)] = ( 
- tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/kl_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/elbo_{}'.format(dataset_name)] = ( - tf.keras.metrics.Mean()) + tf.python.keras.metrics.Mean()) corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format(dataset_name)] = ( um.ExpectedCalibrationError(num_bins=FLAGS.num_bins)) @@ -210,18 +210,18 @@ def main(argv): training_diversity = {} if FLAGS.ensemble_size > 1: for i in range(FLAGS.ensemble_size): - metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean() + metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean() metrics['test/accuracy_member_{}'.format(i)] = ( - tf.keras.metrics.SparseCategoricalAccuracy()) + tf.python.keras.metrics.SparseCategoricalAccuracy()) test_diversity = { - 'test/disagreement': tf.keras.metrics.Mean(), - 'test/average_kl': tf.keras.metrics.Mean(), - 'test/cosine_similarity': tf.keras.metrics.Mean(), + 'test/disagreement': tf.python.keras.metrics.Mean(), + 'test/average_kl': tf.python.keras.metrics.Mean(), + 'test/cosine_similarity': tf.python.keras.metrics.Mean(), } training_diversity = { - 'train/disagreement': tf.keras.metrics.Mean(), - 'train/average_kl': tf.keras.metrics.Mean(), - 'train/cosine_similarity': tf.keras.metrics.Mean(), + 'train/disagreement': tf.python.keras.metrics.Mean(), + 'train/average_kl': tf.python.keras.metrics.Mean(), + 'train/cosine_similarity': tf.python.keras.metrics.Mean(), } logging.info('Finished building Keras ResNet-50 model') @@ -273,7 +273,7 @@ def step_fn(inputs): per_probs, FLAGS.ensemble_size) negative_log_likelihood = tf.reduce_mean( - tf.keras.losses.sparse_categorical_crossentropy(labels, + tf.python.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) l2_loss = compute_l2_loss(model) @@ -340,7 +340,7 @@ def step_fn(inputs): labels_broadcasted = tf.broadcast_to( labels, [FLAGS.num_eval_samples, FLAGS.ensemble_size, labels.shape[0]]) - log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy( + log_likelihoods = -tf.python.keras.losses.sparse_categorical_crossentropy( labels_broadcasted, logits, from_logits=True) negative_log_likelihood = tf.reduce_mean( -tf.reduce_logsumexp(log_likelihoods, axis=[0, 1]) + @@ -359,7 +359,7 @@ def step_fn(inputs): test_diversity['test/' + k].update_state(v) for i in range(FLAGS.ensemble_size): member_probs = per_probs[i] - member_loss = tf.keras.losses.sparse_categorical_crossentropy( + member_loss = tf.python.keras.losses.sparse_categorical_crossentropy( labels, member_probs) metrics['test/nll_member_{}'.format(i)].update_state(member_loss) metrics['test/accuracy_member_{}'.format(i)].update_state( diff --git a/experimental/rank1_bnns/imagenet_model.py b/experimental/rank1_bnns/imagenet_model.py index c57caefb..1287dfc0 100644 --- a/experimental/rank1_bnns/imagenet_model.py +++ b/experimental/rank1_bnns/imagenet_model.py @@ -102,7 +102,7 @@ def bottleneck_block(inputs, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name=bn_name_base+'2a') - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.Activation('relu')(x) x = ed.layers.Conv2DRank1( filters2, @@ -131,7 +131,7 @@ def bottleneck_block(inputs, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name=bn_name_base+'2b') - x = 
tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.Activation('relu')(x) x = ed.layers.Conv2DRank1( filters3, @@ -188,8 +188,8 @@ def bottleneck_block(inputs, epsilon=BATCH_NORM_EPSILON, name=bn_name_base+'1') - x = tf.keras.layers.add([x, shortcut]) - x = tf.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.add([x, shortcut]) + x = tf.python.keras.layers.Activation('relu')(x) return x @@ -269,7 +269,7 @@ def rank1_resnet50(input_shape, use_tpu: whether the model runs on TPU. Returns: - tf.keras.Model. + tf.python.keras.Model. """ group_ = functools.partial( group, @@ -283,8 +283,8 @@ def rank1_resnet50(input_shape, dropout_rate=dropout_rate, prior_stddev=prior_stddev, use_tpu=use_tpu) - inputs = tf.keras.layers.Input(shape=input_shape) - x = tf.keras.layers.ZeroPadding2D(padding=3, name='conv1_pad')(inputs) + inputs = tf.python.keras.layers.Input(shape=input_shape) + x = tf.python.keras.layers.ZeroPadding2D(padding=3, name='conv1_pad')(inputs) x = ed.layers.Conv2DRank1( 64, kernel_size=7, @@ -312,13 +312,13 @@ def rank1_resnet50(input_shape, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, name='bn_conv1') - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.MaxPooling2D(3, strides=(2, 2), padding='same')(x) + x = tf.python.keras.layers.Activation('relu')(x) + x = tf.python.keras.layers.MaxPooling2D(3, strides=(2, 2), padding='same')(x) x = group_(x, [64, 64, 256], stage=2, num_blocks=3, strides=1) x = group_(x, [128, 128, 512], stage=3, num_blocks=4, strides=2) x = group_(x, [256, 256, 1024], stage=4, num_blocks=6, strides=2) x = group_(x, [512, 512, 2048], stage=5, num_blocks=3, strides=2) - x = tf.keras.layers.GlobalAveragePooling2D(name='avg_pool')(x) + x = tf.python.keras.layers.GlobalAveragePooling2D(name='avg_pool')(x) x = ed.layers.DenseRank1( num_classes, alpha_initializer=utils.make_initializer(alpha_initializer, @@ -327,7 +327,7 @@ def rank1_resnet50(input_shape, gamma_initializer=utils.make_initializer(gamma_initializer, random_sign_init, dropout_rate), - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), + kernel_initializer=tf.python.keras.initializers.RandomNormal(stddev=0.01), alpha_regularizer=utils.make_regularizer( alpha_regularizer, 1., prior_stddev), gamma_regularizer=utils.make_regularizer( @@ -336,4 +336,4 @@ def rank1_resnet50(input_shape, ensemble_size=ensemble_size, activation=None, name='fc1000')(x) - return tf.keras.Model(inputs=inputs, outputs=x, name='resnet50') + return tf.python.keras.Model(inputs=inputs, outputs=x, name='resnet50') diff --git a/experimental/rank1_bnns/resnet_cifar_main.py b/experimental/rank1_bnns/resnet_cifar_main.py index d91051e6..14017088 100644 --- a/experimental/rank1_bnns/resnet_cifar_main.py +++ b/experimental/rank1_bnns/resnet_cifar_main.py @@ -179,8 +179,8 @@ def main(argv): steps_per_eval = test_dataset_size // batch_size_eval if FLAGS.use_bfloat16: - policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') - tf.keras.mixed_precision.experimental.set_policy(policy) + policy = tf.python.keras.mixed_precision.experimental.Policy('mixed_bfloat16') + tf.python.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) @@ -210,17 +210,17 @@ def main(argv): decay_ratio=FLAGS.lr_decay_ratio, decay_epochs=lr_decay_epochs, warmup_epochs=FLAGS.lr_warmup_epochs) - optimizer = tf.keras.optimizers.SGD( + optimizer = tf.python.keras.optimizers.SGD( lr_schedule, momentum=0.9, 
        nesterov=True)
    metrics = {
-        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
-        'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
-        'train/loss': tf.keras.metrics.Mean(),
+        'train/negative_log_likelihood': tf.python.keras.metrics.Mean(),
+        'train/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(),
+        'train/loss': tf.python.keras.metrics.Mean(),
         'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
-        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
-        'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
+        'test/negative_log_likelihood': tf.python.keras.metrics.Mean(),
+        'test/accuracy': tf.python.keras.metrics.SparseCategoricalAccuracy(),
         'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
-        'test/loss': tf.keras.metrics.Mean(),
+        'test/loss': tf.python.keras.metrics.Mean(),
    }
    if FLAGS.corruptions_interval > 0:
      corrupt_metrics = {}
@@ -228,9 +228,9 @@ def main(argv):
      for corruption in corruption_types:
        dataset_name = '{0}_{1}'.format(corruption, intensity)
        corrupt_metrics['test/nll_{}'.format(dataset_name)] = (
-            tf.keras.metrics.Mean())
+            tf.python.keras.metrics.Mean())
        corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = (
-            tf.keras.metrics.SparseCategoricalAccuracy())
+            tf.python.keras.metrics.SparseCategoricalAccuracy())
        corrupt_metrics['test/ece_{}'.format(dataset_name)] = (
            um.ExpectedCalibrationError(num_bins=FLAGS.num_bins))
@@ -238,18 +238,18 @@ def main(argv):
    training_diversity = {}
    if FLAGS.ensemble_size > 1:
      for i in range(FLAGS.ensemble_size):
-        metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean()
+        metrics['test/nll_member_{}'.format(i)] = tf.python.keras.metrics.Mean()
        metrics['test/accuracy_member_{}'.format(i)] = (
-            tf.keras.metrics.SparseCategoricalAccuracy())
+            tf.python.keras.metrics.SparseCategoricalAccuracy())
      test_diversity = {
-          'test/disagreement': tf.keras.metrics.Mean(),
-          'test/average_kl': tf.keras.metrics.Mean(),
-          'test/cosine_similarity': tf.keras.metrics.Mean(),
+          'test/disagreement': tf.python.keras.metrics.Mean(),
+          'test/average_kl': tf.python.keras.metrics.Mean(),
+          'test/cosine_similarity': tf.python.keras.metrics.Mean(),
      }
      training_diversity = {
-          'train/disagreement': tf.keras.metrics.Mean(),
-          'train/average_kl': tf.keras.metrics.Mean(),
-          'train/cosine_similarity': tf.keras.metrics.Mean(),
+          'train/disagreement': tf.python.keras.metrics.Mean(),
+          'train/average_kl': tf.python.keras.metrics.Mean(),
+          'train/cosine_similarity': tf.python.keras.metrics.Mean(),
      }
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
@@ -310,7 +310,7 @@ def step_fn(inputs):
        probs = tf.reduce_mean(probs, 0)
        negative_log_likelihood = tf.reduce_mean(
-            tf.keras.losses.sparse_categorical_crossentropy(labels, probs))
+            tf.python.keras.losses.sparse_categorical_crossentropy(labels, probs))
        filtered_variables = []
        for var in model.trainable_variables:
@@ -395,7 +395,7 @@ def step_fn(inputs):
        for i in range(FLAGS.ensemble_size):
          member_probs = per_probs[i]
-          member_nll = tf.keras.losses.sparse_categorical_crossentropy(
+          member_nll = tf.python.keras.losses.sparse_categorical_crossentropy(
              labels, member_probs)
          metrics['test/nll_member_{}'.format(i)].update_state(member_nll)
          metrics['test/accuracy_member_{}'.format(i)].update_state(
@@ -404,7 +404,7 @@ def step_fn(inputs):
        probs = tf.reduce_mean(per_probs, axis=0)
        negative_log_likelihood = tf.reduce_mean(
-            tf.keras.losses.sparse_categorical_crossentropy(labels, probs))
+            tf.python.keras.losses.sparse_categorical_crossentropy(labels, probs))
        filtered_variables = []
        for var in model.trainable_variables:
          if 'kernel' in var.name or 'bias' in var.name:
diff --git a/experimental/rank1_bnns/resnet_cifar_model.py b/experimental/rank1_bnns/resnet_cifar_model.py
index c0bc39ec..bac42398 100644
--- a/experimental/rank1_bnns/resnet_cifar_model.py
+++ b/experimental/rank1_bnns/resnet_cifar_model.py
@@ -50,7 +50,7 @@ def rank1_resnet_layer(inputs,
     filters: Number of filters for Conv2D.
     kernel_size: Kernel dimensions for Conv2D.
     strides: Stride dimensinons for Conv2D.
-    activation: tf.keras.activations.Activation.
+    activation: tf.python.keras.activations.Activation.
     alpha_initializer: The initializer for the alpha parameters.
     gamma_initializer: The initializer for the gamma parameters.
     alpha_regularizer: The regularizer for the alpha parameters.
@@ -87,10 +87,10 @@ def rank1_resnet_layer(inputs,
      gamma_regularizer=gamma_regularizer,
      use_additive_perturbation=use_additive_perturbation,
      ensemble_size=ensemble_size)(x)
-  x = tf.keras.layers.BatchNormalization(epsilon=BATCH_NORM_EPSILON,
+  x = tf.python.keras.layers.BatchNormalization(epsilon=BATCH_NORM_EPSILON,
                                          momentum=BATCH_NORM_DECAY)(x)
   if activation is not None:
-    x = tf.keras.layers.Activation(activation)(x)
+    x = tf.python.keras.layers.Activation(activation)(x)
   return x
@@ -129,7 +129,7 @@ def rank1_resnet_v1(input_shape,
     dropout_rate: Dropout rate.
   Returns:
-    tf.keras.Model.
+    tf.python.keras.Model.
   """
   if (depth - 2) % 6 != 0:
     raise ValueError('depth should be 6n+2 (e.g., 20, 32, 44).')
@@ -146,7 +146,7 @@ def rank1_resnet_v1(input_shape,
      ensemble_size=ensemble_size,
      random_sign_init=random_sign_init,
      dropout_rate=dropout_rate)
-  inputs = tf.keras.layers.Input(shape=input_shape)
+  inputs = tf.python.keras.layers.Input(shape=input_shape)
   x = resnet_layer(inputs,
                    filters=filters,
                    kernel_size=3,
@@ -178,13 +178,13 @@ def rank1_resnet_v1(input_shape,
                        kernel_size=1,
                        strides=strides,
                        activation=None)
-      x = tf.keras.layers.add([x, y])
-      x = tf.keras.layers.Activation('relu')(x)
+      x = tf.python.keras.layers.add([x, y])
+      x = tf.python.keras.layers.Activation('relu')(x)
    filters *= 2
  # v1 does not use BN after last shortcut connection-ReLU
-  x = tf.keras.layers.AveragePooling2D(pool_size=8)(x)
-  x = tf.keras.layers.Flatten()(x)
+  x = tf.python.keras.layers.AveragePooling2D(pool_size=8)(x)
+  x = tf.python.keras.layers.Flatten()(x)
   x = ed.layers.DenseRank1(
       num_classes,
       activation=None,
@@ -199,5 +199,5 @@ def rank1_resnet_v1(input_shape,
      gamma_regularizer=gamma_regularizer,
      use_additive_perturbation=use_additive_perturbation,
      ensemble_size=ensemble_size)(x)
-  model = tf.keras.Model(inputs=inputs, outputs=x)
+  model = tf.python.keras.Model(inputs=inputs, outputs=x)
   return model
diff --git a/experimental/rank1_bnns/resnet_cifar_model_test.py b/experimental/rank1_bnns/resnet_cifar_model_test.py
index f7a8e30b..0a6fd189 100644
--- a/experimental/rank1_bnns/resnet_cifar_model_test.py
+++ b/experimental/rank1_bnns/resnet_cifar_model_test.py
@@ -70,7 +70,7 @@ def testRank1ResNetV1(self,
        dropout_rate=0.)
    model.compile(
        'adam',
-        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
+        loss=tf.python.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    history = model.fit(dataset,
                        steps_per_epoch=dataset_size // batch_size,
                        epochs=2)
diff --git a/experimental/rank1_bnns/utils.py b/experimental/rank1_bnns/utils.py
index 1f9ca9d8..c8523a8e 100644
--- a/experimental/rank1_bnns/utils.py
+++ b/experimental/rank1_bnns/utils.py
@@ -24,7 +24,7 @@ def _make_sign_initializer(random_sign_init):
   if random_sign_init > 0:
     return ed.initializers.RandomSign(random_sign_init)
   else:
-    return tf.keras.initializers.RandomNormal(mean=1.0,
+    return tf.python.keras.initializers.RandomNormal(mean=1.0,
                                               stddev=-random_sign_init)
@@ -37,26 +37,26 @@ def make_initializer(initializer, random_sign_init, dropout_rate):
    stddev_init = np.log(np.expm1(np.sqrt(dropout_rate / (1. - dropout_rate))))
    return ed.initializers.TrainableHalfCauchy(
        loc_initializer=_make_sign_initializer(random_sign_init),
-        scale_initializer=tf.keras.initializers.Constant(stddev_init),
+        scale_initializer=tf.python.keras.initializers.Constant(stddev_init),
        scale_constraint='softplus')
  elif initializer == 'trainable_cauchy':
    stddev_init = np.log(np.expm1(np.sqrt(dropout_rate / (1. - dropout_rate))))
    return ed.initializers.TrainableCauchy(
        loc_initializer=_make_sign_initializer(random_sign_init),
-        scale_initializer=tf.keras.initializers.Constant(stddev_init),
+        scale_initializer=tf.python.keras.initializers.Constant(stddev_init),
        scale_constraint='softplus')
  elif initializer == 'trainable_normal':
    stddev_init = np.log(np.expm1(np.sqrt(dropout_rate / (1. - dropout_rate))))
    return ed.initializers.TrainableNormal(
        mean_initializer=_make_sign_initializer(random_sign_init),
-        stddev_initializer=tf.keras.initializers.TruncatedNormal(
+        stddev_initializer=tf.python.keras.initializers.TruncatedNormal(
            mean=stddev_init, stddev=0.1),
        stddev_constraint='softplus')
  elif initializer == 'trainable_log_normal':
    stddev_init = np.log(np.expm1(np.sqrt(dropout_rate / (1. - dropout_rate))))
    return ed.initializers.TrainableLogNormal(
        loc_initializer=_make_sign_initializer(random_sign_init),
-        scale_initializer=tf.keras.initializers.TruncatedNormal(
+        scale_initializer=tf.python.keras.initializers.TruncatedNormal(
            mean=stddev_init, stddev=0.1),
        scale_constraint='softplus')
  elif initializer == 'trainable_normal_fixed_stddev':
@@ -67,7 +67,7 @@ def make_initializer(initializer, random_sign_init, dropout_rate):
    stddev_init = np.log(np.expm1(np.sqrt(dropout_rate / (1. - dropout_rate))))
    return ed.initializers.TrainableNormalSharedStddev(
        mean_initializer=_make_sign_initializer(random_sign_init),
-        stddev_initializer=tf.keras.initializers.Constant(stddev_init),
+        stddev_initializer=tf.python.keras.initializers.Constant(stddev_init),
        stddev_constraint='softplus')
  return initializer
diff --git a/experimental/sngp/gaussian_process_test.py b/experimental/sngp/gaussian_process_test.py
index 9a8f7981..b0e7b36c 100644
--- a/experimental/sngp/gaussian_process_test.py
+++ b/experimental/sngp/gaussian_process_test.py
@@ -173,9 +173,9 @@ def test_state_saving_and_loading(self):
     input_data = np.random.random((1, 2))
     rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)
-    inputs = tf.keras.Input((2,), batch_size=1)
+    inputs = tf.python.keras.Input((2,), batch_size=1)
     outputs = rfgp_model(inputs)
-    model = tf.keras.Model(inputs, outputs)
+    model = tf.python.keras.Model(inputs, outputs)
     gp_output, gp_covmat = model.predict(input_data)
     # Save and then load the model.
@@ -183,7 +183,7 @@ def test_state_saving_and_loading(self):
     self.addCleanup(shutil.rmtree, temp_dir)
     saved_model_dir = os.path.join(temp_dir, 'rfgp_model')
     model.save(saved_model_dir)
-    new_model = tf.keras.models.load_model(saved_model_dir)
+    new_model = tf.python.keras.models.load_model(saved_model_dir)
     gp_output_new, gp_covmat_new = new_model.predict(input_data)
     self.assertAllClose(gp_output, gp_output_new, atol=1e-4)
diff --git a/experimental/sngp/normalization_test.py b/experimental/sngp/normalization_test.py
index 29218b2e..c8b43ed0 100644
--- a/experimental/sngp/normalization_test.py
+++ b/experimental/sngp/normalization_test.py
@@ -29,8 +29,8 @@
 import numpy as np
 import tensorflow as tf
-DenseLayer = tf.keras.layers.Dense(10)
-Conv2DLayer = tf.keras.layers.Conv2D(filters=64, kernel_size=3, padding='valid')
+DenseLayer = tf.python.keras.layers.Dense(10)
+Conv2DLayer = tf.python.keras.layers.Conv2D(filters=64, kernel_size=3, padding='valid')
 def _compute_spectral_norm(weight):