How to improve the execution time of gradient estimation using tf.GradientTape in TensorFlow 2.x

The objective is to implement https://web.casadi.org/blog/tensorflow/, which was originally written with TensorFlow 1.x and GPflow 1.x, using TensorFlow 2.x and GPflow 2.x.

I have implemented it with TensorFlow 2.x and GPflow 2.x (the full code is attached at the end of the question). However, it is still not as fast as the original TensorFlow 1.x / GPflow 1.x implementation.

Here are the execution times.

Using TensorFlow 1.x and GPflow 1.x:

  EXIT: Optimal Solution Found.
          solver  :   t_proc      (avg)   t_wall      (avg)    n_eval
           nlp_f  |  50.62ms (  1.63ms)  15.51ms (500.32us)        31
           nlp_g  |   4.16ms (134.16us)   4.15ms (133.90us)        31
      nlp_grad_f  | 216.76ms (  6.77ms) 153.84ms (  4.81ms)        32
       nlp_jac_g  |  21.29ms (665.25us)  21.46ms (670.63us)        32
           total  | 332.16ms (332.16ms) 233.83ms (233.83ms)         1
    Ncalls 63
    Total time [s] 0.08702826499938965

Using TensorFlow 2.x and GPflow 2.x:

 EXIT: Optimal Solution Found.
        solver  :   t_proc      (avg)   t_wall      (avg)    n_eval
         nlp_f  |  14.11ms (  1.76ms)   5.78ms (722.38us)         8
         nlp_g  |   1.03ms (129.00us)   1.05ms (130.77us)         8
    nlp_grad_f  |   1.23 s (136.79ms)   1.21 s (134.41ms)         9
     nlp_jac_g  |   6.03ms (670.00us)   6.03ms (670.34us)         9
         total  |   1.26 s (  1.26 s)   1.23 s (  1.23 s)         1
  Ncalls 17
  Total time [s] 0.8965554237365723

According to these timings, the bottleneck is the gradient evaluation nlp_grad_f. TensorFlow 1.x builds the gradient into the static graph, which seems to be considerably faster than computing it with tf.GradientTape in TensorFlow 2.x.

My question is as follows: what is the proper way to implement the gradient estimation in TensorFlow 2.x, and how can this code be improved further? I have read https://www.tensorflow.org/api_docs/python/tf/function and applied what I could to make the function faster. I would like to bring the execution time down to roughly what I get with TensorFlow 1.x.

    @tf.function
    def f_k(input_dat, get_grad_val=None):
        xf_tensor = input_dat
        if(get_grad_val is not None):
            # Reverse pass: gradient of the GP mean w.r.t. the input,
            # weighted by the adjoint seed supplied by CasADi.
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(xf_tensor)
                mean, _ = model.predict_y(xf_tensor)
            grad_mean = tape.gradient(mean, xf_tensor, output_gradients=get_grad_val)
        else:
            # Forward pass only: the GP posterior mean.
            mean, _ = model.predict_y(xf_tensor)
            grad_mean = None
        return mean, grad_mean
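As a possible variation, f_k could be given an explicit input_signature. This is only a sketch, not a verified fix: it assumes the callback always passes a single [1, nd] float64 row and, for the reverse pass, a [1, 1] float64 adjoint seed (matching opts["output_dim"]). Pinning the signature keeps tf.function from retracing as long as those shapes stay fixed; since input_signature cannot be combined with the optional get_grad_val=None argument, the forward and reverse paths are split into two functions here:

    # Sketch only: fixed input signatures so each tf.function is traced once per shape.
    x_spec = tf.TensorSpec(shape=[1, nd], dtype=tf.float64)     # model input row
    seed_spec = tf.TensorSpec(shape=[1, 1], dtype=tf.float64)   # adjoint seed, same shape as the mean

    @tf.function(input_signature=[x_spec])
    def f_mean(xf_tensor):
        # Forward pass only: posterior mean of the GP at xf_tensor.
        mean, _ = model.predict_y(xf_tensor)
        return mean

    @tf.function(input_signature=[x_spec, seed_spec])
    def f_grad(xf_tensor, adj_seed):
        # Reverse-mode gradient of the mean w.r.t. the input, weighted by the adjoint seed.
        with tf.GradientTape() as tape:
            tape.watch(xf_tensor)
            mean, _ = model.predict_y(xf_tensor)
        return tape.gradient(mean, xf_tensor, output_gradients=adj_seed)

Using this split would also require eval() in TensorFlowEvaluator (below) to call f_mean or f_grad depending on whether an adjoint seed is present, and the arguments could be passed as tf.constant(arg[i].toarray()) instead of being wrapped in a fresh tf.Variable on every call.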

For reference, I include the full code below, in case someone wants to run it.

ocp.py

from casadi import *
T = 10. # Time horizon
N = 20 # number of control intervals

# Declare model variables
x1 = MX.sym('x1')
x2 = MX.sym('x2')
x = vertcat(x1, x2)
u = MX.sym('u')

# Model equations
xdot = vertcat((1-x2**2)*x1 - x2 + u, x1)


# Formulate discrete time dynamics
if False:
   # CVODES from the SUNDIALS suite
   dae = {'x':x, 'p':u, 'ode':xdot}
   opts = {'tf':T/N}
   F = integrator('F', 'cvodes', dae, opts)
else:
   # Fixed step Runge-Kutta 4 integrator
   M = 4 # RK4 steps per interval
   DT = T/N/M
   f = Function('f', [x, u], [xdot])
   X0 = MX.sym('X0', 2)
   U = MX.sym('U')
   X = X0
   Q = 0
   for j in range(M):
       k1 = f(X, U)
       k2 = f(X + DT/2 * k1, U)
       k3 = f(X + DT/2 * k2, U)
       k4 = f(X + DT * k3, U)
       X=X+DT/6*(k1 +2*k2 +2*k3 +k4)
   F = Function('F', [X0, U], [X],['x0','p'],['xf'])

# Start with an empty NLP
w=[]
w0 = []
lbw = []
ubw = []
g=[]
lbg = []
ubg = []

# "Lift" initial conditions
Xk = MX.sym('X0', 2)
w += [Xk]
lbw += [0, 1]
ubw += [0, 1]
w0 += [0, 1]

# Formulate the NLP
for k in range(N):
    # New NLP variable for the control
    Uk = MX.sym('U_' + str(k))
    w   += [Uk]
    lbw += [-1]
    ubw += [1]
    w0  += [0]

    # Integrate till the end of the interval
    Fk = F(x0=Xk, p=Uk)
    Xk_end = Fk['xf']

    # New NLP variable for state at end of interval
    Xk = MX.sym('X_' + str(k+1), 2)
    w   += [Xk]
    lbw += [-0.25, -inf]
    ubw += [  inf,  inf]
    w0  += [0, 0]

    # Add equality constraint
    g   += [Xk_end-Xk]
    lbg += [0, 0]
    ubg += [0, 0]

nd = N+1

import gpflow
import time
import tensorflow as tf 
from tensorflow_casadi_mod_ocp import TensorFlowEvaluator

class GPR(TensorFlowEvaluator):
  def __init__(self, model, opts={}):
    initializer = tf.random_normal_initializer(mean=1., stddev=2.)
    X = tf.Variable(initializer(shape=[1,nd], dtype=tf.float64), trainable=False)

    @tf.function
    def f_k(input_dat, get_grad_val=None):
        xf_tensor = input_dat
        if(get_grad_val is not None):
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(xf_tensor)
                mean, _ = model.predict_y(xf_tensor)
            grad_mean = tape.gradient(mean, xf_tensor, output_gradients=get_grad_val)
        else:
            mean, _ = model.predict_y(xf_tensor)
            grad_mean = None
        return mean, grad_mean
    
    TensorFlowEvaluator.__init__(self, [X], [f_k], model, opts=opts)
    self.counter = 0
    self.time = 0

  def eval(self,arg):
    self.counter += 1
    t0 = time.time()
    ret = TensorFlowEvaluator.eval(self, arg)
    self.time += time.time()-t0
    return [ret]



# Create random training data for the GP model
np.random.seed(0)
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))

model = gpflow.models.GPR(data=(data, value), kernel=gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
optimizer = gpflow.optimizers.Scipy()
optimizer.minimize(
    model.training_loss,
    variables = model.trainable_variables,
    options=dict(disp=True, maxiter=100),
)

import tensorflow as tf

opts = {}
opts["output_dim"] = [1, 1]
opts["grad_dim"] = [1, nd]
GPR = GPR(model, opts=opts)
w = vertcat(*w)

# Create an NLP solver
prob = {'f': GPR(w[0::3]), 'x': w, 'g': vertcat(*g)}
options = {"ipopt": {"hessian_approximation": "limited-memory"}}
solver = nlpsol('solver', 'ipopt', prob, options)
sol = solver(x0=w0, lbx=lbw, ubx=ubw, lbg=lbg, ubg=ubg)

print("Ncalls",GPR.counter)
print("Total time [s]",GPR.time)
w_opt = sol['x'].full().flatten()

# Plot the solution
x1_opt = w_opt[0::3]
x2_opt = w_opt[1::3]
u_opt = w_opt[2::3]

tgrid = [T/N*k for k in range(N+1)]
import matplotlib.pyplot as plt
plt.figure(1)
plt.clf()
plt.plot(tgrid, x1_opt, '--')
plt.plot(tgrid, x2_opt, '-')
plt.step(tgrid, vertcat(DM.nan(1), u_opt), '-.')
plt.xlabel('t')
plt.legend(['x1','x2','u'])
plt.grid()
plt.show()

tensorflow_casadi_mod_ocp.py

import casadi as ca
import tensorflow as tf
from casadi import Sparsity
import gpflow 
import numpy as np

class TensorFlowEvaluator(ca.Callback):
  def __init__(self, t_in, t_out, model, set_init=False, opts={}):
  
    self.set_init = set_init
    self.opts = opts
    ca.Callback.__init__(self)
    assert isinstance(t_in,list)
    self.t_in = t_in
    assert isinstance(t_out, list)
    self.t_out = t_out
    self.output_shapes = []
    self.construct("TensorFlowEvaluator", {})
    self.refs = []
    self.model = model
    

  def get_n_in(self): return len(self.t_in)

  def get_n_out(self): return len(self.t_out)

  def get_sparsity_in(self, i):
      tensor_shape = self.t_in[i].shape
      return Sparsity.dense(tensor_shape[0], tensor_shape[1])

  def get_sparsity_out(self, i):
      if(i == 0 and self.set_init is False):
        tensor_shape = [self.opts["output_dim"][0], self.opts["output_dim"][1]]
      elif (i == 0 and self.set_init is True):
        tensor_shape = [self.opts["grad_dim"][0], self.opts["grad_dim"][1]]
      else:
         tensor_shape = [self.opts["output_dim"][0], self.opts["output_dim"][1]]
      return Sparsity.dense(tensor_shape[0], tensor_shape[1])

  def eval(self, arg):
    updated_t = []
    for i,v in enumerate(self.t_in):
        updated_t.append(tf.Variable(arg[i].toarray()))
    if(len(updated_t) == 1):
      out_, grad_estimate = self.t_out[0](tf.convert_to_tensor(updated_t[0]))
    else:
      out_, grad_estimate = self.t_out[0](tf.convert_to_tensor(updated_t[0]), tf.convert_to_tensor(updated_t[1]))

    if(len(updated_t) == 1):
          selected_set =  out_.numpy() 
    else:
          selected_set = grad_estimate.numpy()
    return [selected_set]

  # Vanilla tensorflow offers just the reverse mode AD
  def has_reverse(self,nadj): return nadj==1
  
  def get_reverse(self, nadj, name, inames, onames, opts):
    initializer = tf.random_normal_initializer(mean=1., stddev=2.)
    adj_seed = [ tf.Variable(initializer(shape=self.sparsity_out(i).shape, dtype=tf.float64)) for i in range(self.n_out())]

    callback = TensorFlowEvaluator(self.t_in + adj_seed, [self.t_out[0]], self.model, set_init=True, opts=self.opts)
    self.refs.append(callback)

    nominal_in = self.mx_in()
    nominal_out = self.mx_out()
    adj_seed = self.mx_out()
    casadi_bal = callback.call(nominal_in + adj_seed)
    return ca.Function(name, nominal_in+nominal_out+adj_seed, casadi_bal, inames, onames)
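To separate time spent inside TensorFlow from the CasADi callback overhead, the gradient evaluation can also be timed on its own, outside the solver. This is a minimal sketch, reusing model and nd from ocp.py above; the Python print fires only while tf.function is tracing, so seeing it more than once across repeated calls would indicate retracing overhead:

    import timeit
    import numpy as np
    import tensorflow as tf

    x = tf.constant(np.random.rand(1, nd))   # same shape the callback passes in
    seed = tf.constant(np.ones((1, 1)))      # adjoint seed matching the mean output

    @tf.function
    def grad_only(xf_tensor, adj_seed):
        print("tracing grad_only")           # Python side effect: runs only during tracing
        with tf.GradientTape() as tape:
            tape.watch(xf_tensor)
            mean, _ = model.predict_y(xf_tensor)
        return tape.gradient(mean, xf_tensor, output_gradients=adj_seed)

    grad_only(x, seed)                        # warm-up call so tracing is not timed
    print(timeit.timeit(lambda: grad_only(x, seed), number=100) / 100)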

Tags: python, tensorflow, optimization, tensorflow2.0, casadi
