基于官方Theano教程(http://deeplearning.net/tutorial/code/lstm.py)中提供的LSTM代码,我改变了LSTM层代码(即函数lstm_layer()
和param_init_lstm()
)以执行GRU。提供的LSTM代码训练良好,但不是GRU I编码:LSTM训练集的准确度上升到1(训练成本= 0),而GRU停滞在0.7(训练成本= 0) 0.3)。Theano中的GRU实现
下面是我用于GRU的代码。我保留了与教程中相同的函数名称,以便可以将代码直接复制粘贴到其中。什么能解释GRU的糟糕表现?
import numpy as np
def param_init_lstm(options, params, prefix='lstm'):
"""
GRU
"""
W = np.concatenate([ortho_weight(options['dim_proj']), # Weight matrix for the input in the reset gate
ortho_weight(options['dim_proj']),
ortho_weight(options['dim_proj'])], # Weight matrix for the input in the update gate
axis=1)
params[_p(prefix, 'W')] = W
U = np.concatenate([ortho_weight(options['dim_proj']), # Weight matrix for the previous hidden state in the reset gate
ortho_weight(options['dim_proj']),
ortho_weight(options['dim_proj'])], # Weight matrix for the previous hidden state in the update gate
axis=1)
params[_p(prefix, 'U')] = U
b = np.zeros((3 * options['dim_proj'],)) # Biases for the reset gate and the update gate
params[_p(prefix, 'b')] = b.astype(config.floatX)
return params
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]
def _step(m_, x_, h_):
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
preact += x_
r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate
U_h_t = _slice(tparams[_p(prefix, 'U')], 2, options['dim_proj'])
x_h_t = _slice(x_, 2, options['dim_proj'])
h_t_temp = tensor.tanh(tensor.dot(r*h_, U_h_t) + x_h_t)
h = (1. - u) * h_ + u * h_t_temp
h = m_[:,None] * h + (1. - m_)[:,None] * h_
return h
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
tparams[_p(prefix, 'b')])
dim_proj = options['dim_proj']
rval, updates = theano.scan(_step,
sequences=[mask, state_below],
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
name=_p(prefix, '_layers'),
n_steps=nsteps)
return rval[0]