GRU implementation in Theano

Based on the LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py), I changed the LSTM layer code (i.e., the functions lstm_layer() and param_init_lstm()) so that it performs the computations of a GRU. The provided LSTM code trains well, but not the GRU I coded: with the LSTM, accuracy on the training set rises to 1 (training cost = 0), while the GRU plateaus at 0.7 (training cost stays around 0.3).

Below is the code I use for the GRU. I kept the same function names as in the tutorial, so that the code can be copy-pasted directly into it. What could explain the GRU's poor performance?
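For reference, the GRU update I am trying to implement in _step() below is (using the row-vector convention of the code, i.e. tensor.dot(h_, U); the subscripts r, u and h denote the reset gate, the update gate and the hidden state proposal):

$$r_t = \sigma(x_t W_r + h_{t-1} U_r + b_r)$$
$$u_t = \sigma(x_t W_u + h_{t-1} U_u + b_u)$$
$$\tilde{h}_t = \tanh(x_t W_h + (r_t \odot h_{t-1}) U_h + b_h)$$
$$h_t = (1 - u_t) \odot h_{t-1} + u_t \odot \tilde{h}_t$$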

import numpy as np
import theano
import theano.tensor as tensor
from theano import config
# ortho_weight(), _p() and numpy_floatX() are the helper functions defined in the tutorial's lstm.py

def param_init_lstm(options, params, prefix='lstm'): 
    """ 
    GRU 
    """ 
    W = np.concatenate([ortho_weight(options['dim_proj']),   # weight matrix for the input in the reset gate
                        ortho_weight(options['dim_proj']),   # weight matrix for the input in the update gate
                        ortho_weight(options['dim_proj'])],  # weight matrix for the input in the hidden state proposal
                       axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(options['dim_proj']),   # weight matrix for the previous hidden state in the reset gate
                        ortho_weight(options['dim_proj']),   # weight matrix for the previous hidden state in the update gate
                        ortho_weight(options['dim_proj'])],  # weight matrix for the previous hidden state in the hidden state proposal
                       axis=1)
    params[_p(prefix, 'U')] = U

    b = np.zeros((3 * options['dim_proj'],))  # biases for the reset gate, the update gate and the hidden state proposal
    params[_p(prefix, 'b')] = b.astype(config.floatX)  
    return params 


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))  # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))  # update gate

        U_h_t = _slice(tparams[_p(prefix, 'U')], 2, options['dim_proj'])  # recurrent weights of the hidden state proposal
        x_h_t = _slice(x_, 2, options['dim_proj'])                        # input part of the hidden state proposal

        h_t_temp = tensor.tanh(tensor.dot(r * h_, U_h_t) + x_h_t)  # hidden state proposal
        h = (1. - u) * h_ + u * h_t_temp
        h = m_[:, None] * h + (1. - m_)[:, None] * h_  # keep the previous state where the mask is 0

        return h

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0]

Answer

The problem comes from the last line, return rval[0]: it should be return rval instead.
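With that change, the end of the GRU's lstm_layer() becomes:

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval  # rval already holds the hidden states of all time steps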

The LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py) uses return rval[0] because its outputs_info contains 2 elements:

rval, updates = theano.scan(_step,
                            sequences=[mask, state_below],
                            outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                       n_samples,
                                                       dim_proj),
                                          tensor.alloc(numpy_floatX(0.),
                                                       n_samples,
                                                       dim_proj)],
                            name=_p(prefix, '_layers'),
                            n_steps=nsteps)
return rval[0]

In the GRU, outputs_info contains only one element:

outputs_info=[tensor.alloc(numpy_floatX(0.),
                           n_samples,
                           dim_proj)],

Despite the brackets, theano.scan therefore does not return a list of Theano variables representing the scan outputs, but directly a single Theano variable.

rval is then fed to a pooling layer (in this case, a mean pooling layer):

[Figure: the hidden states of all time steps produced by the recurrent layer are averaged by the mean pooling layer before the classifier.]
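For reference, the mean pooling in the tutorial's build_model() is roughly the following (proj is the output of the recurrent layer, of shape (nsteps, n_samples, dim_proj)):

    # average the hidden states over the time dimension, ignoring padded positions
    proj = (proj * mask[:, :, None]).sum(axis=0)
    proj = proj / mask.sum(axis=0)[:, None]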

By taking rval[0] in the GRU code, since rval is a Theano variable and not a list of Theano variables, you remove the part in the red rectangle:

[Figure: with rval[0], only the hidden state of the first time step reaches the mean pooling layer; the red rectangle marks the discarded time steps.]

This means you end up trying to perform the sentence classification using only the first word.
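A minimal sketch (the variable names and shapes are illustrative, not from the tutorial) showing this behaviour of theano.scan: with a single element in outputs_info, rval is one 3-D tensor, so rval[0] indexes the time dimension instead of selecting an output:

import numpy
import theano
import theano.tensor as tensor

x = tensor.tensor3('x')  # (nsteps, n_samples, dim)
h0 = tensor.alloc(numpy.asarray(0., dtype=theano.config.floatX),
                  x.shape[1], x.shape[2])

# a single element in outputs_info -> scan returns one variable, not a list
rval, updates = theano.scan(lambda x_, h_: h_ + x_,
                            sequences=[x],
                            outputs_info=[h0])

f = theano.function([x], [rval, rval[0]])
all_steps, first_step = f(numpy.ones((5, 2, 3), dtype=theano.config.floatX))
print(type(rval))        # TensorVariable, not a list
print(all_steps.shape)   # (5, 2, 3): one hidden state per time step
print(first_step.shape)  # (2, 3): hidden state of the first time step only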


Another GRU implementation, which can be plugged into the LSTM tutorial:

# weight initializer, normal by default
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')


def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
    """
    nin = options['dim_proj']
    dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):

    nsteps = state_below.shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = state_below.shape[0]

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # utility function to slice a tensor
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # state_below is the input word embeddings
    # input to the gates, concatenated
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]
    # input to compute the hidden state proposal
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]

    # step function to be used by scan
    # arguments  | sequences | outputs-info | non-seqs
    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        # reset and update gates
        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        # compute the hidden state proposal
        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        # hidden state proposal
        h = tensor.tanh(preactx)

        # leaky integrate and obtain next hidden state
        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    # prepare scan arguments
    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice
    shared_vars = [tparams[_p(prefix, 'U')],
                   tparams[_p(prefix, 'Ux')]]

    init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[init_state],
                                non_sequences=shared_vars,
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                strict=True)
    return rval

As a side note, Keras works around this theano.scan inconsistency as follows:

results, _ = theano.scan(
    _step, 
    sequences=inputs, 
    outputs_info=[None] + initial_states, 
    go_backwards=go_backwards) 

# deal with Theano API inconsistency 
if type(results) is list: 
    outputs = results[0] 
    states = results[1:] 
else: 
    outputs = results 
    states = []
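
The same kind of guard could be added at the end of the tutorial's lstm_layer(), so that the function works for both the LSTM (two outputs) and the GRU (a single output). A minimal sketch of the return part only:

    # theano.scan returns a list only when outputs_info has several elements
    if isinstance(rval, (list, tuple)):
        return rval[0]  # LSTM: the first output is the sequence of hidden states
    return rval         # GRU: rval is already the sequence of hidden states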