Lasagne LSTM regression produces zero output

Inspired by Andrej Karpathy's blog, I wanted to build my own version of a recurrent neural network, one that chooses the next word rather than the next character. Since the number of distinct words in the text is very large, I use word2vec to represent each word as a vector (similar words lie closer together in the vector space). The network should now be trained to predict the new vector from the preceding ones.

- One important note: where Karpathy uses a classifier, I am trying a regression approach (squared-error loss).

My problem is that, no matter how much I train, my network predicts the output [0, 0, 0, ..., 0]. So my guess is that something is wrong with my training or prediction method (the average error does drop a bit during training, so some learning must be taking place).

Below is my entire code in case anyone wants to run it (it uses the Brown corpus, so nltk must be installed for it to work).

This is my "Hello World" project in Lasagne, so any pointers are appreciated if I am doing something silly. Thanks in advance :)

from gensim.models import Word2Vec 
import gensim 
import sys 
from datetime import timedelta 
import matplotlib.pyplot as plt 
from nltk.corpus import brown 
import theano.tensor as T 
import theano 
import time 
import numpy as np 
from lasagne import layers 
import lasagne 
from lasagne.updates import nesterov_momentum 
from sklearn.preprocessing import MinMaxScaler 
from sklearn.manifold import TSNE 


def modelExcept(input, model, size):
    # look up the word vector; fall back to a zero vector for out-of-vocabulary words
    try:
        out = model[input]
        return out
    except Exception:
        out = np.zeros((size))
        print 'exception ' + str(input)
        return out

def plot_TSNE(model,nr_words=None):
    tsne = TSNE(n_components=2)
    if nr_words == None:
        X_tsne = tsne.fit_transform(model[model.wv.vocab][:])
    else:
        X_tsne = tsne.fit_transform(model[model.wv.vocab][0:nr_words])

    X_names = [key for key in model.wv.vocab]
    plt.figure()
    ax = plt.subplot(111)
    for i in range(X_tsne.shape[0]):
        plt.text(X_tsne[i, 0], X_tsne[i, 1], str(X_names[i]),
                 #color=plt.cm.Set1(y[i]/10.),
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([]), plt.yticks([])
    plt.draw()
    #plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
    #plt.show()

def getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE):
    # sample BATCHSIZE random windows: the input is windowSize consecutive word
    # vectors, the target is the vector of the word that follows the window
    BatchIndexes = np.random.randint(0,totalwords-windowSize, size=BATCHSIZE)
    input = np.empty((BATCHSIZE,windowSize,wordSize),dtype=np.float32)
    target = np.empty((BATCHSIZE,wordSize),dtype=np.float32)
    for i in range(BATCHSIZE):
        k = BatchIndexes[i]
        input[i,:,:] = words_as_vecs[k:k+windowSize,:]
        target[i,:] = words_as_vecs[k+windowSize,:]

    return input, target


wordSize = 30 
windowSize = 5 
BATCHSIZE = 128 
LEARNING_RATE = .1 
Nr_EPOCHS = 100 
NR_Predictions = 15 


model_raw = Word2Vec(brown.sents(),workers=4,window=10,iter=15,size=wordSize, min_count=10) 
#plot_TSNE(model_raw,None) 
model = model_raw.wv #trim model after training to save RAM 
del model_raw 

words_filtered = filter(lambda x: x in model.vocab, brown.words())#filter away words that are not in vocabulary 
words_as_vecs = np.asarray([modelExcept(word, model,wordSize) for word in words_filtered],dtype = np.float32) #create all vector representations beforehand to save time!! 
scaler = MinMaxScaler(feature_range=(0,1)) 
words_as_vecs = scaler.fit_transform(words_as_vecs) 

print 'creating neural net...' 

Num_units_per_layer = 512 
GRAD_CLIP = 100 
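# network: input -> LSTM -> dropout -> LSTM (only the final time step returned) -> dropout -> dense layer back to the word-vector dimension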
l_in = lasagne.layers.InputLayer(shape=(None,None,wordSize)) 
l_LSTM1 = lasagne.layers.LSTMLayer(l_in,Num_units_per_layer,grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.rectify) 
l_drop1 = lasagne.layers.DropoutLayer(l_LSTM1,p=0.5) 
l_LSTM2 = lasagne.layers.LSTMLayer(l_drop1,Num_units_per_layer,grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.rectify, only_return_final=True) 
l_drop2 = lasagne.layers.DropoutLayer(l_LSTM2,p=0.5) 
l_shp = lasagne.layers.ReshapeLayer(l_drop2,(-1,Num_units_per_layer)) 
l_out = lasagne.layers.DenseLayer(l_shp,num_units=wordSize,W=lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.rectify) 

target_vals = T.imatrix('target values') 
net_out = lasagne.layers.get_output(l_out) 
net_out_predict = lasagne.layers.get_output(l_out,deterministic = True) 

#use squared error because the problem is now a regression problem
cost = T.sum(lasagne.objectives.squared_error(net_out,target_vals)) 

all_params = lasagne.layers.get_all_params(l_out, trainable = True) 
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE) 

net_train = theano.function([l_in.input_var, target_vals], cost, updates=updates, allow_input_downcast=True) 
compute_cost = theano.function([l_in.input_var, target_vals], cost, allow_input_downcast=True) 
net_predict = theano.function([l_in.input_var],net_out_predict,allow_input_downcast=True) 

print 'creating testphrase...' 
testphrase_vectors = np.empty((1,5,wordSize),dtype=np.float32) 
testphrase_vectors[0,:,:] = words_as_vecs[1:6,:] 
testphrase_words = words_filtered[0:6] 
#testphrase_words = brown.words()[0:6] 

print 'training...' 
avg_cost = 0 
totalwords = len(words_filtered) 
#totalwords = len(brown.words()) 
print_freq = totalwords/BATCHSIZE #print example every epoch 

nrItterations = Nr_EPOCHS*totalwords/BATCHSIZE 

for i in range(nrItterations):
    inTrain, target = getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE)
    avg_cost += net_train(inTrain,target)

    #generate text sample
    if (i%print_freq == 0) and (i != 0):
        print 'prediction of train'

        print 'average cost is {0}' .format(avg_cost/(BATCHSIZE*print_freq))
        avg_cost = 0
        generated_example = ' '.join(testphrase_words)
        testphrase_vectors_copy = testphrase_vectors
        for k in range(NR_Predictions):
            prediction = np.asarray(net_predict(testphrase_vectors_copy))
            prediction_unscaled = scaler.inverse_transform(prediction.reshape(1,-1)).reshape(-1)
            current_word = model.most_similar(positive=[prediction_unscaled], topn=1)

            generated_example = ' '.join((generated_example, current_word[0][0]))

            #insert new word in testphrase (and delete first)
            testphrase_vectors_copy[0,0:-1,:] = testphrase_vectors_copy[0,1:,:]
            testphrase_vectors_copy[0,-1,:] = model[current_word[0][0]]
            #print testphrase_vectors_copy
        print 'example nr. {}' .format(i/print_freq + 1)
        print generated_example
        print '\n \n'

I haven't looked at this in detail, since I don't work in Lasagne, but I would question whether you have formulated your model correctly for regression: you are using a squared-error loss function, yet the output of your model is run through a relu activation, which seems like an odd approach and not one I have come across before. –


Thanks for the reply. I previously tried scaling the word vectors to the range [-1, 1] and using a tanh activation instead, but got the same result. The only reason I use relu now is to encourage sparse activity. –


How are your labels (ground-truth data) structured, and what do you mean by a regression approach? I initially assumed you meant linear regression (which would be consistent with using squared error), in which case I would use no activation after the final dense layer. If you mean logistic regression (which would really make this a classification problem) and your labels are one-hot encoded, then I would think you need to softmax the result after the relu activation before comparing the output to the labels with the loss function. –
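For reference, a minimal sketch of the linear-regression variant suggested in the comment above, reusing l_shp, wordSize and target_vals from the question's code (an illustration of the suggestion, not the asker's actual code): the only change is that the final dense layer uses no nonlinearity.

# hypothetical output layer for plain regression: identity output, squared-error loss
l_out_linear = lasagne.layers.DenseLayer(l_shp, num_units=wordSize, W=lasagne.init.Normal(),
                                         nonlinearity=lasagne.nonlinearities.linear)
net_out = lasagne.layers.get_output(l_out_linear)
cost = T.sum(lasagne.objectives.squared_error(net_out, target_vals))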

Answer


I finally found the error.

The problem is this line:

target_vals = T.imatrix('target values')

It should instead be:

target_vals = T.fmatrix('target values')

because my targets are floats, not integers.
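A short note on why this silently produced zeros instead of an error (my interpretation, not part of the original answer): with allow_input_downcast=True, the float targets are cast down to the integer dtype declared by T.imatrix, and values that were scaled into [0, 1] truncate to 0, so the squared-error loss trains the network towards a constant zero output. A tiny numpy illustration of the truncation:

# illustration only: mimics the downcast the targets would go through (assumption)
import numpy as np
scaled_targets = np.array([[0.12, 0.87, 0.40]], dtype=np.float32)  # targets after MinMaxScaler
print scaled_targets.astype(np.int32)  # [[0 0 0]] -- effectively what the net was trained against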