我在Windows 10上使用TensorFlow 1.2.1,并使用Estimator API。一切都运行没有任何错误,但每当我必须从检查点恢复参数,它的某些方面不起作用。我已经检查过,classifier.get_variable_names()中的每个变量的值在评估后都没有变化,但是Loss会回到接近开始位置的位置,然后继续学习,每次学习速度都比上次更快。TensorFlow Estimator正确恢复所有变量,但之后出现损失
这发生在一个TensorFlow运行中,发生验证或评估运行时,或者当我重新运行python文件以继续训练时。
下图是这个问题的一个例子,他们正在恢复每2500步的变量:
下面的代码是我的代码,significiantly缩小版本,仍然复制错误:
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.InteractiveSession()
def cnn_model_fn(features, labels, mode):
dense_layer1 = tf.layers.dense(inputs=features, units=512, activation=tf.nn.relu, name="FC_1")
dense_layer2 = tf.layers.dense(inputs=dense_layer1, units=1024, activation=tf.nn.relu, name="FC_2")
dense_layer3 = tf.layers.dense(inputs=dense_layer2, units=2048, activation=tf.nn.relu, name="FC_3")
dense_layer4 = tf.layers.dense(inputs=dense_layer3, units=512, activation=tf.nn.relu, name="FC_4")
logits = tf.layers.dense(inputs=dense_layer4, units=2, name="logit_layer")
loss = None
train_op = None
if mode != learn.ModeKeys.INFER:
loss = tf.losses.softmax_cross_entropy(
onehot_labels=labels, logits=logits)
if mode == learn.ModeKeys.TRAIN:
train_op = tf.contrib.layers.optimize_loss(
loss=loss,
global_step=tf.contrib.framework.get_global_step(),
learning_rate=.001,
optimizer="SGD")
predictions = {
"classes": tf.argmax(input=logits, axis=1),
"probabilities": tf.nn.softmax(
logits, name="softmax_tensor")}
return model_fn_lib.ModelFnOps(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op)
def main(unused_param):
def data_pipeline(filenames, batch_size, num_epochs=None, min_after_dequeue=10000):
with tf.name_scope("data_pipeline"):
filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs)
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
row = tf.decode_csv(value, record_defaults=[[0.0] for _ in range(66)])
example_op, label_op = tf.stack(row[:len(row)-2]), tf.stack(row[len(row)-2:])
capacity = min_after_dequeue + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch(
[example_op, label_op],
batch_size=batch_size,
capacity=capacity,
min_after_dequeue=min_after_dequeue)
return example_batch, label_batch
def input_data_fn(data_getter_ops):
batch, labels = sess.run(data_getter_ops)
return tf.constant(batch, dtype=tf.float32), tf.constant(labels, dtype=tf.float32)
NUM_EPOCHS = 6
BATCHES_IN_TRAINING_EPOCH = 8000
training_data_pipe_ops = data_pipeline(
filenames=["train_data.csv"],
batch_size=500,
min_after_dequeue=10000)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
classifier = tf.contrib.learn.Estimator(
model_fn=cnn_model_fn,
model_dir="/tmp/bug_finder")
for j in range(NUM_EPOCHS):
classifier.fit(
input_fn=lambda: input_data_fn(training_data_pipe_ops),
steps = BATCHES_IN_TRAINING_EPOCH)
print("Epoch", str(j+1), "training completed.")
coord.request_stop()
coord.join(threads)
if __name__ == "__main__":
tf.app.run()