我是 Tensorflow 的新手,很抱歉,因为这似乎是一个非常基本的问题,但不幸的是我在 Google 上找不到任何内容,也许我使用了错误的关键字。
我有一些从占位符派生的表达式(据我了解张量流的逻辑),以及一些需要在不重新计算“占位符”表达式的情况下进行计算的变量。下面是我相当丑陋的代码(应该是手动构建的 3 层神经网络),其中评估发生在循环中。
问题是,当我计算派生表达式(ys、delta)时,我想在一次运行中评估所有权重,而不会错误地重新计算 ys 和 delta,我认为目前应该会发生这种情况。该代码中可能存在其他错误,导致其无法正常工作(但是,以这种方式编写的 1 层代码可以正常工作,并且达到预期的 92% 的准确率),但至少在计算阶段完成之前很难弄清楚。没有搞砸。
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
#launch tensorflow session
import tensorflow as tf
sess = tf.InteractiveSession(config=tf.ConfigProto(
intra_op_parallelism_threads=4))
def nonlin(x,deriv=False): # I want my custom activation function
if(deriv==True):
return tf.nn.sigmoid(x)*(1 - tf.nn.sigmoid(x))
return tf.nn.sigmoid(x)
#placeholders
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
# Variables ----------------
#weights and biases
W1 = tf.Variable(tf.zeros([784,10])) # 784x10 matrix (because we have 784 input features and 10 outputs)
b1 = tf.Variable(tf.zeros([10]))
W2 = tf.Variable(tf.zeros([10,10]))
b2 = tf.Variable(tf.zeros([10]))
W3 = tf.Variable(tf.zeros([10,10]))
b3 = tf.Variable(tf.zeros([10]))
# ---------------------
sess.run(tf.global_variables_initializer())
# derived expressions -------------------------
# Forward pass
y1 = nonlin(tf.matmul(x,W1) + b1)
y2 = nonlin(tf.matmul(y1,W2) + b2)
y3 = nonlin(tf.matmul(y2,W3) + b3)
error3 = y_ - y3 # quadratic cost derivative
# Backward pass
delta3 = tf.multiply(error3,nonlin(y3, deriv=True)) #assign delta
error2 = tf.matmul(delta3,W3, transpose_b=True)
delta2 = tf.multiply(error2,nonlin(y2, deriv=True))
error1 = tf.matmul(delta2,W2, transpose_b=True)
delta1 = tf.multiply(error1,nonlin(y1, deriv=True))
learning_rate = 0.1
# And my ugly update step which is not working:------------------------
w1_assign = tf.assign(W1, tf.add(W1, tf.multiply(learning_rate, tf.reduce_mean(tf.matmul(tf.expand_dims(x,-1), tf.expand_dims(delta1,-1), transpose_b=True), 0)) ))
b1_assign = tf.assign(b1, tf.add(b1, tf.multiply(learning_rate, tf.reduce_mean(delta1, 0)) ))
w2_assign = tf.assign(W2, tf.add(W2, tf.multiply(learning_rate, tf.reduce_mean(tf.matmul(tf.expand_dims(y1,-1), tf.expand_dims(delta2,-1), transpose_b=True), 0)) ))
b2_assign = tf.assign(b2, tf.add(b2, tf.multiply(learning_rate, tf.reduce_mean(delta2, 0)) ))
w3_assign = tf.assign(W3, tf.add(W3, tf.multiply(learning_rate, tf.reduce_mean(tf.matmul(tf.expand_dims(y2,-1), tf.expand_dims(delta3,-1), transpose_b=True), 0)) ))
b3_assign = tf.assign(b3, tf.add(b3, tf.multiply(learning_rate, tf.reduce_mean(delta3, 0)) ))
# accuracy evaluation ----------------------
correct_prediction = tf.equal(tf.argmax(y3,1), tf.argmax(y_,1)) #a list of booleans.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Main loop:----------------------
for epoch in range(1000):
batch = mnist.train.next_batch(1000)
# apply across batch
sess.run(w1_assign , feed_dict={x: batch[0], y_: batch[1]})
sess.run(b1_assign , feed_dict={x: batch[0], y_: batch[1]})
sess.run(w2_assign , feed_dict={x: batch[0], y_: batch[1]})
sess.run(b2_assign , feed_dict={x: batch[0], y_: batch[1]})
sess.run(w3_assign , feed_dict={x: batch[0], y_: batch[1]})
sess.run(b3_assign , feed_dict={x: batch[0], y_: batch[1]})
# precision computation
print(str(accuracy.eval(feed_dict={x: batch[0], y_: batch[1]})) + " / epoch: " + str(epoch)) # evaluate
UPDATE:
基于这个答案 https://stackoverflow.com/a/41309537/1692060,看起来如果我在列表中向 sess.run 提供参数,我将只初始化所有中间变量一次,但顺序未知。然后我尝试了我的网络,进行了以下修改,其中包括在列表中传递参数和附加变量来存储新的权重,并使它们不会与原始的权重混淆(很抱歉代码很长,但我试图让它立即为您执行) :
def nonlin(x,deriv=False):
if(deriv==True):
return tf.nn.sigmoid(x)*(1 - tf.nn.sigmoid(x))
return tf.nn.sigmoid(x)
#We start building the computation graph by creating nodes for the input images and target output classes.
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
#weights and biases
W1 = tf.Variable(tf.random_uniform([784,400])) # 784x10 matrix (because we have 784 input features and 10 outputs)
b1 = tf.Variable(tf.random_uniform([400]))
W2 = tf.Variable(tf.random_uniform([400,30])) # 784x10 matrix (because we have 784 input features and 10 outputs)
b2 = tf.Variable(tf.random_uniform([30]))
W3 = tf.Variable(tf.random_uniform([30,10])) # 784x10 matrix (because we have 784 input features and 10 outputs)
b3 = tf.Variable(tf.random_uniform([10]))
# temporary containers to avoid messing up computations
W1tmp = tf.Variable(tf.zeros([784,400])) # 784x10 matrix (because we have 784 input features and 10 outputs)
b1tmp = tf.Variable(tf.zeros([400]))
W2tmp = tf.Variable(tf.zeros([400,30])) # 400x30 matrix as second layer
b2tmp = tf.Variable(tf.zeros([30]))
W3tmp = tf.Variable(tf.zeros([30,10])) # 30x10 matrix (because we have 10 outputs)
b3tmp = tf.Variable(tf.zeros([10]))
#Before Variables can be used within a session, they must be initialized using that session.
sess.run(tf.global_variables_initializer())
# multiplication across batch
# The tf.batch_matmul() op was removed in 3a88ec0. You can now use tf.matmul() to perform batch matrix multiplications (i.e. for tensors with rank > 2).
# Forward pass
y1 = nonlin(tf.matmul(x,W1) + b1)
y2 = nonlin(tf.matmul(y1,W2) + b2)
y3 = nonlin(tf.matmul(y2,W3) + b3)
error3 = y_ - y3 # quadratic cost derivative
# Backward pass
# error and y have same dimensions. It's only W that is unique
delta3 = tf.multiply(error3,nonlin(y3, deriv=True)) #assign delta
error2 = tf.matmul(delta3,W3, transpose_b=True)
delta2 = tf.multiply(error2,nonlin(y2, deriv=True))
error1 = tf.matmul(delta2,W2, transpose_b=True)
delta1 = tf.multiply(error1,nonlin(y1, deriv=True))
learning_rate = tf.constant(3.0)
# we first assign the deepest level to avoid extra evaluations
#with tf.control_dependencies([y1,y2,y3,delta1,delta2,delta3]):
w1_assign = tf.assign(W1tmp, tf.add(W1, tf.multiply(learning_rate, tf.reduce_mean(tf.matmul(tf.expand_dims(x,-1), tf.expand_dims(delta1,-1), transpose_b=True), 0)) ))
b1_assign = tf.assign(b1tmp, tf.add(b1, tf.multiply(learning_rate, tf.reduce_mean(delta1, 0)) ))
w2_assign = tf.assign(W2tmp, tf.add(W2, tf.multiply(learning_rate, tf.reduce_mean(tf.matmul(tf.expand_dims(y1,-1), tf.expand_dims(delta2,-1), transpose_b=True), 0)) ))
b2_assign = tf.assign(b2tmp, tf.add(b2, tf.multiply(learning_rate, tf.reduce_mean(delta2, 0)) ))
w3_assign = tf.assign(W3tmp, tf.add(W3, tf.multiply(learning_rate, tf.reduce_mean(tf.matmul(tf.expand_dims(y2,-1), tf.expand_dims(delta3,-1), transpose_b=True), 0)) ))
b3_assign = tf.assign(b3tmp, tf.add(b3, tf.multiply(learning_rate, tf.reduce_mean(delta3, 0)) ))
w1_ok = tf.assign(W1,W1tmp)
w2_ok = tf.assign(W2,W2tmp)
w3_ok = tf.assign(W3,W3tmp)
b1_ok = tf.assign(b1,b1tmp)
b2_ok = tf.assign(b2,b2tmp)
b3_ok = tf.assign(b3,b3tmp)
#accuracy evaluation
correct_prediction = tf.equal(tf.argmax(y3,1), tf.argmax(y_,1)) #a list of booleans.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# we can use only single batch, just to check that everything works
#batch = mnist.train.next_batch(1000)
for epoch in range(10000):
batch = mnist.train.next_batch(1000)
#train_step.run(feed_dict={x: batch[0], y_: batch[1]})
#When you call sess.run([x, y, z]) once, TensorFlow executes each op that those tensors depend on one time only (unless there's a tf.while_loop() in your graph). If a tensor appears twice in the list (like mul in your example), TensorFlow will execute it once and return two copies of the result. To run the assignment more than once, you must either call sess.run() multiple times, or use tf.while_loop() to put a loop in your graph.
# write new variable values to containers
sess.run([w1_assign,w2_assign,w3_assign,b1_assign,b2_assign,b3_assign] , feed_dict={x: batch[0], y_: batch[1]})
# write container contents into variables in a separate session
sess.run([w1_ok,w2_ok,w3_ok,b1_ok,b2_ok,b3_ok])# , feed_dict={x: batch[0], y_: batch[1]})
# precision computation
print(str(accuracy.eval(feed_dict={x: batch[0], y_: batch[1]})) + " / epoch: " + str(epoch)) # evaluate
所以问题是它是否至少是正确的 Tensorflow 代码?我发现网络结构和学习率给出了一些结果,但它们似乎仍然很差(大约 75%)。