Softmax分类和两层神经网络以及反向传播的代码推导

2023-11-13

发现草稿箱里还有一篇很早之前的学习笔记，希望可以帮助到有需要的童鞋~

序

Softmax分类器

反向传播

数据构建以及网络训练

交叉验证参数优化

序

原来都是用的c++学习的传统图像分割算法。主要学习聚类分割、水平集、图割，欢迎一起讨论学习。

刚刚开始学习cs231n的课程，正好学习python，也做些实战加深对模型的理解。

课程链接

1、这是自己的学习笔记，会参考别人的内容，如有侵权请联系删除。

2、代码参考WILL 、杜克，但是有了很多自己的学习注释

3、有些原理性的内容不会讲解，但是会放上我觉得讲的不错的博客链接

4、由于之前没怎么用过numpy，也对python不熟，所以也是一个python和numpy模块的学习笔记

本章前言：softmax本来应该单独成为一节，但是由于内容和前面的章节高度重复，神经网络中又使用了softmax输出，考虑之后将softmax的内容和neural network放到一起。阅读本章内容需要有神经网络的基本知识。本章重点：反向传播

在jupyter中写的代码，要import需要下载成为.py文件，import之后如果.py文件中的内容有了修改需要重新打开jupyter，很麻烦，现在在import之后加上以下代码，更改.py文件后就不需要重新打开jupyter了。

#自动加载外部模块
%reload_ext autoreload
%autoreload 2

Softmax分类器

softmax分类器可以看作是在svm得分（即线性得分 $f = XW$）的基础上，利用softmax公式 $p_k = \frac{e^{f_k}}{\sum_j e^{f_j}}$ 将得分转化为相对应的概率，单个样本的损失为 $L_i = -\log p_{y_i}$。

给出softmax的代价函数的代码实现，同样分为朴素版和向量版

import numpy as np
from random import shuffle

def softmax_loss_naive(W, X, y, reg):
    """Softmax loss and its gradient, computed with explicit Python loops.

    Args:
        W: weight matrix; ``X[i].dot(W)`` yields one score per column of W,
            so W has one row per feature and one column per class.
        X: data matrix with one sample per row.
        y: sequence of correct class indices, one per row of X.
        reg: L2 regularisation strength.

    Returns:
        Tuple ``(loss, dW)`` where ``loss`` is the mean cross-entropy plus
        ``0.5 * reg * sum(W * W)`` and ``dW`` is the gradient of that loss
        with respect to W (same shape as W).
    """
    loss = 0.0
    # The gradient accumulator must exist before the loop adds to it.
    dW = np.zeros(W.shape, dtype=float)
    num_classes = W.shape[1]
    num_train = X.shape[0]

    for i in range(num_train):
        scores = X[i].dot(W)
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # the exponentials from overflowing.
        scores = scores - np.max(scores)
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores)

        # Only the probability of the correct class enters the loss.
        loss += -np.log(probs[y[i]])

        # d(loss_i)/d(score_k) = p_k - 1[k == y_i]; chain through X[i].
        for k in range(num_classes):
            dW[:, k] += (probs[k] - (k == y[i])) * X[i]

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg * W

    return loss, dW

def softmax_loss_vectorized(W, X, y, reg):
    """Softmax loss and its gradient, computed without Python loops.

    Args:
        W: weight matrix; ``X.dot(W)`` yields one row of class scores per
            sample.
        X: data matrix with one sample per row.
        y: sequence of correct class indices, one per row of X.
        reg: L2 regularisation strength.

    Returns:
        Tuple ``(loss, dW)`` where ``loss`` is the mean cross-entropy plus
        ``0.5 * reg * sum(W * W)`` and ``dW`` is the gradient of that loss
        with respect to W (same shape as W).
    """
    num_train = X.shape[0]
    rows = np.arange(num_train)

    scores = X.dot(W)
    # Shift each row by its own maximum for numerical stability.
    scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Pick the probability of the correct class in every row.
    loss = np.sum(-np.log(probs[rows, y]))

    # Gradient w.r.t. the scores is the probability matrix with 1 subtracted
    # at the position of the correct class in every row.
    indicator = np.zeros_like(probs)
    indicator[rows, y] = 1
    dW = X.T.dot(probs - indicator)

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg * W

    return loss, dW

反向传播

上面代码中计算权重矩阵梯度时用到了反向传播中的链式求导法则的思想。关于反向传播的理解，这里给出两个链接帮助理解：深度学习——对于反向传播的理解(举例验证)

CS231n课程笔记翻译：反向传播笔记

读完这两篇对反向传播中的链式求导有了基础了解，下面就是一个二层神经网络的反向传播的推导过程（字丑），其中softmax部分的推导参考课程。

参考上述推导过程，下面给出两层神经网络的实现方式：

import numpy as np

class TwoLayerNet:
    """Two-layer fully connected network: affine -> ReLU -> affine -> softmax.

    All learnable parameters are stored in ``self.params`` under the keys
    ``'W1'``, ``'b1'``, ``'W2'`` and ``'b2'``.
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """Initialise weights with scaled Gaussian noise and biases with zeros.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden units.
            output_size: number of output classes.
            std: scale applied to the random normal weight initialisation.
        """
        self.params = {}
        # W1.shape = (input_size, hidden_size), b1.shape = (hidden_size,)
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        # W2.shape = (hidden_size, output_size), b2.shape = (output_size,)
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def loss(self, X, y=None, reg=0.0):
        """Run the forward pass and, when labels are given, the backward pass.

        Args:
            X: input data with one sample per row.
            y: correct class index for every row of X, or ``None``.
            reg: L2 regularisation strength.

        Returns:
            The raw class scores (one row per sample) when ``y`` is ``None``;
            otherwise a tuple ``(loss, grads)`` where ``grads`` maps each key
            of ``self.params`` to the gradient of the loss for that parameter.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N = X.shape[0]  # number of samples

        # Forward pass: hidden layer with ReLU, then un-activated output layer.
        hidden = np.maximum(0, X.dot(W1) + b1)
        scores = hidden.dot(W2) + b2

        # Without labels there is nothing to compare against: return scores.
        if y is None:
            return scores

        # Softmax contains exp(), so shift every row by its own maximum to
        # keep the values small; this does not change the probabilities.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        rows = np.arange(N)
        loss = -np.sum(np.log(probs[rows, y]))
        loss /= N
        # The regularisation term sums the squared weights of both layers.
        loss += 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

        # Backward pass, output layer: d(loss)/d(scores) = (probs - onehot)/N.
        # `probs` is not needed afterwards, so it is modified in place.
        grads = {}
        dscores = probs
        dscores[rows, y] -= 1
        dscores /= N
        grads['W2'] = hidden.T.dot(dscores) + reg * W2
        grads['b2'] = np.sum(dscores, axis=0)

        # Backward pass, hidden layer: the ReLU only lets gradient through
        # where its output was positive.
        dhidden = dscores.dot(W2.T)
        dhidden_relu = (hidden > 0) * dhidden
        grads['W1'] = X.T.dot(dhidden_relu) + reg * W1
        grads['b1'] = np.sum(dhidden_relu, axis=0)
        return loss, grads

    def train(self, X, y, X_val, y_val, learning_rate=1e-3,
              learning_rate_decay=0.95, reg=1e-5, num_iters=1000,
              batch_size=200, verbose=False):
        """Train the network with minibatch stochastic gradient descent.

        Args:
            X, y: training data and labels.
            X_val, y_val: validation data and labels.
            learning_rate: initial step size.
            learning_rate_decay: factor applied to the learning rate once per
                epoch.
            reg: L2 regularisation strength.
            num_iters: number of gradient steps.
            batch_size: number of samples drawn (with replacement) per step.
            verbose: print the loss every 100 iterations when true.

        Returns:
            Dict with the keys ``'loss_history'`` (one entry per iteration),
            ``'train_acc_history'`` and ``'val_acc_history'`` (one entry per
            epoch). The training accuracy is measured on the current
            minibatch only.
        """
        num_train = X.shape[0]
        # One epoch = one pass over the training set. Integer division keeps
        # this an int so the `it % iterations_per_epoch` check below fires
        # once per epoch even when the sizes do not divide evenly.
        iterations_per_epoch = max(num_train // batch_size, 1)
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(num_iters):
            idx = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[idx]
            y_batch = y[idx]
            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)

            # Parameter update (plain gradient descent step).
            self.params['W2'] += -learning_rate * grads['W2']
            self.params['b2'] += -learning_rate * grads['b2']
            self.params['W1'] += -learning_rate * grads['W1']
            self.params['b1'] += -learning_rate * grads['b1']

            if verbose and it % 100 == 0:
                print('iteration %d / %d : loss %f ' % (it, num_iters, loss))

            # At every epoch boundary record the accuracies and decay the
            # learning rate.
            if it % iterations_per_epoch == 0:
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = np.mean(self.predict(X_val) == y_val)
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)

                learning_rate *= learning_rate_decay

        return {
            'loss_history': loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history
        }

    def predict(self, X):
        """Return, for every row of X, the index of the highest-scoring class."""
        hidden = np.maximum(0, X.dot(self.params['W1']) + self.params['b1'])
        scores = hidden.dot(self.params['W2']) + self.params['b2']
        # The arg-max of the scores equals the arg-max of the probabilities.
        return np.argmax(scores, axis=1)

数据构建以及网络训练

将之前的数据构建整合成一个方法，注意这边的归一化操作是将每个特征减去该特征在训练集上的均值（mean image），这和softmax中一组得分减去该组得分的最大值不一样。

def get_cifar_data(num_training=49000, num_validation=1000, num_test=1000):
    """Load CIFAR-10, split it, subtract the training mean and flatten.

    The first ``num_training`` training samples form the training split, the
    next ``num_validation`` samples the validation split, and the first
    ``num_test`` test samples the test split. The data comes from
    ``load_cifar10('cifar-10-batches-py')``, which is defined elsewhere.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)`` with every data
        array reshaped to one row per sample.
    """
    X_train, y_train, X_test, y_test = load_cifar10('cifar-10-batches-py')

    val_rows = range(num_training, num_training + num_validation)
    train_rows = range(num_training)
    test_rows = range(num_test)

    # Validation split: the block of samples right after the training ones.
    x_val, y_val = X_train[val_rows], y_train[val_rows]
    # Training split. NOTE(review): the four-axis slice presumes the loader
    # returns image tensors with four dimensions - confirm against loader.
    x_train = X_train[0:num_training, :, :, :]
    y_train = y_train[train_rows]
    # Test split.
    x_test, y_test = X_test[test_rows], y_test[test_rows]

    # Centre every split with the per-feature mean of the training split.
    mean_image = np.mean(x_train, axis=0)
    x_train -= mean_image
    x_val -= mean_image
    x_test -= mean_image

    # Flatten each sample into a single row vector.
    x_train = x_train.reshape(x_train.shape[0], -1)
    x_val = x_val.reshape(x_val.shape[0], -1)
    x_test = x_test.reshape(x_test.shape[0], -1)
    return x_train, y_train, x_val, y_val, x_test, y_test

数据构建测试

# Build all splits once and print the shape of every array as a sanity check.
X_train, y_train, X_val, y_val, X_test, y_test = get_cifar_data()
for caption, array in (
    ('Train data shape: ', X_train),
    ('Train labels shape: ', y_train),
    ('validation data shape: ', X_val),
    ('validation labels shape: ', y_val),
    ('test data shape: ', X_test),
    ('test labels shape: ', y_test),
):
    print(caption, array.shape)

"""
X_train,y_train,X_val,y_val,X_test,y_test = get_cifar_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print("validation data shape: ", X_val.shape)
print('validation labels shape: ', y_val.shape)
print('test data shape: ', X_test.shape)
print('test labels shape: ',y_test.shape)
"""

训练

# Network dimensions: flattened 32x32x3 inputs, 50 hidden units, 10 classes.
input_size = 32 * 32 * 3
hidden_size = 50
num_class = 10

# Train a fresh network and keep the recorded loss/accuracy histories.
net = TwoLayerNet(input_size, hidden_size, num_class)
stats = net.train(
    X_train, y_train, X_val, y_val,
    learning_rate=1e-3,
    learning_rate_decay=0.95,
    reg=1e-5,
    num_iters=1000,
    batch_size=200,
    verbose=True,
)

# Fraction of validation samples whose predicted class matches the label.
val_hits = net.predict(X_val) == y_val
val_acc = val_hits.mean()
print('validation accuracy:', val_acc)

"""
iteration 0 / 1000 : loss 2.302589 
iteration 100 / 1000 : loss 1.907440 
iteration 200 / 1000 : loss 1.798388 
iteration 300 / 1000 : loss 1.721173 
iteration 400 / 1000 : loss 1.639906 
iteration 500 / 1000 : loss 1.691722 
iteration 600 / 1000 : loss 1.623998 
iteration 700 / 1000 : loss 1.493039 
iteration 800 / 1000 : loss 1.543409 
iteration 900 / 1000 : loss 1.500300 
validation accuracy: 0.48
"""

# Visualise the training loss and the train/validation accuracy curves.
# Top panel: loss recorded at every iteration.
plt.subplot(211)
plt.plot(stats['loss_history'])
plt.title('loss history')
plt.xlabel('iteration')
plt.ylabel('loss')

# Bottom panel: accuracies recorded once per epoch.
plt.subplot(212)
plt.plot(stats['train_acc_history'], label='train')
plt.plot(stats['val_acc_history'], label='val')
plt.title('classfication accuracy history')
plt.xlabel('epoch')
plt.ylabel('classfication accuracy')
# The curves carry labels, but they are only drawn once a legend is requested.
plt.legend()
# Keep the lower panel's title from overlapping the upper panel's x-label.
plt.tight_layout()
plt.show()

权重矩阵的可视化

from visualize_grid import *
def show_net_weights(net):
    """Display the first-layer weights of ``net`` as a grid of small images.

    ``net.params['W1']`` is reshaped to 32x32x3 per hidden unit, moved so the
    hidden-unit axis comes first, passed through ``visualize_grid`` (imported
    elsewhere) and shown without axes.
    """
    first_layer = net.params['W1']
    # One 32x32x3 image per hidden unit, with the unit index as leading axis.
    per_unit_images = first_layer.reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)
    grid = visualize_grid(per_unit_images, padding=3)
    plt.imshow(grid.astype('uint8'))
    axes = plt.gca()
    axes.axis('off')
    plt.show()


show_net_weights(net)

交叉验证参数优化

# Hyper-parameter search: train one network for every combination of hidden
# size, learning rate and regularisation strength, and keep the network with
# the best validation accuracy.
input_size = 32 * 32 * 3
num_class = 10
hidden_size = [75, 100, 125]
results = {}          # (hidden size, learning rate, reg) -> validation accuracy
best_val_acc = 0
best_net = None
learning_rates = np.array([0.7, 0.8, 0.9]) * 1e-3
regulation_strengths = [0.75, 1.0, 1.25]
print('run')
for hs in hidden_size:
    for lr in learning_rates:
        for reg in regulation_strengths:
            net = TwoLayerNet(input_size, hs, num_class)

            stats = net.train(X_train, y_train, X_val, y_val,
                              learning_rate=lr, learning_rate_decay=0.95,
                              reg=reg, num_iters=1000, batch_size=200,
                              verbose=False)
            val_acc = np.mean(net.predict(X_val) == y_val)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_net = net
            results[(hs, lr, reg)] = val_acc
print('finish')

for key in sorted(results):
    hs, lr, reg = key
    # Look up the accuracy of this specific combination; formatting the whole
    # dict with %f would raise a TypeError.
    val_acc = results[key]
    print('hs %d lr %e reg %e val accuract: %f' % (hs, lr, reg, val_acc))
print('best validation accuracy achived during cross_validation:%f' % best_val_acc)

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)