
Self-Taught Neural Network Series - 4: Learning in Neural Networks

2022-06-26 09:09:00 ML_ python_ get√

4.1 Loss function

  • Least squares
  • Maximum likelihood
  • Cross entropy
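For reference, the two losses implemented below can be written as follows (standard notation, where y_k is the network's output for class k and t_k the one-hot label):

E_{mse} = \frac{1}{2} \sum_k (y_k - t_k)^2, \qquad E_{ce} = -\sum_k t_k \log y_k

With a one-hot label only the correct class contributes to E_{ce}, so minimizing the cross entropy is the same as maximizing the log-likelihood of the correct class; that is how the maximum-likelihood view in the list above connects to the code below.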
#  Least squares loss function 
import numpy as np

y1 = [0.1,0.05,0.6,0.0,0.05,0.1,0.0,0.1,0.0,0.0]
y2 = [0.1,0.05,0.1,0.0,0.05,0.1,0.0,0.6,0.0,0.0]
t = [0,0,1,0,0,0,0,0,0,0]

def mean_squared_error(y,t):
    return 0.5*np.sum((y-t)**2)

# Python lists don't support element-wise arithmetic, so convert them to NumPy arrays
mean_squared_error(np.array(y1),np.array(t))
# y1 peaks at index 2 (the correct class): MSE = 0.09750000000000003
mean_squared_error(np.array(y2),np.array(t))
# y2 peaks at index 7 (a wrong class): MSE = 0.5975
#  Cross-entropy loss function
#  Information content is inversely related to probability: the lower the probability of an event,
#  the more information is needed to turn it into a certainty.
#  Entropy measures the disorder of a system; it is the expected information content of the system.
#  To measure how close two distributions are, we can ask whether their entropy difference is 0.
#  When one of them is the fixed, ideal target distribution, it is enough to measure the cross entropy.
#  Gibbs' inequality: the cross entropy is always at least as large as the entropy
#  (with equality only when the two distributions coincide).
#  So the smaller the cross entropy, the closer the two distributions are,
#  which matches the definition of a loss function:
#  the higher the probability assigned to the correct class, the smaller the loss.
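# In symbols (standard information theory, my own addition): for distributions t and y,
#   H(t, y) = -\sum_k t_k \log y_k  >=  H(t) = -\sum_k t_k \log t_k,
# with equality exactly when y = t; the gap is the KL divergence D_KL(t || y) = H(t, y) - H(t).
# For a one-hot t, H(t) = 0, so the cross entropy directly measures how far y is from the target.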

def cross_entropy_error0(y,t):
    delta = 1e-7
    return -np.sum(t*np.log(y+delta)) # element-wise product; delta prevents log(0) = -inf when a probability is 0

t = [0,0,1,0,0,0,0,0,0,0]
y1 = [0.1,0.05,0.6,0.0,0.05,0.1,0.0,0.1,0.0,0.0]
y2 = [0.1,0.05,0.1,0.0,0.05,0.1,0.0,0.6,0.0,0.0]
cross_entropy_error0(np.array(y1), np.array(t))
cross_entropy_error0(np.array(y2), np.array(t))
# 0.510825457099338
# 2.302584092994546
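A quick sanity check on these two numbers (my own arithmetic): with a one-hot target, the cross entropy reduces to minus the log of the probability assigned to the correct class, so -ln(0.6) ≈ 0.51 for y1 and -ln(0.1) ≈ 2.30 for y2, matching the outputs above up to the tiny delta term.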
# mini-batch learning
import sys,os
sys.path.append(os.pardir)
import numpy as np
from res.mnist import load_mnist

(x_train,t_train),(x_test,t_test) =\
    load_mnist(normalize=True,one_hot_label=True)
print(x_train.shape)
print(t_train.shape)
#  randomly sample 10 observations
train_size = x_train.shape[0]
batch_size = 10
batch_mask = np.random.choice(train_size,batch_size) # randomly pick batch_size indices out of train_size
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]
# batched cross entropy for one-hot encoded labels
def cross_entropy_error(y,t):
    if y.ndim == 1:
        t = t.reshape(1,t.size)
        y = y.reshape(1,y.size)
    
    batch_size = y.shape[0]
    return  -np.sum(t*np.log(y+1e-7))/batch_size

#  cross entropy for integer class labels
#  With one-hot encoding only the element where t equals 1 contributes, so it is enough to pick out,
#  for each sample, the output y at the position of its label and sum the negative logs.
def cross_entropy_label(y,t):
    if y.ndim ==1:
        t = t.reshape(1,t.size)
        y = y.reshape(1,y.size)
    
    batch_size = y.shape[0]
    # the fancy index must pair the row indices with the labels t, in that order
    return -np.sum(np.log(y[np.arange(batch_size),t]+1e-7))/batch_size
    # y[np.arange(batch_size),t] is fancy indexing: for each row it picks the output at that row's label index
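A tiny, self-contained illustration of that fancy index (my own example; the arrays below are hypothetical):

# hypothetical mini-batch of 3 predictions with integer labels [2, 0, 1]
y_demo = np.array([[0.1, 0.3, 0.6],
                   [0.8, 0.1, 0.1],
                   [0.2, 0.5, 0.3]])
t_demo = np.array([2, 0, 1])
y_demo[np.arange(3), t_demo]          # -> array([0.6, 0.8, 0.5])
cross_entropy_label(y_demo, t_demo)   # mean of -log over those three probabilities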

4.2 Numerical differentiation

  • 1. In theory the increment h is infinitesimal, but a computer rounds away digits beyond a certain decimal place, so h is usually set to about 10^(-4).
  • 2. Rather than the true derivative at a single point, we usually take the difference between x-h and x+h (the central difference, whose error shrinks like h^2 instead of h).
  • 3. Hence the name numerical differentiation.
def numerical_diff(f,x):
    h = 1e-4
    return (f(x+h)-f(x-h))/(2*h)


def func(x):
    return 0.01*x**2 + 0.1*x

numerical_diff(func,10)
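As a quick check (my arithmetic, not stated in the original): the analytic derivative of 0.01x^2 + 0.1x is 0.02x + 0.1, which equals 0.3 at x = 10, so the call above should return a value extremely close to 0.3.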


def tangent_line(f, x):
    '''tangent line at x'''
    d = numerical_diff(f, x)   # slope
    y = f(x) - d*x             # intercept: the line passes through the point (x, f(x))
    return lambda t: d*t + y   # the tangent line as a function of t
#  visualization 
import numpy as np
import matplotlib.pylab as plt

x = np.arange(0.0,20.0,0.1)
y1 = func(x)
tf = tangent_line(func,10)
y2  = tf(x)
plt.plot(x,y1)
plt.plot(x,y2)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.show()
#  Partial derivative 
def func_2(x):
    return np.sum(x**2)
 
def numerical_gradient(f,x):
    ''' gradient '''
    h =1e-4
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])   # multi-dimensional iterator
    # visit every element of x, perturb it by +/- h, and estimate that partial derivative
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x)                       # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x)                       # f(x-h)
        grad[idx] = (fxh1-fxh2)/(2*h)
        x[idx] = tmp_val                  # restore the original value
        it.iternext()
    
    return grad
#  Numerical differentiation is computed directly from evaluated data points; unlike the analytic derivative, we do not derive a formula first and then substitute values into it.

#  Gradient descent method 
def gradient_descent(f,init_x,lr=0.01,step_num=1000):
    x = init_x
    for i in range(step_num):
        grad = numerical_gradient(f,x)
        x -= lr*grad
    return x

#  test 
numerical_gradient(func_2,np.array([3.0,4.0])) # -> roughly array([6., 8.]); use floats, since an integer array gives very different (truncated) results
x = gradient_descent(func_2,np.array([-3.0,4.0]),lr=0.1)
func_2(x)   # close to 0: gradient descent has moved x near the minimum at the origin
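One experiment worth running at this point (my own addition, not from the original post): the result of gradient descent is very sensitive to the learning rate.

# hypothetical demonstration of learning-rate sensitivity
# fresh arrays are passed each time because gradient_descent modifies its argument in place
print(gradient_descent(func_2, np.array([-3.0, 4.0]), lr=10.0, step_num=100))   # far too large: x diverges to huge values
print(gradient_descent(func_2, np.array([-3.0, 4.0]), lr=1e-10, step_num=100))  # far too small: x barely moves from (-3, 4)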
#  computing a gradient with a simple neural network as the example
def softmax(x):
    # the 2-D branch is needed because TwoLayerNet below feeds whole mini-batches through softmax
    if x.ndim == 2:
        x = x - np.max(x, axis=1, keepdims=True)   # subtract the row-wise max for numerical stability
        exp_x = np.exp(x)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    a = np.max(x)
    exp_x = np.exp(x - a)
    return exp_x / np.sum(exp_x)
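A quick way to confirm that the batched branch normalizes each row independently (my own check, not in the original):

softmax(np.array([[1.0, 2.0, 3.0],
                  [3.0, 2.0, 1.0]])).sum(axis=1)   # -> array([1., 1.])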

import numpy as np

class SimpleNet:

    def __init__(self):
        self.W = np.random.randn(2,3)   # 2x3 weight matrix: a single-layer network with 2 inputs and 3 outputs
    
    def predict(self,x):
        return np.dot(x,self.W)
    
    def loss(self,x,t):
        z = self.predict(x)
        y = softmax(z)
        loss = cross_entropy_error(y, t)
        return loss

#  test  
net = SimpleNet()
print(net.W)
x = np.array([0.6,0.9])
p = net.predict(x)
print(p)
# np.argmax(p)
t = np.array([0,0,1])
# net.loss(x,t)

# def f(W):
#     return net.loss(x,t)
# the same thing as a more concise anonymous function
f = lambda W: net.loss(x,t)

print(f(2))
print(f(1))
# the argument W is a dummy: f ignores it and always uses net.W internally, so both calls print the same loss

dW = numerical_gradient(f,net.W)
print(dW)

# Why does this work? numerical_gradient perturbs net.W in place (W+h, W-h),
# so does f change even though it ignores its argument?
# Yes: f calls net.loss, which calls net.predict(x),
# and predict(x) computes np.dot(x, self.W),
# so perturbing net.W does change the value of f.
# Any parameter stored on the instance as self.W can be differentiated this way.
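As a small sanity check (my own addition), taking one modest step against this gradient should usually lower the loss:

print(net.loss(x, t))    # loss before the update
net.W -= 0.1 * dW        # one gradient-descent step on the weights
print(net.loss(x, t))    # typically a slightly smaller loss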

4.3 Two-layer neural network learning

  • Randomly select a mini-batch and compute the loss function
  • Compute the gradient of the loss function
  • Update the parameters (the update rule is written out right after this list)
  • Repeat until the gradient is (close to) 0 or the maximum number of steps is reached
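In formula form, the parameter update in the third step is plain stochastic gradient descent (standard notation; this equation is my summary, not from the original post):

W \leftarrow W - \eta \frac{\partial L}{\partial W}

where \eta is the learning rate (0.1 in the code below) and L is the mini-batch loss.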
def sigmoid(x):
    return 1/(1+np.exp(-x))
    
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)
# gradient computation for a 2-layer neural network
class TwoLayerNet:

    def __init__(self,input_size,hidden_size,output_size,weight_std=0.01): 
        # constructor: runs when an instance of the class is created
        self.params = {}
        self.params['W1'] = weight_std*np.random.randn(input_size,hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_std*np.random.randn(hidden_size,output_size)
        self.params['b2'] = np.zeros(output_size)
    
    def predict(self,x):
        W1,W2 = self.params['W1'],self.params['W2']
        b1,b2 = self.params['b1'],self.params['b2']

        a1 = np.dot(x,W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1,W2) + b2
        y = softmax(a2)

        return  y

    def loss(self,x,t):
        y = self.predict(x)
        return cross_entropy_error(y, t)
    
    def accuracy(self,x,t):
        y = self.predict(x)
        y = np.argmax(y,axis=1)
        t = np.argmax(t,axis=1)
        accuracy = np.sum(y==t)/float(x.shape[0])
        return accuracy
    
    def numerical_gradient(self,x,t):
        loss_W = lambda W: self.loss(x,t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W,self.params['W1'])
        grads['W2'] = numerical_gradient(loss_W,self.params['W2'])
        grads['b1'] = numerical_gradient(loss_W,self.params['b1'])
        grads['b2'] = numerical_gradient(loss_W,self.params['b2'])

        return grads
    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        
        batch_num = x.shape[0]
        
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        
        # backward
        dy = (y - t) / batch_num   # for softmax + cross entropy, dL/da2 = (y - t) / batch_size
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads
net = TwoLayerNet(784,100,10)
net.params['W1'].shape
net.params['b1'].shape
net.params['W2'].shape
net.params['b2'].shape
x = np.random.randn(100,784)   # dummy input: 100 samples
t = np.random.randn(100,10)    # dummy targets, used only to check the shapes of the gradients
y = net.predict(x)
print(y.shape)
grads = net.gradient(x,t)
grads
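Since the class provides both a numerical and an analytic gradient, it is worth comparing them once. The sketch below is my own sanity check (not in the original post) on a deliberately tiny network, because the numerical gradient is far too slow on the full 784-100-10 model; it also assumes one-hot labels, which the analytic backward formula relies on.

# gradient check on a small hypothetical network (4 inputs, 5 hidden units, 3 outputs)
check_net = TwoLayerNet(4, 5, 3)
x_chk = np.random.randn(2, 4)
t_chk = np.eye(3)[np.random.choice(3, 2)]               # one-hot labels for 2 samples
grad_num = check_net.numerical_gradient(x_chk, t_chk)   # slow, element-by-element
grad_ana = check_net.gradient(x_chk, t_chk)             # fast analytic backward pass
for key in ('W1', 'b1', 'W2', 'b2'):
    diff = np.average(np.abs(grad_ana[key] - grad_num[key]))
    print(key + " : " + str(diff))   # each difference should be tiny (roughly 1e-7 or smaller)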
from res.mnist import load_mnist

(x_train,t_train),(x_test,t_test) = load_mnist(normalize=True,one_hot_label=True)
train_loss_list = []
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
network = TwoLayerNet(784,50,10)
# mini-batch selection
for i in range(iters_num):
    batch_mask = np.random.choice(train_size,batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    #  gradient 
    grad = network.gradient(x_batch,t_batch)
    #  gradient descent 
    for key in ('W1','b1','W2','b2'):
        network.params[key] -= learning_rate * grad[key]
    #  Record the learning process 
    loss = network.loss(x_batch,t_batch)
    train_loss_list.append(loss)
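To see that learning is actually happening, it may help to plot the recorded losses (a small addition of mine, reusing the matplotlib import from the numerical-differentiation section); the curve should trend downward as the iterations proceed.

plt.plot(train_loss_list)
plt.xlabel('iteration')
plt.ylabel('loss')
plt.show()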

4.4 Evaluation of neural network

(x_train,t_train),(x_test,t_test) = load_mnist(normalize=True,one_hot_label=True)
train_loss_list = []
train_acc_list = []
test_acc_list = []
iters_num = 10000
batch_size = 100
learning_rate = 0.1
#  number of mini-batch draws needed to see the whole training set once (one epoch), e.g. 60000/100 = 600
iter_per_epoch = max(train_size/batch_size,1)

network = TwoLayerNet(784, 50, 10)
for i in range(iters_num):
    #  draw a mini-batch
    batch_mask = np.random.choice(train_size,batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    #  gradient 
    grad = network.gradient(x_batch,t_batch)
    #  gradient descent 
    for key in ('W1','b1','W2','b2'):
        network.params[key] -= grad[key]*learning_rate
    #  record the learning progress
    loss = network.loss(x_batch,t_batch)
    train_loss_list.append(loss)
    #  compute the recognition accuracy once per epoch,
    #  i.e. roughly once per full pass over the training data
    if i%iter_per_epoch==0:
        train_acc = network.accuracy(x_train,t_train)
        test_acc = network.accuracy(x_test,t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc |" + str(train_acc)+" , "+str(test_acc))
x = np.arange(len(train_acc_list))
plt.plot(x,train_acc_list,label='train_acc')
plt.plot(x,test_acc_list,label='test_acc',linestyle="--")
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.show()
# if the train and test curves stay close together, the network is generalizing rather than overfitting