I just feel like I would be letting myself down if I didn't post this.
Theano is a magical thing, so magical that I couldn't make sense of any of it. I spent days and days on its documentation, got dizzier the more I read, and finally deleted the thing in a fit of anger.
# -*- coding: utf-8 -*-
"""This tutorial introduces the LeNet5 neural network architecture
using Theano. LeNet5 is a convolutional neural network, good for
classifying images. This tutorial shows how to build the architecture,
and comes with all the hyper-parameters you need to reproduce the
paper's MNIST results.
This implementation simplifies the model in the following ways:
- LeNetConvPool doesn't implement location-specific gain and bias parameters
- LeNetConvPool doesn't implement pooling by average, it implements pooling
by max.
- Digit classification is implemented with a logistic regression rather than
an RBF network
- The second-layer convolutions are fully connected to all input feature
maps, whereas the original LeNet5 used a sparse connection table
References:
- Y. LeCun, L. Bottou, Y. Bengio and P. Haffner:
Gradient-Based Learning Applied to Document
Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
Annotations (originally in Chinese, translated below):
- Yu Hao:
[email protected]
"""
from theano.tensor.nnet import conv
from theano.tensor.signal import downsample
import cPickle
import gzip
import numpy
import os
import sys
import theano
import time
from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
import theano.tensor as T
class LeNetConvPoolLayer( object ):
"""Pool Layer of a convolutional network """
def __init__( self, rng, input, filter_shape, image_shape, poolsize = ( 2, 2 ) ):
"""
Allocate a LeNetConvPoolLayer with shared variable internal parameters.
:type rng: numpy.random.RandomState
:param rng: a random number generator used to initialize weights
:type input: theano.tensor.dtensor4
:param input: symbolic image tensor, of shape image_shape
:type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)
:type image_shape: tuple or list of length 4
:param image_shape: (batch size, num input feature maps,
image height, image width)
:type poolsize: tuple or list of length 2
:param poolsize: the downsampling (pooling) factor (#rows,#cols)
"""
        # the number of input feature maps (image_shape[1]) must match filter_shape[1]
assert image_shape[1] == filter_shape[1]
# 本层的输入
self.input = input
# there are "num input feature maps * filter height * filter width"
# inputs to each hidden unit
        # numpy.prod multiplies the elements of the sequence together
fan_in = numpy.prod( filter_shape[1:] )
# each unit in the lower layer receives a gradient from:
# "num output feature maps * filter height * filter width" pooling size
# 每一个底层的单元可以收到“输出特征图数量*过滤器高度*过滤器宽度/池化大小“的梯度
# NOTICE: 什么意思?
fan_out = ( filter_shape[0] * numpy.prod( filter_shape[2:] ) / numpy.prod( poolsize ) )
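        # Worked example (my annotation; the numbers assume the first layer built
        # in evaluate_lenet5 below, i.e. filter_shape = (20, 1, 5, 5), poolsize = (2, 2)):
        #   fan_in  = 1 * 5 * 5 = 25
        #   fan_out = 20 * 5 * 5 / (2 * 2) = 125
        # giving W_bound = sqrt(6. / (25 + 125)) = 0.2 below.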
        # initialize the weights W with values drawn uniformly from
        # [-W_bound, W_bound], the sqrt(6 / (fan_in + fan_out)) heuristic
W_bound = numpy.sqrt( 6. / ( fan_in + fan_out ) )
self.W = theano.shared( numpy.asarray(
rng.uniform( low = -W_bound, high = W_bound, size = filter_shape ),
dtype = theano.config.floatX ),
borrow = True )
# the bias is a 1D tensor -- one bias per output feature map
        # initialize the other parameter, b, as a zero vector
b_values = numpy.zeros( ( filter_shape[0], ), dtype = theano.config.floatX )
self.b = theano.shared( value = b_values, borrow = True )
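        # (borrow = True lets the shared variable reuse the numpy buffer instead
        # of copying it; safe here because these arrays are freshly created)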
# convolve input feature maps with filters
        # Parameter: input
        #   Value:   input
        # Parameter: filters
        #   Value:   self.W
        #   Meaning: the weights
        # Parameter: filter_shape
        #   Value:   filter_shape
        #   Meaning: see the constructor docstring
        # Parameter: image_shape
        #   Value:   image_shape
        #   Meaning: see the constructor docstring
conv_out = conv.conv2d( input = input, filters = self.W,
filter_shape = filter_shape, image_shape = image_shape )
# downsample each feature map individually, using maxpooling
        # max_pool_2d takes the maximum over non-overlapping ds windows of the
        # last two dimensions; ignore_border drops incomplete windows at the edges
pooled_out = downsample.max_pool_2d( input = conv_out,
ds = poolsize, ignore_border = True )
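        # Shape check (my annotation, for the first layer's defaults): conv_out
        # is (500, 20, 24, 24) and pooled_out is (500, 20, 12, 12).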
# add the bias term. Since the bias is a vector (1D array), we first
# reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
# thus be broadcasted across mini-batches and feature map
# width & height
        # this layer's output: add the bias, then apply tanh;
        # dimshuffle('x', 0, 'x', 'x') turns b of shape (n_filters,) into the
        # (1, n_filters, 1, 1) tensor described above so it broadcasts
self.output = T.tanh( pooled_out + self.b.dimshuffle( 'x', 0, 'x', 'x' ) )
        # store the parameters (W and b) of this layer
self.params = [self.W, self.b]
def evaluate_lenet5( learning_rate = 0.1,       # learning rate
                     n_epochs = 200,            # number of training epochs
                     dataset = 'mnist.pkl.gz',  # dataset file
                     nkerns = [20, 50],         # number of kernels in each conv layer
                     batch_size = 500           # number of examples per minibatch
                     ):
""" Demonstrates lenet on MNIST dataset
:type learning_rate: float
:param learning_rate: learning rate used (factor for the stochastic
gradient)
:type n_epochs: int
:param n_epochs: maximal number of epochs to run the optimizer
:type dataset: string
:param dataset: path to the dataset used for training /testing (MNIST here)
:type nkerns: list of ints
:param nkerns: number of kernels on each layer
"""
    # random number generator (fixed seed for reproducibility)
rng = numpy.random.RandomState( 23455 )
    # load the data
datasets = load_data( dataset )
    # The dataset comes in three parts, each an (x, y) pair:
    # 1. training   (50000) (x: images [matrix], y: labels [vector])
    # 2. validation (10000) (x: images [matrix], y: labels [vector])
    # 3. test       (10000) (x: images [matrix], y: labels [vector])
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
# compute number of minibatches for training, validation and testing
    # Work out how many minibatches each set contains:
    # Step 1: get the total number of examples.
    # shape is (A, B): A is the number of examples and B = 28*28 = 784,
    # the number of pixels per image
n_train_batches = train_set_x.get_value( borrow = True ).shape[0]
n_valid_batches = valid_set_x.get_value( borrow = True ).shape[0]
n_test_batches = test_set_x.get_value( borrow = True ).shape[0]
    # Step 2: divide by the batch size to get the number of minibatches
n_train_batches /= batch_size
n_valid_batches /= batch_size
n_test_batches /= batch_size
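    # With the defaults (my annotation): 50000 / 500 = 100 training batches
    # and 10000 / 500 = 20 validation / test batches.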
# allocate symbolic variables for the data
    # index works like a cursor: it selects which minibatch is in use
index = T.lscalar() # index to a [mini]batch
    # x and y below are symbolic expressions; concrete values are substituted in later
x = T.matrix( 'x' ) # the data is presented as rasterized images
y = T.ivector( 'y' ) # the labels are presented as 1D vector of [int] labels
    # MNIST images are 28*28 (this variable is not used later)
ishape = ( 28, 28 ) # this is the size of MNIST images
######################
# BUILD ACTUAL MODEL #
######################
print ( '... building the model' )
# Reshape matrix of rasterized images of shape (batch_size,28*28)
# to a 4D tensor, compatible with our LeNetConvPoolLayer
    # Reshape the input into the 4D tensor the conv layer expects.
    # tensor.reshape(x, newshape, ndim=None)
    # before: (batch_size, 784) = (500, 784)
    # after:  (batch_size, 1, 28, 28) = (500, 1, 28, 28)
layer0_input = x.reshape( ( batch_size, 1, 28, 28 ) )
# Construct the first convolutional pooling layer:
# filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
# maxpooling reduces this further to (24/2,24/2) = (12,12)
# 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    # Construction of conv-pool layer0.
    # Parameter: rng (random number generator)
    # Parameter: input
    #   Value:   layer0_input, i.e. x.reshape( ( batch_size, 1, 28, 28 ) ) above
    # Parameter: image_shape
    #   Value:   ( batch_size, 1, 28, 28 ) = (500, 1, 28, 28)
    #   Meaning: batch size, num feature maps, image height, image width
    # Parameter: filter_shape
    #   Value:   ( nkerns[0], 1, 5, 5 ) = (20, 1, 5, 5)
    #   Meaning: num filters, num input feature maps, filter height, filter width
    # Parameter: poolsize
    #   Value:   ( 2, 2 )
    #   Meaning: pooling height, pooling width
    # Output of this layer:
    #   Shape:   ( batch_size, nkerns[0], 12, 12 ) = (500, 20, 12, 12)
layer0 = LeNetConvPoolLayer( rng, input = layer0_input,
image_shape = ( batch_size, 1, 28, 28 ),
filter_shape = ( nkerns[0], 1, 5, 5 ), poolsize = ( 2, 2 ) )
# Construct the second convolutional pooling layer
# filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
# maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
    # Construction of conv-pool layer1.
    # Parameter: rng (random number generator)
    # Parameter: input
    #   Value:   layer0.output
    # Parameter: image_shape
    #   Value:   ( batch_size, nkerns[0], 12, 12 ) = (500, 20, 12, 12)
    #   Meaning: batch size, num feature maps, image height, image width
    # Parameter: filter_shape
    #   Value:   ( nkerns[1], nkerns[0], 5, 5 ) = (50, 20, 5, 5)
    #   Meaning: num filters, num input feature maps, filter height, filter width
    # Parameter: poolsize
    #   Value:   ( 2, 2 )
    #   Meaning: pooling height, pooling width
    # Output of this layer:
    #   Shape:   ( batch_size, nkerns[1], 4, 4 ) = (500, 50, 4, 4)
layer1 = LeNetConvPoolLayer( rng, input = layer0.output,
image_shape = ( batch_size, nkerns[0], 12, 12 ),
filter_shape = ( nkerns[1], nkerns[0], 5, 5 ), poolsize = ( 2, 2 ) )
# the HiddenLayer being fully-connected, it operates on 2D matrices of
# shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*4*4) = (500, 800)
    # flatten layer1's output into a 2D matrix, one row per example
layer2_input = layer1.output.flatten( 2 )
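    # (My annotation: flatten(2) keeps the first, batch dimension and collapses
    # all remaining dimensions into one, so (500, 50, 4, 4) becomes (500, 800).)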
# construct a fully-connected sigmoidal layer
    # Construction of fully-connected hidden layer2.
    # Parameter: rng (random number generator)
    # Parameter: input
    #   Value:   layer2_input
    # Parameter: n_in
    #   Value:   nkerns[1] * 4 * 4
    #   Meaning: number of inputs, the size of each flattened row from layer1
    # Parameter: n_out
    #   Value:   500
    #   Meaning: number of hidden units
    # Parameter: activation
    #   Value:   T.tanh
    #   Meaning: this layer's activation function
layer2 = HiddenLayer( rng, input = layer2_input, n_in = nkerns[1] * 4 * 4,
n_out = 500, activation = T.tanh )
# classify the values of the fully-connected sigmoidal layer
    # Construction of logistic-regression layer3.
    # Parameter: input
    #   Value:   layer2.output
    # Parameter: n_in
    #   Value:   500
    #   Meaning: number of inputs, one per hidden unit of layer2
    # Parameter: n_out
    #   Value:   10
    #   Meaning: number of classes (with MNIST, the digits 0-9)
layer3 = LogisticRegression( input = layer2.output, n_in = 500, n_out = 10 )
    # the cost we minimize during training is the NLL of the model
cost = layer3.negative_log_likelihood( y )
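    # (My annotation: negative_log_likelihood, defined in logistic_sgd, is the
    # mean over the minibatch of -log P(correct label | x) under the softmax output.)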
# create a function to compute the mistakes that are made by the model
    # Build the evaluation functions with theano.function.
    # givens substitutes the minibatch slices below for the symbolic x and y.
    # index * batch_size : ( index + 1 ) * batch_size is Python slicing;
    # index says which minibatch to use: e.g. with batch_size = 500,
    # index = 1 selects the examples numbered 500 through 999.
    # Input:  index
    # Output: layer3.errors( y )
test_model = theano.function( [index], layer3.errors( y ),
givens = {
x: test_set_x[index * batch_size: ( index + 1 ) * batch_size],
y: test_set_y[index * batch_size: ( index + 1 ) * batch_size]} )
validate_model = theano.function( [index], layer3.errors( y ),
givens = {
x: valid_set_x[index * batch_size: ( index + 1 ) * batch_size],
y: valid_set_y[index * batch_size: ( index + 1 ) * batch_size]} )
# create a list of all model parameters to be fit by gradient descent
    # A list of all parameters. Each layer stores its own as params = [self.W, self.b];
    # here they are concatenated across layers.
    # Collecting them in one list makes them easy to update (and save) later.
params = layer3.params + layer2.params + layer1.params + layer0.params
# create a list of gradients for all model parameters
    # take the partial derivative of the cost with respect to every parameter;
    # these symbolic gradients drive the SGD updates constructed below
grads = T.grad( cost, params )
# train_model is a function that updates the model parameters by
# SGD Since this model has many parameters, it would be tedious to
# manually create an update rule for each model parameter. We thus
# create the updates list by automatically looping over all
# (params[i],grads[i]) pairs.
    # train_model updates the model parameters by SGD. Manually writing an
    # update rule for each parameter would be tedious, so we build the updates
    # list by looping over all (params[i], grads[i]) pairs.
    # Every call to train_model applies them via theano.function's updates argument.
updates = []
for param_i, grad_i in zip( params, grads ):
updates.append( ( param_i, param_i - learning_rate * grad_i ) )
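    # (My annotation: each entry is a (shared_variable, new_value) pair; on
    # every call the compiled function assigns param_i <- param_i - learning_rate * grad_i.)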
    # The training function; parameter meanings are the same as above.
    # The updates argument makes every call perform one SGD step.
    # Input:  index
    # Output: cost
train_model = theano.function( [index], cost, updates = updates,
givens = {
x: train_set_x[index * batch_size: ( index + 1 ) * batch_size],
y: train_set_y[index * batch_size: ( index + 1 ) * batch_size]} )
###############
# TRAIN MODEL #
###############
print ( '... training' )
# early-stopping parameters
    # patience (a threshold on the iteration count) implements early stopping
# look as this many examples regardless
patience = 10000
# wait this much longer when a new best is found
    # (patience becomes max(patience, iter * patience_increase); see below)
patience_increase = 2
# a relative improvement of this much is considered significant
improvement_threshold = 0.995
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
validation_frequency = min( n_train_batches, patience / 2 )
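    # With the defaults (my annotation): min(100, 10000 / 2) = 100,
    # i.e. we validate once per epoch.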
    # record the parameters of the best model (never actually used later)
best_params = None
    # record the best validation loss seen so far
best_validation_loss = numpy.inf
best_iter = 0
test_score = 0.
    # start the timer
start_time = time.clock()
    # initialize the epoch counter to 0; epoch is just the loop variable
epoch = 0
    # flag for stopping early
done_looping = False
while ( epoch < n_epochs ) and ( not done_looping ):
epoch = epoch + 1
        # xrange(foo) lazily yields the integers 0 .. foo-1 (it does not build a list).
        # Equivalent to: for (int minibatch_index = 0; minibatch_index < n_train_batches; minibatch_index++)
for minibatch_index in xrange( n_train_batches ):
            # iter is a counter: the number of minibatch updates so far
iter = ( epoch - 1 ) * n_train_batches + minibatch_index
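            # (My annotation: e.g. in epoch 2 with minibatch_index = 5 and
            # 100 training batches, iter = 1 * 100 + 5 = 105.)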
            # print a progress message (purely informational)
if iter % 100 == 0:
print ( 'training @ iter = ', iter )
##### ##### ##### ##### ##### #####
            # NOTE: the training function is called here,
            # on all the training examples of minibatch number minibatch_index
cost_ij = train_model( minibatch_index )
##### ##### ##### ##### ##### #####
            # once we reach the validation interval, run validation:
            # a periodic progress test on held-out data
if ( iter + 1 ) % validation_frequency == 0:
                # run the validation data through the model to get a zero-one error rate
# compute zero-one loss on validation set
##### ##### ##### ##### ##### #####
                # NOTE: the validation function is called here,
                # on the entire validation set (10000 examples)
validation_losses = [validate_model( i ) for i in xrange( n_valid_batches )]
##### ##### ##### ##### ##### #####
                # the mean over batches is this round's validation error rate
this_validation_loss = numpy.mean( validation_losses )
print( 'epoch %i, minibatch %i/%i, validation error %f %%' % \
( epoch, minibatch_index + 1, n_train_batches, \
this_validation_loss * 100. ) )
# if we got the best validation score until now
                # i.e. if the validation error went down
if this_validation_loss < best_validation_loss:
# improve patience if loss improvement is good enough
                    # if the improvement is significant, raise patience
                    # (to patience_increase times the current iteration count, if larger)
if this_validation_loss < best_validation_loss * improvement_threshold:
patience = max( patience, iter * patience_increase )
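                        # (My annotation: e.g. a sufficient improvement at
                        # iter = 6000 raises patience from 10000 to 12000.)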
                    # save the best validation score and iteration number
best_validation_loss = this_validation_loss
best_iter = iter
                    # test it on the test set
##### ##### ##### ##### ##### #####
                    # NOTE: the test function is called here,
                    # on the entire test set (10000 examples)
test_losses = [test_model( i ) for i in xrange( n_test_batches )]
##### ##### ##### ##### ##### #####
                    # the mean over batches is the test error rate
test_score = numpy.mean( test_losses )
print( ( ' epoch %i, minibatch %i/%i, test error of best '
'model %f %%' ) %
( epoch, minibatch_index + 1, n_train_batches,
test_score * 100. ) )
            # if iter has exceeded patience, break out of the for loop and
            # let done_looping end the outer while loop
if patience <= iter:
done_looping = True
break
    # compute the total running time
end_time = time.clock()
    # print the results
print( 'Optimization complete.' )
    print( 'Best validation score of %f %% obtained at iteration %i, '
           'with test performance %f %%' %
( best_validation_loss * 100., best_iter + 1, test_score * 100. ) )
    # report the running time on stderr
print >> sys.stderr, ( 'The code for file ' +
os.path.split( __file__ )[1] +
' ran for %.2fm' % ( ( end_time - start_time ) / 60. ) )
if __name__ == '__main__':
evaluate_lenet5()
def experiment( state, channel ):
evaluate_lenet5( state.learning_rate, dataset = state.dataset )
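# (My annotation: the (state, channel) signature appears to match what
# Jobman-style experiment schedulers expect; presumably that is why this
# hook exists in the original tutorial code.)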
# Annotations by jinyu121