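# -*- coding: utf-8 -*-
# convolutional_mlp.py -- annotated
#
# I simply felt I would be letting myself down if I did not publish this.
# Theano is a magical thing -- so magical that much of it is hard to follow. I spent several days reading its documentation, only got more confused the longer I read, and finally deleted it in a fit of anger.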
"""This tutorial introduces the LeNet5 neural network architecture
using Theano.  LeNet5 is a convolutional neural network, good for
classifying images. This tutorial shows how to build the architecture,
and comes with all the hyper-parameters you need to reproduce the
paper's MNIST results.


This implementation simplifies the model in the following ways:

 - LeNetConvPool doesn't implement location-specific gain and bias parameters
 - LeNetConvPool doesn't implement pooling by average, it implements pooling
   by max.
 - Digit classification is implemented with a logistic regression rather than
   an RBF network
 - The second-layer convolutions here are fully connected to every input
   feature map, which was not the case in the original LeNet5

References:
 - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner:
   Gradient-Based Learning Applied to Document
   Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
   http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf

Chinese comment:
 - Yu Hao:
   [email protected]

"""
from theano.tensor.nnet import conv
from theano.tensor.signal import downsample
import cPickle
import gzip
import numpy
import os
import sys
import theano
import time

from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
import theano.tensor as T


class LeNetConvPoolLayer( object ):
    """Convolution followed by max-pooling: one layer of a LeNet-style network."""

    def __init__( self, rng, input, filter_shape, image_shape, poolsize = ( 2, 2 ) ):
        """
        Build the symbolic expression of a convolution / max-pooling layer and
        allocate its parameters as Theano shared variables.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height,filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows,#cols)
        """
        # The images must provide exactly as many feature maps as every
        # filter expects to read.
        assert image_shape[1] == filter_shape[1]

        # Keep a handle on the symbolic input of this layer.
        self.input = input

        n_filters = filter_shape[0]

        # Each hidden unit reads
        # "num input feature maps * filter height * filter width" inputs.
        fan_in = numpy.prod( filter_shape[1:] )

        # Each unit in the lower layer receives a gradient from
        # "num output feature maps * filter height * filter width / pooling size".
        fan_out = n_filters * numpy.prod( filter_shape[2:] ) / numpy.prod( poolsize )

        # Filter weights are drawn uniformly from [-W_bound, W_bound], where
        # the bound shrinks as the fan-in plus fan-out grows.
        W_bound = numpy.sqrt( 6. / ( fan_in + fan_out ) )
        initial_W = rng.uniform( low = -W_bound, high = W_bound, size = filter_shape )
        self.W = theano.shared( 
            numpy.asarray( initial_W, dtype = theano.config.floatX ),
            borrow = True )

        # The bias is a 1D tensor holding one zero-initialised value per
        # output feature map.
        initial_b = numpy.zeros( ( n_filters, ), dtype = theano.config.floatX )
        self.b = theano.shared( value = initial_b, borrow = True )

        # Convolve the input feature maps with the filters.
        conv_out = conv.conv2d( 
            input = input,
            filters = self.W,
            filter_shape = filter_shape,
            image_shape = image_shape )

        # Downsample each feature map individually using max-pooling.
        pooled_out = downsample.max_pool_2d( 
            input = conv_out,
            ds = poolsize,
            ignore_border = True )

        # The bias vector is reshaped to (1, n_filters, 1, 1) so that it is
        # broadcast across mini-batches and across feature-map height and
        # width before the tanh non-linearity is applied.
        broadcast_bias = self.b.dimshuffle( 'x', 0, 'x', 'x' )
        self.output = T.tanh( pooled_out + broadcast_bias )

        # Trainable parameters of this layer.
        self.params = [self.W, self.b]


def evaluate_lenet5( learning_rate = 0.1,    # SGD step size
                     n_epochs = 200,    # maximum number of training epochs
                     dataset = 'mnist.pkl.gz',    # dataset file
                     nkerns = [20, 50],    # filters per conv layer (only read, never mutated)
                     batch_size = 500    # examples per minibatch
                     ):
    """ Demonstrates lenet on MNIST dataset

    Builds two convolution/max-pooling layers, a tanh hidden layer and a
    logistic-regression classifier, trains them with minibatch SGD under an
    early-stopping rule, and prints validation / test error along the way.
    The total run time is written to stderr at the end. Nothing is returned.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples per minibatch; it is baked into the
                       reshape and image_shape of the model, so every batch
                       fed to the compiled functions has exactly this size
    """
    # Fixed seed so that weight initialisation is reproducible.
    rng = numpy.random.RandomState( 23455 )

    # load_data returns three (x, y) pairs: training, validation and test.
    # x holds the rasterized images (one per row), y holds the labels.
    # The original annotation gives the sizes as 50000 / 10000 / 10000.
    datasets = load_data( dataset )
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing.
    # shape[0] is the number of examples; floor division is explicit so the
    # counts stay integers and any trailing partial batch is dropped.
    n_train_batches = train_set_x.get_value( borrow = True ).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value( borrow = True ).shape[0] // batch_size
    n_test_batches = test_set_x.get_value( borrow = True ).shape[0] // batch_size

    # allocate symbolic variables for the data
    # index selects which minibatch the compiled functions operate on.
    index = T.lscalar()    # index to a [mini]batch
    # x and y are symbolic placeholders; concrete slices of the datasets are
    # substituted for them through `givens` below.
    x = T.matrix( 'x' )    # the data is presented as rasterized images
    y = T.ivector( 'y' )    # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print ( '... building the model' )

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer:
    # (batch_size, 784) -> (batch_size, 1, 28, 28).
    layer0_input = x.reshape( ( batch_size, 1, 28, 28 ) )

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer( rng, input = layer0_input,
            image_shape = ( batch_size, 1, 28, 28 ),
            filter_shape = ( nkerns[0], 1, 5, 5 ), poolsize = ( 2, 2 ) )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer( rng, input = layer0.output,
            image_shape = ( batch_size, nkerns[0], 12, 12 ),
            filter_shape = ( nkerns[1], nkerns[0], 5, 5 ), poolsize = ( 2, 2 ) )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # Flattening layer1's output gives a matrix of shape
    # (batch_size, nkerns[1]*4*4), i.e. (500, 800) with the defaults.
    layer2_input = layer1.output.flatten( 2 )

    # construct a fully-connected layer with tanh activation; its input
    # width matches the flattened layer1 output and it has 500 units.
    layer2 = HiddenLayer( rng, input = layer2_input, n_in = nkerns[1] * 4 * 4,
                         n_out = 500, activation = T.tanh )

    # classify the values of the fully-connected layer:
    # 500 inputs (layer2's width), 10 outputs (one per digit 0-9).
    layer3 = LogisticRegression( input = layer2.output, n_in = 500, n_out = 10 )

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood( y )

    # create functions to compute the mistakes that are made by the model.
    # `givens` replaces the symbolic x / y by the rows
    # [index * batch_size, (index + 1) * batch_size) of the chosen dataset;
    # e.g. with batch_size 500, index 2 selects rows 1000..1499.
    # Input: index.  Output: layer3.errors(y) on that minibatch.
    test_model = theano.function( [index], layer3.errors( y ),
             givens = {
                x: test_set_x[index * batch_size: ( index + 1 ) * batch_size],
                y: test_set_y[index * batch_size: ( index + 1 ) * batch_size]} )

    validate_model = theano.function( [index], layer3.errors( y ),
            givens = {
                x: valid_set_x[index * batch_size: ( index + 1 ) * batch_size],
                y: valid_set_y[index * batch_size: ( index + 1 ) * batch_size]} )

    # create a list of all model parameters to be fit by gradient descent
    # (each layer exposes its own [W, b] through `params`).
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters: the partial
    # derivative of the cost with respect to each entry of `params`.
    grads = T.grad( cost, params )

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list automatically from all
    # (params[i],grads[i]) pairs.
    updates = [( param_i, param_i - learning_rate * grad_i )
               for param_i, grad_i in zip( params, grads )]

    # Input: index.  Output: cost on that minibatch.  Every call also
    # applies `updates`, i.e. performs one SGD step.
    train_model = theano.function( [index], cost, updates = updates,
          givens = {
            x: train_set_x[index * batch_size: ( index + 1 ) * batch_size],
            y: train_set_y[index * batch_size: ( index + 1 ) * batch_size]} )

    ###############
    # TRAIN MODEL #
    ###############
    print ( '... training' )

    # early-stopping parameters
    # Run at least this many minibatch iterations regardless; training stops
    # once the iteration counter reaches `patience`.
    patience = 10000
    # wait this much longer when a new best is found: patience is raised to
    # at least (current iteration * patience_increase).
    patience_increase = 2
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min( n_train_batches, patience // 2 )

    # Best validation error seen so far and where it occurred.
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # NOTE: time.clock() is the Python 2 era timer this file was written
    # against (it was removed in Python 3.8).
    start_time = time.clock()

    epoch = 0
    # Set when early stopping triggers, to leave the outer while loop too.
    done_looping = False

    while ( epoch < n_epochs ) and ( not done_looping ):
        epoch = epoch + 1
        for minibatch_index in xrange( n_train_batches ):
            # Number of minibatch updates performed before this one.
            # Named `iteration` to avoid shadowing the builtin `iter`.
            iteration = ( epoch - 1 ) * n_train_batches + minibatch_index
            # Progress message. Formatted into a single string so it is not
            # printed as a tuple by the Python 2 print statement.
            if iteration % 100 == 0:
                print( 'training @ iter = %i' % iteration )

            # One SGD step on this minibatch; the returned cost is not used.
            train_model( minibatch_index )

            # Periodic check on the validation set.
            if ( iteration + 1 ) % validation_frequency == 0:
                # compute zero-one loss on validation set, averaged over all
                # validation minibatches
                validation_losses = [validate_model( i ) for i in xrange( n_valid_batches )]
                this_validation_loss = numpy.mean( validation_losses )
                print( 'epoch %i, minibatch %i/%i, validation error %f %%' %
                      ( epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100. ) )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max( patience, iteration * patience_increase )

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set, averaged over all test
                    # minibatches
                    test_losses = [test_model( i ) for i in xrange( n_test_batches )]
                    test_score = numpy.mean( test_losses )
                    print( ( '     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%' ) %
                          ( epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100. ) )

            # Out of patience: leave the for loop and flag the while loop.
            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print( 'Optimization complete.' )
    print( 'Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          ( best_validation_loss * 100., best_iter + 1, test_score * 100. ) )
    # Report the elapsed time (in minutes) on stderr.
    sys.stderr.write( 'The code for file ' +
                      os.path.split( __file__ )[1] +
                      ' ran for %.2fm' % ( ( end_time - start_time ) / 60. ) +
                      '\n' )

# When the file is executed as a script, run the whole experiment with the
# default arguments of evaluate_lenet5.
if __name__ == '__main__':
    evaluate_lenet5()


def experiment( state, channel ):
    """Run evaluate_lenet5 with the settings carried by ``state``.

    :param state: object exposing ``learning_rate`` and ``dataset``
                  attributes, which are forwarded to evaluate_lenet5
    :param channel: not used by this function
                    (NOTE(review): presumably required by an external
                    experiment runner's calling convention -- confirm)
    """
    evaluate_lenet5( learning_rate = state.learning_rate,
                     dataset = state.dataset )

# Chinese annotations by jinyu121
# Share to: