BackPropagationLearningをSIMDに対応してみた

Accord.NET(AForge.NET)のBackPropagationLearningをSIMDを使用するように修正してみた。
速度的には以前に作ったマルチスレッド対応版のBackPropagationLearningの1.2倍程度の早さになってます。
簡略版なので並列数は4に固定。したがってAVX非対応のCPUだと動きません。
並列数2だと、おそらく速度的に変わらないので作るつもりもありません。
DeepBeliefNetworkなど関連するクラスも合わせて修正すればもっと早くなるのだろうけど、とりあえずBackPropagationLearningのみの対応。流石に全部はボリューム多すぎるし・・・orz

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading;
using System.Numerics;
using System.Diagnostics;

using AForge.Neuro;
using AForge.Neuro.Learning;

namespace LernDetectImage
{
    class SimdBackPropagationLearning : BackPropagationLearning
    {
        // network to teach
        private ActivationNetwork network;

        // learning rate
        private double learningRate = 0.1;

        // momentum
        private double momentum = 0.0;

        // neuron's errors
        private Vector<double>[][] neuronErrors = null;

        // weight's updates
        private Vector<double>[][][] weightsUpdates = null;

        // threshold's updates
        private Vector<double>[][] thresholdsUpdates = null;

        public new double LearningRate
        {
            get { return learningRate; }
            set
            {
                learningRate = Math.Max(0.0, Math.Min(1.0, value));
            }
        }

        public new double Momentum
        {
            get { return momentum; }
            set
            {
                momentum = Math.Max(0.0, Math.Min(1.0, value));
            }
        }

        public SimdBackPropagationLearning(ActivationNetwork network) : base(network)
        {
            this.network = network;

            // create error and deltas arrays
            neuronErrors = new Vector<double>[network.Layers.Length][];
            weightsUpdates = new Vector<double>[network.Layers.Length][][];
            thresholdsUpdates = new Vector<double>[network.Layers.Length][];

            // initialize errors and deltas arrays for each layer
            for (int i = 0; i < network.Layers.Length; i++)
            {
                Layer layer = network.Layers[i];

                neuronErrors[i] = new Vector<double>[layer.Neurons.Length / 4];
                weightsUpdates[i] = new Vector<double>[layer.Neurons.Length][];
                thresholdsUpdates[i] = new Vector<double>[layer.Neurons.Length / 4];

                // for each neuron
                for (int j = 0; j < weightsUpdates[i].Length; j++)
                {
                    weightsUpdates[i][j] = new Vector<double>[layer.InputsCount / 4];
                }
            }
        }

        public new double Run(double[] input, double[] output)
        {
            // compute the network's output
            network.Compute(input);

            // calculate network error
            double error = CalculateError(output);

            // calculate weights updates
            CalculateUpdates(input);

            // update the network
            UpdateNetwork();

            return error;

        }

        public new double RunEpoch(double[][] input, double[][] output)
        {
            double error = 0.0;

            // run learning procedure for all samples
            for (int i = 0; i < input.Length; i++)
            {
                error += Run(input[i], output[i]);
            }

            // return summary error
            return error;
        }

        private double CalculateError(double[] desiredOutput)
        {
            // current and the next layers
            Layer layer, layerNext;
            // current and the next errors arrays
            Vector<double>[] errors, errorsNext;
            // error values
            double error = 0;
            // layers count
            int layersCount = network.Layers.Length;

            // vecrorize output
            Vector<double>[] desiredOutputVector = new Vector<double>[desiredOutput.Length / 4];
            for (int i = 0; i < desiredOutputVector.Length; i++)
            {
                desiredOutputVector[i] = new Vector<double>(desiredOutput, i * 4);
            }

            // assume, that all neurons of the network have the same activation function
            IActivationFunction function = (network.Layers[0].Neurons[0] as ActivationNeuron).ActivationFunction;

            // calculate error values for the last layer first
            layer = network.Layers[layersCount - 1];
            errors = neuronErrors[layersCount - 1];
            int outputLoopCnt = layer.Neurons.Length / 4;
            Vector<double>[] errorWork = new Vector<double>[outputLoopCnt];
            Parallel.For(0, outputLoopCnt, new ParallelOptions { MaxDegreeOfParallelism = 16 }, i =>
            {
                // neuron's output value
                double[] vectorInitTemp = new double[4];
                vectorInitTemp[0] = layer.Neurons[i * 4 + 0].Output;
                vectorInitTemp[1] = layer.Neurons[i * 4 + 1].Output;
                vectorInitTemp[2] = layer.Neurons[i * 4 + 2].Output;
                vectorInitTemp[3] = layer.Neurons[i * 4 + 3].Output;
                Vector<double> output = new Vector<double>(vectorInitTemp);

                // error of the neuron
                Vector<double> e = desiredOutputVector[i] - output;

                // error multiplied with activation function's derivative
                vectorInitTemp[0] = function.Derivative2(output[0]);
                vectorInitTemp[1] = function.Derivative2(output[1]);
                vectorInitTemp[2] = function.Derivative2(output[2]);
                vectorInitTemp[3] = function.Derivative2(output[3]);
                Vector<double> derivative = new Vector<double>(vectorInitTemp);
                errors[i] = e * derivative;

                // squre the error and sum it
                errorWork[i] = (e * e);
            });

            // エラー積算値の算出
            Vector<double> errorTemp = Vector<double>.Zero;
            for (int i = 0;i < outputLoopCnt;i++)
            {
                errorTemp += errorWork[i];
            }
            error = errorTemp[0] + errorTemp[1] + errorTemp[2] + errorTemp[3];

            // calculate error values for other layers
            for (int j = layersCount - 2; j >= 0; j--)
            {
                layer = network.Layers[j];
                layerNext = network.Layers[j + 1];
                errors = neuronErrors[j];
                errorsNext = neuronErrors[j + 1];

                // for all neurons of the layer
                int nextNyuronsLengthTemp = layerNext.Neurons.Length / 4;
                Parallel.For(0, (layer.Neurons.Length / 4), new ParallelOptions { MaxDegreeOfParallelism = 16 }, i =>
                {
                    double[] vectorInitTemp = new double[4];
                    Vector<double> sum = Vector<double>.Zero;

                    // for all neurons of the next layer
                    for (int k = 0; k < nextNyuronsLengthTemp; k++)
                    {
                        for (int l = 0; l < 4; l++)
                        {
                            vectorInitTemp[0] = layerNext.Neurons[k * 4 + l].Weights[i * 4 + 0];
                            vectorInitTemp[1] = layerNext.Neurons[k * 4 + l].Weights[i * 4 + 1];
                            vectorInitTemp[2] = layerNext.Neurons[k * 4 + l].Weights[i * 4 + 2];
                            vectorInitTemp[3] = layerNext.Neurons[k * 4 + l].Weights[i * 4 + 3];
                            Vector<double> weightsTemp = new Vector<double>(vectorInitTemp);
                            sum += errorsNext[k] * weightsTemp;
                        }
                    }

                    vectorInitTemp[0] = function.Derivative2(layer.Neurons[i * 4 + 0].Output);
                    vectorInitTemp[1] = function.Derivative2(layer.Neurons[i * 4 + 1].Output);
                    vectorInitTemp[2] = function.Derivative2(layer.Neurons[i * 4 + 2].Output);
                    vectorInitTemp[3] = function.Derivative2(layer.Neurons[i * 4 + 3].Output);
                    Vector<double> derivative = new Vector<double>(vectorInitTemp);

                    errors[i] = sum * derivative;
                });
            }

            // return squared error of the last layer divided by 2
            return error / 2.0;
        }

        private void CalculateUpdates(double[] input)
        {
            // current and previous layers
            Layer layer, layerPrev;

            // layer's weights updates
            Vector<double>[][] layerWeightsUpdates;

            // layer's thresholds updates
            Vector<double>[] layerThresholdUpdates;

            // layer's error
            Vector<double>[] errors;

            // vecrorize input
            Vector<double>[] inputVector = new Vector<double>[input.Length / 4];
            Parallel.For(0, inputVector.Length, new ParallelOptions { MaxDegreeOfParallelism = 16 }, i =>
            {
                inputVector[i] = new Vector<double>(input, i * 4);
            });

            // 1 - calculate updates for the first layer
            layer = network.Layers[0];
            errors = neuronErrors[0];
            layerWeightsUpdates = weightsUpdates[0];
            layerThresholdUpdates = thresholdsUpdates[0];

            // cache for frequently used values
            //double cachedMomentum = learningRate * momentum;
            //double cached1mMomentum = learningRate * (1 - momentum);
            Vector<double> cachedMomentum = Vector.Multiply(Vector<double>.One, learningRate * momentum);
            Vector<double> cached1mMomentum = Vector.Multiply(Vector<double>.One, learningRate * (1 - momentum));

            // for each neuron of the layer
            Parallel.For(0, (layer.Neurons.Length / 4), new ParallelOptions { MaxDegreeOfParallelism = 16 }, i =>
            {
                Vector<double> cachedError = Vector.Multiply(cached1mMomentum, errors[i]);
                Vector<double>[][] neuronWeightUpdates = new Vector<double>[4][];
                neuronWeightUpdates[0] = layerWeightsUpdates[i * 4 + 0];
                neuronWeightUpdates[1] = layerWeightsUpdates[i * 4 + 1];
                neuronWeightUpdates[2] = layerWeightsUpdates[i * 4 + 2];
                neuronWeightUpdates[3] = layerWeightsUpdates[i * 4 + 3];

                // for each weight of the neuron
                int neuronWeightUpdatesTemp = neuronWeightUpdates[0].Length;
                for (int j = 0; j < neuronWeightUpdatesTemp; j++)
                {
                    // calculate weight update
                    for (int k = 0;k < 4;k++)
                    {
                        neuronWeightUpdates[k][j] = Vector.Multiply(cachedMomentum, neuronWeightUpdates[k][j]) + Vector.Multiply(cachedError[k], inputVector[j]);
                    }
                }

                // calculate treshold update
                layerThresholdUpdates[i] = Vector.Multiply(cachedMomentum, layerThresholdUpdates[i]) + cachedError;
            });


            // 2 - for all other layers
            int layersLengthTemp = network.Layers.Length;
            for (int k = 1; k < layersLengthTemp; k++)
            {
                layerPrev = network.Layers[k - 1];
                layer = network.Layers[k];
                errors = neuronErrors[k];
                layerWeightsUpdates = weightsUpdates[k];
                layerThresholdUpdates = thresholdsUpdates[k];

                // for each neuron of the layer
                int neuronWeightUpdatesTemp = layerWeightsUpdates[0].Length;
                Parallel.For(0, (layer.Neurons.Length / 4), new ParallelOptions { MaxDegreeOfParallelism = 16 }, i =>
                {
                    double[] vectorInitTemp = new double[4];
                    Vector<double> cachedError = Vector.Multiply(cached1mMomentum, errors[i]);
                    Vector<double>[][] neuronWeightUpdates = new Vector<double>[4][];
                    neuronWeightUpdates[0] = layerWeightsUpdates[i * 4 + 0];
                    neuronWeightUpdates[1] = layerWeightsUpdates[i * 4 + 1];
                    neuronWeightUpdates[2] = layerWeightsUpdates[i * 4 + 2];
                    neuronWeightUpdates[3] = layerWeightsUpdates[i * 4 + 3];

                    // for each synapse of the neuron
                    for (int j = 0; j < neuronWeightUpdatesTemp; j++)
                    {
                        // calculate weight update
                        vectorInitTemp[0] = layerPrev.Neurons[j * 4 + 0].Output;
                        vectorInitTemp[1] = layerPrev.Neurons[j * 4 + 1].Output;
                        vectorInitTemp[2] = layerPrev.Neurons[j * 4 + 2].Output;
                        vectorInitTemp[3] = layerPrev.Neurons[j * 4 + 3].Output;
                        Vector<double> neuronsOutput = new Vector<double>(vectorInitTemp);
                        for (int l = 0; l < 4; l++)
                        {
                            neuronWeightUpdates[l][j] = Vector.Multiply(cachedMomentum, neuronWeightUpdates[l][j]) + Vector.Multiply(cachedError[l], neuronsOutput);
                        }
                    }

                    // calculate treshold update
                    layerThresholdUpdates[i] = Vector.Multiply(cachedMomentum, layerThresholdUpdates[i]) + cachedError;
                });
            }
        }

        private void UpdateNetwork()
        {
            // current layer
            Layer layer;
            // layer's weights updates
            Vector<double>[][] layerWeightsUpdates;
            // layer's thresholds updates
            Vector<double>[] layerThresholdUpdates;

            // for each layer of the network
            int layersLengthTemp = network.Layers.Length;
            for (int i = 0; i < layersLengthTemp; i++)
            {
                layer = network.Layers[i];
                layerWeightsUpdates = weightsUpdates[i];
                layerThresholdUpdates = thresholdsUpdates[i];

                // 誘導変数の使用
                int weightsLengthTemp = layer.Neurons[0].Weights.Length / 4;

                // for each neuron of the layer
                Parallel.For(0, (layer.Neurons.Length / 4), j =>
                {
                    ActivationNeuron[] neuron = new ActivationNeuron[4];
                    neuron[0] = layer.Neurons[j * 4 + 0] as ActivationNeuron;
                    neuron[1] = layer.Neurons[j * 4 + 1] as ActivationNeuron;
                    neuron[2] = layer.Neurons[j * 4 + 2] as ActivationNeuron;
                    neuron[3] = layer.Neurons[j * 4 + 3] as ActivationNeuron;

                    Vector<double>[][] neuronWeightUpdates = new Vector<double>[4][];
                    neuronWeightUpdates[0] = layerWeightsUpdates[j * 4 + 0];
                    neuronWeightUpdates[1] = layerWeightsUpdates[j * 4 + 1];
                    neuronWeightUpdates[2] = layerWeightsUpdates[j * 4 + 2];
                    neuronWeightUpdates[3] = layerWeightsUpdates[j * 4 + 3];

                    // for each weight of the neuron
                    for (int k = 0; k < weightsLengthTemp; k++)
                    {
                        for (int l = 0; l < 4; l++)
                        {
                            // update weight
                            neuron[l].Weights[k * 4 + 0] += neuronWeightUpdates[l][k][0];
                            neuron[l].Weights[k * 4 + 1] += neuronWeightUpdates[l][k][1];
                            neuron[l].Weights[k * 4 + 2] += neuronWeightUpdates[l][k][2];
                            neuron[l].Weights[k * 4 + 3] += neuronWeightUpdates[l][k][3];
                        }
                    }

                    // update treshold
                    neuron[0].Threshold += layerThresholdUpdates[j][0];
                    neuron[1].Threshold += layerThresholdUpdates[j][1];
                    neuron[2].Threshold += layerThresholdUpdates[j][2];
                    neuron[3].Threshold += layerThresholdUpdates[j][3];
                });
            }
        }
    }
}

Download:SimdBackPropagationLearning

コメントを残す

メールアドレスが公開されることはありません。 が付いている欄は必須項目です