Accord.NET(AForge.NET)のBackPropagationLearningをSIMDを使用するように修正してみた。
速度的には、以前に作ったマルチスレッド対応版のBackPropagationLearningの1.2倍程度の速さになっています。
簡略版なので並列数は4に固定。したがってAVX非対応のCPUだと動きません。
並列数2だと、おそらく速度的に変わらないので作るつもりもありません。
DeepBeliefNetworkなど関連するクラスも合わせて修正すればもっと速くなるのだろうけど、とりあえずBackPropagationLearningのみの対応。流石に全部はボリューム多すぎるし・・・orz
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading;
using System.Numerics;
using System.Diagnostics;
using AForge.Neuro;
using AForge.Neuro.Learning;

namespace LernDetectImage
{
    /// <summary>
    /// Back-propagation learning that keeps its error and update buffers in
    /// <see cref="Vector{T}"/> blocks and processes neurons one SIMD block at a time,
    /// with the blocks of a layer distributed over <see cref="Parallel"/> loops.
    /// </summary>
    /// <remarks>
    /// <para>
    /// Neurons and inputs are processed in groups of <c>Vector&lt;double&gt;.Count</c>.
    /// All block counts are computed with integer division, so when a layer's neuron count
    /// or input count is not a multiple of that width the trailing neurons/inputs are not
    /// trained by this class.
    /// </para>
    /// <para>
    /// <see cref="Run"/>, <see cref="RunEpoch"/>, <see cref="LearningRate"/> and
    /// <see cref="Momentum"/> are declared with <c>new</c>; they hide the base members
    /// rather than override them, so they are only used when called through a reference
    /// typed as <see cref="SimdBackPropagationLearning"/>.
    /// </para>
    /// </remarks>
    class SimdBackPropagationLearning : BackPropagationLearning
    {
        // number of doubles per SIMD vector on the current hardware
        private static readonly int Lanes = Vector<double>.Count;

        // shared options for the per-layer parallel loops
        private static readonly ParallelOptions LoopOptions =
            new ParallelOptions { MaxDegreeOfParallelism = 16 };

        // network to teach
        private ActivationNetwork network;
        // learning rate
        private double learningRate = 0.1;
        // momentum
        private double momentum = 0.0;

        // neuron's errors: [layer][neuron block]
        private Vector<double>[][] neuronErrors = null;
        // weight's updates: [layer][neuron][input block]
        private Vector<double>[][][] weightsUpdates = null;
        // threshold's updates: [layer][neuron block]
        private Vector<double>[][] thresholdsUpdates = null;

        /// <summary>
        /// Learning rate; assigned values are clamped to the range [0, 1].
        /// </summary>
        public new double LearningRate
        {
            get { return learningRate; }
            set { learningRate = Math.Max(0.0, Math.Min(1.0, value)); }
        }

        /// <summary>
        /// Momentum; assigned values are clamped to the range [0, 1].
        /// </summary>
        public new double Momentum
        {
            get { return momentum; }
            set { momentum = Math.Max(0.0, Math.Min(1.0, value)); }
        }

        /// <summary>
        /// Creates the teacher and allocates the per-layer error and update buffers.
        /// </summary>
        /// <param name="network">Network to teach.</param>
        public SimdBackPropagationLearning(ActivationNetwork network)
            : base(network)
        {
            this.network = network;

            int layerCount = network.Layers.Length;

            // create error and deltas arrays
            neuronErrors = new Vector<double>[layerCount][];
            weightsUpdates = new Vector<double>[layerCount][][];
            thresholdsUpdates = new Vector<double>[layerCount][];

            // initialize errors and deltas arrays for each layer
            for (int i = 0; i < layerCount; i++)
            {
                Layer layer = network.Layers[i];
                int neuronBlocks = layer.Neurons.Length / Lanes;
                int inputBlocks = layer.InputsCount / Lanes;

                neuronErrors[i] = new Vector<double>[neuronBlocks];
                weightsUpdates[i] = new Vector<double>[layer.Neurons.Length][];
                thresholdsUpdates[i] = new Vector<double>[neuronBlocks];

                // for each neuron
                for (int j = 0; j < weightsUpdates[i].Length; j++)
                {
                    weightsUpdates[i][j] = new Vector<double>[inputBlocks];
                }
            }
        }

        /// <summary>
        /// Runs one learning iteration on a single sample.
        /// </summary>
        /// <param name="input">Input vector.</param>
        /// <param name="output">Desired output vector.</param>
        /// <returns>Half of the summed squared output error for this sample.</returns>
        public new double Run(double[] input, double[] output)
        {
            // compute the network's output
            network.Compute(input);

            // calculate network error
            double error = CalculateError(output);

            // calculate weights updates
            CalculateUpdates(input);

            // update the network
            UpdateNetwork();

            return error;
        }

        /// <summary>
        /// Runs <see cref="Run"/> once for every sample, in order.
        /// </summary>
        /// <param name="input">Input vectors.</param>
        /// <param name="output">Desired output vectors, one per input.</param>
        /// <returns>Sum of the per-sample errors.</returns>
        public new double RunEpoch(double[][] input, double[][] output)
        {
            double error = 0.0;

            // run learning procedure for all samples
            for (int i = 0; i < input.Length; i++)
            {
                error += Run(input[i], output[i]);
            }

            // return summary error
            return error;
        }

        /// <summary>
        /// Packs the outputs of <c>Lanes</c> consecutive neurons into one vector.
        /// </summary>
        private static Vector<double> LoadOutputs(Neuron[] neurons, int start, double[] scratch)
        {
            for (int m = 0; m < Lanes; m++)
            {
                scratch[m] = neurons[start + m].Output;
            }
            return new Vector<double>(scratch);
        }

        /// <summary>
        /// Packs <c>Derivative2(output)</c> of <c>Lanes</c> consecutive neurons into one vector.
        /// </summary>
        private static Vector<double> LoadDerivatives(IActivationFunction function, Neuron[] neurons, int start, double[] scratch)
        {
            for (int m = 0; m < Lanes; m++)
            {
                scratch[m] = function.Derivative2(neurons[start + m].Output);
            }
            return new Vector<double>(scratch);
        }

        /// <summary>
        /// Packs the outputs of the first <paramref name="blockCount"/> neuron blocks of a layer.
        /// </summary>
        private static Vector<double>[] PackLayerOutputs(Layer layer, int blockCount)
        {
            Vector<double>[] packed = new Vector<double>[blockCount];
            double[] scratch = new double[Lanes];

            for (int b = 0; b < blockCount; b++)
            {
                packed[b] = LoadOutputs(layer.Neurons, b * Lanes, scratch);
            }
            return packed;
        }

        /// <summary>
        /// Fills <c>neuronErrors</c> for every layer, starting at the output layer and
        /// propagating backwards.
        /// </summary>
        /// <param name="desiredOutput">Desired output vector.</param>
        /// <returns>Summed squared error of the output layer divided by 2.</returns>
        private double CalculateError(double[] desiredOutput)
        {
            // layers count
            int layersCount = network.Layers.Length;

            // vectorize desired output
            Vector<double>[] desiredOutputVector = new Vector<double>[desiredOutput.Length / Lanes];
            for (int i = 0; i < desiredOutputVector.Length; i++)
            {
                desiredOutputVector[i] = new Vector<double>(desiredOutput, i * Lanes);
            }

            // assume, that all neurons of the network have the same activation function
            IActivationFunction function =
                ((ActivationNeuron)network.Layers[0].Neurons[0]).ActivationFunction;

            // calculate error values for the last layer first
            Layer outputLayer = network.Layers[layersCount - 1];
            Vector<double>[] outputErrors = neuronErrors[layersCount - 1];
            int outputBlocks = outputLayer.Neurons.Length / Lanes;
            Vector<double>[] squaredErrors = new Vector<double>[outputBlocks];

            Parallel.For(0, outputBlocks, LoopOptions, i =>
            {
                double[] scratch = new double[Lanes];

                // neurons' output values
                Vector<double> actual = LoadOutputs(outputLayer.Neurons, i * Lanes, scratch);
                // error of the neurons
                Vector<double> e = desiredOutputVector[i] - actual;
                // error multiplied with activation function's derivative
                outputErrors[i] = e * LoadDerivatives(function, outputLayer.Neurons, i * Lanes, scratch);
                // square the error; summed after the loop so no shared accumulator is needed
                squaredErrors[i] = e * e;
            });

            // accumulate the squared error, first across blocks and then across lanes
            Vector<double> total = Vector<double>.Zero;
            for (int i = 0; i < outputBlocks; i++)
            {
                total += squaredErrors[i];
            }

            double error = 0.0;
            for (int m = 0; m < Lanes; m++)
            {
                error += total[m];
            }

            // calculate error values for other layers
            for (int j = layersCount - 2; j >= 0; j--)
            {
                Layer layer = network.Layers[j];
                Layer layerNext = network.Layers[j + 1];
                Vector<double>[] errors = neuronErrors[j];
                Vector<double>[] errorsNext = neuronErrors[j + 1];
                int nextBlocks = layerNext.Neurons.Length / Lanes;

                // for all neuron blocks of the layer
                Parallel.For(0, layer.Neurons.Length / Lanes, LoopOptions, i =>
                {
                    double[] scratch = new double[Lanes];
                    Vector<double> sum = Vector<double>.Zero;

                    // for all neurons of the next layer
                    for (int k = 0; k < nextBlocks; k++)
                    {
                        Vector<double> nextError = errorsNext[k];

                        for (int l = 0; l < Lanes; l++)
                        {
                            // weights from this block's neurons into next-layer neuron k * Lanes + l
                            Vector<double> weights =
                                new Vector<double>(layerNext.Neurons[k * Lanes + l].Weights, i * Lanes);

                            // scale by that single neuron's error (lane l), not by the
                            // whole error vector of the block
                            sum += Vector.Multiply(nextError[l], weights);
                        }
                    }

                    errors[i] = sum * LoadDerivatives(function, layer.Neurons, i * Lanes, scratch);
                });
            }

            // return squared error of the last layer divided by 2
            return error / 2.0;
        }

        /// <summary>
        /// Recomputes the weight and threshold updates of every layer from the errors
        /// stored by <see cref="CalculateError"/>.
        /// </summary>
        /// <param name="input">Network input used for the first layer.</param>
        private void CalculateUpdates(double[] input)
        {
            // vectorize input
            Vector<double>[] inputVector = new Vector<double>[input.Length / Lanes];
            Parallel.For(0, inputVector.Length, LoopOptions, i =>
            {
                inputVector[i] = new Vector<double>(input, i * Lanes);
            });

            // cache for frequently used values
            Vector<double> cachedMomentum = new Vector<double>(learningRate * momentum);
            Vector<double> cached1mMomentum = new Vector<double>(learningRate * (1 - momentum));

            // 1 - calculate updates for the first layer
            AccumulateLayerUpdates(0, inputVector, cachedMomentum, cached1mMomentum);

            // 2 - for all other layers
            int layersCount = network.Layers.Length;
            for (int k = 1; k < layersCount; k++)
            {
                // the previous layer's outputs are the same for every neuron of this layer,
                // so pack them once per layer
                Vector<double>[] previousOutputs =
                    PackLayerOutputs(network.Layers[k - 1], network.Layers[k].InputsCount / Lanes);

                AccumulateLayerUpdates(k, previousOutputs, cachedMomentum, cached1mMomentum);
            }
        }

        /// <summary>
        /// Applies the momentum update rule to one layer:
        /// <c>update = cachedMomentum * update + cached1mMomentum * error * input</c>.
        /// </summary>
        /// <param name="layerIndex">Index of the layer to process.</param>
        /// <param name="layerInput">Vectorized values fed into the layer.</param>
        /// <param name="cachedMomentum">Broadcast of <c>learningRate * momentum</c>.</param>
        /// <param name="cached1mMomentum">Broadcast of <c>learningRate * (1 - momentum)</c>.</param>
        private void AccumulateLayerUpdates(int layerIndex, Vector<double>[] layerInput,
            Vector<double> cachedMomentum, Vector<double> cached1mMomentum)
        {
            Layer layer = network.Layers[layerIndex];
            Vector<double>[] errors = neuronErrors[layerIndex];
            Vector<double>[][] layerWeightsUpdates = weightsUpdates[layerIndex];
            Vector<double>[] layerThresholdUpdates = thresholdsUpdates[layerIndex];
            int inputBlocks = layer.InputsCount / Lanes;

            // for each neuron block of the layer
            Parallel.For(0, layer.Neurons.Length / Lanes, LoopOptions, i =>
            {
                Vector<double> cachedError = Vector.Multiply(cached1mMomentum, errors[i]);

                // for each neuron of the block
                for (int l = 0; l < Lanes; l++)
                {
                    Vector<double>[] neuronWeightUpdates = layerWeightsUpdates[i * Lanes + l];
                    double neuronError = cachedError[l];

                    // for each input block of the neuron
                    for (int j = 0; j < inputBlocks; j++)
                    {
                        // calculate weight update
                        neuronWeightUpdates[j] =
                            Vector.Multiply(cachedMomentum, neuronWeightUpdates[j]) +
                            Vector.Multiply(neuronError, layerInput[j]);
                    }
                }

                // calculate threshold update
                layerThresholdUpdates[i] =
                    Vector.Multiply(cachedMomentum, layerThresholdUpdates[i]) + cachedError;
            });
        }

        /// <summary>
        /// Adds the stored weight and threshold updates to the network's neurons.
        /// </summary>
        private void UpdateNetwork()
        {
            // for each layer of the network
            int layersCount = network.Layers.Length;
            for (int i = 0; i < layersCount; i++)
            {
                Layer layer = network.Layers[i];
                Vector<double>[][] layerWeightsUpdates = weightsUpdates[i];
                Vector<double>[] layerThresholdUpdates = thresholdsUpdates[i];

                // hoisted out of the loops below
                int weightBlocks = layer.Neurons[0].Weights.Length / Lanes;

                // for each neuron block of the layer
                Parallel.For(0, layer.Neurons.Length / Lanes, j =>
                {
                    Vector<double> thresholdDelta = layerThresholdUpdates[j];

                    for (int l = 0; l < Lanes; l++)
                    {
                        ActivationNeuron neuron = (ActivationNeuron)layer.Neurons[j * Lanes + l];
                        double[] weights = neuron.Weights;
                        Vector<double>[] neuronWeightUpdates = layerWeightsUpdates[j * Lanes + l];

                        // update weights one block at a time, writing back into the neuron's array
                        for (int k = 0; k < weightBlocks; k++)
                        {
                            Vector<double> updated =
                                new Vector<double>(weights, k * Lanes) + neuronWeightUpdates[k];
                            updated.CopyTo(weights, k * Lanes);
                        }

                        // update threshold
                        neuron.Threshold += thresholdDelta[l];
                    }
                });
            }
        }
    }
}
Download:SimdBackPropagationLearning