Accord.NET(AForge.NET)のBackPropagationLearningをSIMDを使用するように修正してみた。
速度的には、以前に作ったマルチスレッド対応版のBackPropagationLearningの1.2倍程度の速さになっています。
簡略版なので並列数は4に固定。したがってAVX非対応のCPUだと動きません。また、各層のニューロン数と入力数は4の倍数である必要があります。
並列数2だと、おそらく速度的に変わらないので作るつもりもありません。
DeepBeliefNetworkなど関連するクラスも合わせて修正すればもっと速くなるのだろうけど、とりあえずBackPropagationLearningのみの対応。流石に全部はボリュームが多すぎるし・・・orz
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Threading;
using System.Numerics;
using System.Diagnostics;
using AForge.Neuro;
using AForge.Neuro.Learning;
namespace LernDetectImage
{
/// <summary>
/// Back-propagation learning for an <see cref="ActivationNetwork"/> that processes neurons in
/// groups of four using <see cref="Vector{T}"/> of <c>double</c> and runs the per-group work in
/// parallel.
/// </summary>
/// <remarks>
/// <para>
/// The implementation is written for exactly four SIMD lanes. The constructor therefore rejects
/// hardware where <c>Vector&lt;double&gt;.Count</c> is not 4 and networks whose layer sizes are
/// not multiples of 4 (otherwise trailing neurons would silently never be trained).
/// </para>
/// <para>
/// <c>Run</c>, <c>RunEpoch</c>, <c>LearningRate</c> and <c>Momentum</c> are declared with
/// <c>new</c>, i.e. they hide the base-class members instead of overriding them. They are only
/// used when the object is accessed through a <see cref="SimdBackPropagationLearning"/> reference.
/// </para>
/// </remarks>
class SimdBackPropagationLearning : BackPropagationLearning
{
    // number of doubles packed into one vector; all index arithmetic below assumes this value
    private const int Lanes = 4;

    // shared, never mutated options for the bounded parallel loops
    private static readonly ParallelOptions LoopOptions = new ParallelOptions { MaxDegreeOfParallelism = 16 };

    // network to teach
    private readonly ActivationNetwork network;
    // learning rate
    private double learningRate = 0.1;
    // momentum
    private double momentum = 0.0;
    // neuron's errors: [layer][neuron group]
    private readonly Vector<double>[][] neuronErrors;
    // weight's updates: [layer][neuron][input group]
    private readonly Vector<double>[][][] weightsUpdates;
    // threshold's updates: [layer][neuron group]
    private readonly Vector<double>[][] thresholdsUpdates;

    /// <summary>
    /// Learning rate used by this class. Assigned values are clamped to the range [0, 1].
    /// </summary>
    public new double LearningRate
    {
        get { return learningRate; }
        set { learningRate = Math.Max(0.0, Math.Min(1.0, value)); }
    }

    /// <summary>
    /// Momentum used by this class. Assigned values are clamped to the range [0, 1].
    /// </summary>
    public new double Momentum
    {
        get { return momentum; }
        set { momentum = Math.Max(0.0, Math.Min(1.0, value)); }
    }

    /// <summary>
    /// Creates the teacher and allocates the per-layer error and update buffers.
    /// </summary>
    /// <param name="network">Network to teach.</param>
    /// <exception cref="NotSupportedException">
    /// <c>Vector&lt;double&gt;.Count</c> is not 4 on the current hardware/runtime.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// A layer's neuron count or input count is not a multiple of 4.
    /// </exception>
    public SimdBackPropagationLearning(ActivationNetwork network) : base(network)
    {
        // fail fast instead of hitting index/argument errors inside the parallel loops later
        if (Vector<double>.Count != Lanes)
        {
            throw new NotSupportedException(string.Format(
                "SimdBackPropagationLearning requires Vector<double>.Count == {0}, but it is {1} on this machine.",
                Lanes, Vector<double>.Count));
        }

        this.network = network;
        int layersCount = network.Layers.Length;

        // sizes that are not multiples of the lane count would leave trailing neurons untrained
        for (int i = 0; i < layersCount; i++)
        {
            Layer layer = network.Layers[i];
            if (layer.Neurons.Length % Lanes != 0 || layer.InputsCount % Lanes != 0)
            {
                throw new ArgumentException(string.Format(
                    "Layer {0} has {1} neurons and {2} inputs; both must be multiples of {3}.",
                    i, layer.Neurons.Length, layer.InputsCount, Lanes), "network");
            }
        }

        // create error and deltas arrays
        neuronErrors = new Vector<double>[layersCount][];
        weightsUpdates = new Vector<double>[layersCount][][];
        thresholdsUpdates = new Vector<double>[layersCount][];

        // initialize errors and deltas arrays for each layer
        for (int i = 0; i < layersCount; i++)
        {
            Layer layer = network.Layers[i];
            int neuronCount = layer.Neurons.Length;

            neuronErrors[i] = new Vector<double>[neuronCount / Lanes];
            thresholdsUpdates[i] = new Vector<double>[neuronCount / Lanes];
            weightsUpdates[i] = new Vector<double>[neuronCount][];

            // one row of packed weight updates per neuron
            for (int j = 0; j < neuronCount; j++)
            {
                weightsUpdates[i][j] = new Vector<double>[layer.InputsCount / Lanes];
            }
        }
    }

    /// <summary>
    /// Runs one learning iteration for a single sample.
    /// </summary>
    /// <param name="input">Input vector.</param>
    /// <param name="output">Desired output vector.</param>
    /// <returns>Half of the summed squared error of the output layer for this sample.</returns>
    public new double Run(double[] input, double[] output)
    {
        // compute the network's output
        network.Compute(input);
        // calculate network error
        double error = CalculateError(output);
        // calculate weights updates
        CalculateUpdates(input);
        // update the network
        UpdateNetwork();
        return error;
    }

    /// <summary>
    /// Runs <see cref="Run"/> once for every sample, in order.
    /// </summary>
    /// <param name="input">Array of input vectors.</param>
    /// <param name="output">Array of desired output vectors, parallel to <paramref name="input"/>.</param>
    /// <returns>Sum of the per-sample errors.</returns>
    public new double RunEpoch(double[][] input, double[][] output)
    {
        double error = 0.0;
        // run learning procedure for all samples
        for (int i = 0; i < input.Length; i++)
        {
            error += Run(input[i], output[i]);
        }
        // return summary error
        return error;
    }

    /// <summary>
    /// Fills <c>neuronErrors</c> for every layer, starting from the output layer and walking
    /// backwards, and returns the output-layer error.
    /// </summary>
    /// <param name="desiredOutput">Desired output vector.</param>
    /// <returns>Summed squared error of the last layer divided by 2.</returns>
    private double CalculateError(double[] desiredOutput)
    {
        Layer[] layers = network.Layers;
        int lastIndex = layers.Length - 1;

        // vectorize desired output
        Vector<double>[] desired = Vectorize(desiredOutput);

        // assume, that all neurons of the network have the same activation function
        IActivationFunction function = ((ActivationNeuron)layers[0].Neurons[0]).ActivationFunction;

        // calculate error values for the last layer first
        Layer outputLayer = layers[lastIndex];
        Vector<double>[] outputErrors = neuronErrors[lastIndex];
        int outputGroups = outputLayer.Neurons.Length / Lanes;
        Vector<double>[] squaredErrors = new Vector<double>[outputGroups];

        Parallel.For(0, outputGroups, LoopOptions, i =>
        {
            // outputs of the four neurons in this group
            Vector<double> output = GatherOutputs(outputLayer, i);
            // error of the neurons
            Vector<double> e = desired[i] - output;
            // error multiplied with activation function's derivative
            outputErrors[i] = e * DerivativeOf(function, output);
            // square the error; summed after the loop so no synchronization is needed here
            squaredErrors[i] = e * e;
        });

        // accumulate the squared errors (sequentially, in group order)
        Vector<double> squaredSum = Vector<double>.Zero;
        for (int i = 0; i < outputGroups; i++)
        {
            squaredSum += squaredErrors[i];
        }
        double error = squaredSum[0] + squaredSum[1] + squaredSum[2] + squaredSum[3];

        // calculate error values for other layers
        for (int j = lastIndex - 1; j >= 0; j--)
        {
            Layer layer = layers[j];
            Layer layerNext = layers[j + 1];
            Vector<double>[] errors = neuronErrors[j];
            Vector<double>[] errorsNext = neuronErrors[j + 1];
            int nextGroups = layerNext.Neurons.Length / Lanes;

            // for all neuron groups of the layer
            Parallel.For(0, layer.Neurons.Length / Lanes, LoopOptions, i =>
            {
                int weightOffset = i * Lanes;
                Vector<double> sum = Vector<double>.Zero;

                // for all neurons of the next layer
                for (int k = 0; k < nextGroups; k++)
                {
                    Vector<double> nextErrors = errorsNext[k];
                    for (int l = 0; l < Lanes; l++)
                    {
                        // weights that connect this group's four neurons to next-layer neuron k*4+l
                        Vector<double> weights = new Vector<double>(layerNext.Neurons[k * Lanes + l].Weights, weightOffset);
                        // scale by the error of that single next-layer neuron (lane l of the group),
                        // not element-wise by the errors of four different neurons
                        sum += Vector.Multiply(nextErrors[l], weights);
                    }
                }

                errors[i] = sum * DerivativeOf(function, GatherOutputs(layer, i));
            });
        }

        // return squared error of the last layer divided by 2
        return error / 2.0;
    }

    /// <summary>
    /// Computes the weight and threshold updates of every layer from the errors stored by
    /// <see cref="CalculateError"/>.
    /// </summary>
    /// <param name="input">Network input of the current sample.</param>
    private void CalculateUpdates(double[] input)
    {
        // cache for frequently used values, broadcast to all lanes
        Vector<double> cachedMomentum = new Vector<double>(learningRate * momentum);
        Vector<double> cached1mMomentum = new Vector<double>(learningRate * (1 - momentum));

        // 1 - the first layer is fed by the network input
        AccumulateLayerUpdates(0, Vectorize(input), cachedMomentum, cached1mMomentum);

        // 2 - every other layer is fed by the outputs of the previous layer
        Layer[] layers = network.Layers;
        for (int k = 1; k < layers.Length; k++)
        {
            Layer layerPrev = layers[k - 1];

            // pack the previous layer's outputs once instead of once per neuron group
            Vector<double>[] prevOutputs = new Vector<double>[layerPrev.Neurons.Length / Lanes];
            for (int g = 0; g < prevOutputs.Length; g++)
            {
                prevOutputs[g] = GatherOutputs(layerPrev, g);
            }

            AccumulateLayerUpdates(k, prevOutputs, cachedMomentum, cached1mMomentum);
        }
    }

    /// <summary>
    /// Updates <c>weightsUpdates</c> and <c>thresholdsUpdates</c> of one layer:
    /// <c>update = momentumFactor * update + errorFactor * error * input</c>.
    /// </summary>
    /// <param name="layerIndex">Index of the layer whose updates are computed.</param>
    /// <param name="layerInputs">Packed inputs of that layer (four values per vector).</param>
    /// <param name="momentumFactor">Broadcast value of <c>learningRate * momentum</c>.</param>
    /// <param name="errorFactor">Broadcast value of <c>learningRate * (1 - momentum)</c>.</param>
    private void AccumulateLayerUpdates(int layerIndex, Vector<double>[] layerInputs,
        Vector<double> momentumFactor, Vector<double> errorFactor)
    {
        Vector<double>[] errors = neuronErrors[layerIndex];
        Vector<double>[][] layerWeightsUpdates = weightsUpdates[layerIndex];
        Vector<double>[] layerThresholdUpdates = thresholdsUpdates[layerIndex];
        int groupCount = network.Layers[layerIndex].Neurons.Length / Lanes;

        // each group writes only its own rows, so the groups can run in parallel
        Parallel.For(0, groupCount, LoopOptions, i =>
        {
            Vector<double> cachedError = errorFactor * errors[i];

            for (int l = 0; l < Lanes; l++)
            {
                Vector<double>[] row = layerWeightsUpdates[i * Lanes + l];
                double neuronError = cachedError[l];

                // for each packed weight of the neuron
                for (int j = 0; j < row.Length; j++)
                {
                    row[j] = momentumFactor * row[j] + Vector.Multiply(neuronError, layerInputs[j]);
                }
            }

            // calculate threshold update
            layerThresholdUpdates[i] = momentumFactor * layerThresholdUpdates[i] + cachedError;
        });
    }

    /// <summary>
    /// Adds the stored weight and threshold updates to the neurons of every layer.
    /// </summary>
    private void UpdateNetwork()
    {
        Layer[] layers = network.Layers;

        // for each layer of the network
        for (int i = 0; i < layers.Length; i++)
        {
            Layer layer = layers[i];
            Vector<double>[][] layerWeightsUpdates = weightsUpdates[i];
            Vector<double>[] layerThresholdUpdates = thresholdsUpdates[i];

            // for each neuron group of the layer
            Parallel.For(0, layer.Neurons.Length / Lanes, j =>
            {
                Vector<double> thresholdUpdate = layerThresholdUpdates[j];

                for (int l = 0; l < Lanes; l++)
                {
                    ActivationNeuron neuron = (ActivationNeuron)layer.Neurons[j * Lanes + l];
                    double[] weights = neuron.Weights;
                    Vector<double>[] row = layerWeightsUpdates[j * Lanes + l];
                    int packedCount = weights.Length / Lanes;

                    // update four weights at a time, in place
                    for (int k = 0; k < packedCount; k++)
                    {
                        int offset = k * Lanes;
                        (new Vector<double>(weights, offset) + row[k]).CopyTo(weights, offset);
                    }

                    // update threshold
                    neuron.Threshold += thresholdUpdate[l];
                }
            });
        }
    }

    /// <summary>
    /// Packs an array into vectors of four consecutive values. Trailing values that do not fill
    /// a whole vector are not included.
    /// </summary>
    private static Vector<double>[] Vectorize(double[] values)
    {
        Vector<double>[] packed = new Vector<double>[values.Length / Lanes];
        for (int i = 0; i < packed.Length; i++)
        {
            packed[i] = new Vector<double>(values, i * Lanes);
        }
        return packed;
    }

    /// <summary>
    /// Reads the <c>Output</c> of the four neurons of the given group into one vector.
    /// </summary>
    private static Vector<double> GatherOutputs(Layer layer, int group)
    {
        int offset = group * Lanes;
        double[] lanes = new double[Lanes];
        for (int n = 0; n < Lanes; n++)
        {
            lanes[n] = layer.Neurons[offset + n].Output;
        }
        return new Vector<double>(lanes);
    }

    /// <summary>
    /// Applies <c>function.Derivative2</c> to each lane of <paramref name="outputs"/>.
    /// </summary>
    private static Vector<double> DerivativeOf(IActivationFunction function, Vector<double> outputs)
    {
        double[] lanes = new double[Lanes];
        for (int n = 0; n < Lanes; n++)
        {
            lanes[n] = function.Derivative2(outputs[n]);
        }
        return new Vector<double>(lanes);
    }
}
}
Download:SimdBackPropagationLearning








