//+------------------------------------------------------------------+
//|                                                opencl_program.cl |
//|                                   Copyright 2021, MetaQuotes Ltd. |
//|                                              https://www.mql5.com |
//+------------------------------------------------------------------+
//--- by default, some GPUs do not support the double type
//--- the cl_khr_fp64 directive is used to enable work with doubles
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
//+------------------------------------------------------------------+
//| Activation functions                                              |
//+------------------------------------------------------------------+
//| Linear activation function                                        |
//| Parameter 'value' Weighted sum of initial data                    |
//|           'a' defines the angle of inclination of the line        |
//|           'b' - the vertical offset of the line                   |
//+------------------------------------------------------------------+
__kernel void LineActivation(__global TYPE* inputs,
                             __global TYPE* outputs,
                             const TYPE a, const TYPE b)
  {
   size_t i = get_global_id(0);
   outputs[i] = (a * inputs[i] + b);
  }
//+------------------------------------------------------------------+
//| Sigmoid activation function                                       |
//| Parameter 'value' Weighted sum of initial data                    |
//|           'a' stretches the range of values of the function       |
//|               from '0' to 'a'                                     |
//|           'b' shifts the resulting value                          |
//+------------------------------------------------------------------+
__kernel void SigmoidActivation(__global TYPE* inputs,
                                __global TYPE* outputs,
                                const TYPE a, const TYPE b)
  {
   size_t i = get_global_id(0);
   outputs[i] = a / (1 + exp(-inputs[i])) - b;
  }
//+------------------------------------------------------------------+
//| Derivative of Sigmoid activation function                         |
//| Parameter 'value' current point (result of feed forward).         |
//|           'a' stretches the range of values of the function from  |
//|               '0' to 'a'                                          |
//|           'b' shifts the resulting value                          |
//+------------------------------------------------------------------+
__kernel void SigmoidDerivative(__global TYPE* outputs,
                                __global TYPE* output_gr,
                                __global TYPE* input_gr,
                                const TYPE a, const TYPE b
                               )
  {
   size_t i = get_global_id(0);
   if(a == 0)
      input_gr[i] = 0;
   else
     {
      TYPE z = clamp(outputs[i] + b, (TYPE)0, a);
      input_gr[i] = z * (1 - z / a) * output_gr[i];
     }
  }
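//--- Derivation sketch (follows directly from the kernels above): with
//--- y = a * sigma(x) - b returned by SigmoidActivation, the value
//--- z = y + b = a * sigma(x) restored here gives
//---    dy/dx = a * sigma(x) * (1 - sigma(x)) = z * (1 - z / a),
//--- which is the factor applied to the incoming gradient output_gr.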
//+------------------------------------------------------------------+
//| TANH activation function                                          |
//| Parameter 'value' Weighted sum of initial data                    |
//+------------------------------------------------------------------+
__kernel void TanhActivation(__global TYPE* inputs,
                             __global TYPE* outputs)
  {
   size_t i = get_global_id(0);
   outputs[i] = tanh(inputs[i]);
  }
//+------------------------------------------------------------------+
//| Derivative of TANH activation function                            |
//| Parameter 'value' current point (result of feed forward).         |
//+------------------------------------------------------------------+
__kernel void TanhDerivative(__global TYPE* outputs,
                             __global TYPE* output_gr,
                             __global TYPE* input_gr
                            )
  {
   size_t i = get_global_id(0);
   input_gr[i] = (1 - pow(outputs[i], 2)) * output_gr[i];
  }
//+------------------------------------------------------------------+
//| LReLU activation function                                         |
//| Parameter 'value' current point (result of feed forward).         |
//|           'a' leak parameter                                      |
//+------------------------------------------------------------------+
__kernel void LReLUActivation(__global TYPE* inputs,
                              __global TYPE* outputs,
                              const TYPE a)
  {
   size_t i = get_global_id(0);
   TYPE value = inputs[i];
   outputs[i] = (value > 0 ? value : a * value);
  }
//+------------------------------------------------------------------+
//| Derivative of LReLU activation function                           |
//| Parameter 'value' current point (result of feed forward).         |
//|           'a' leak parameter                                      |
//+------------------------------------------------------------------+
__kernel void LReLUDerivative(__global TYPE* outputs,
                              __global TYPE* output_gr,
                              __global TYPE* input_gr,
                              const TYPE a)
  {
   size_t i = get_global_id(0);
   input_gr[i] = (outputs[i] > 0 ? (TYPE)1 : a) * output_gr[i];
  }
//+------------------------------------------------------------------+
//| Swish activation function                                         |
//| Parameter 'value' Weighted sum of initial data                    |
//|           'b' affects the nonlinearity of the function            |
//+------------------------------------------------------------------+
__kernel void SwishActivation(__global TYPE* inputs,
                              __global TYPE* outputs,
                              const TYPE b)
  {
   size_t i = get_global_id(0);
   TYPE value = inputs[i];
   outputs[i] = value / (1 + exp(-b * value));
  }
//+------------------------------------------------------------------+
//| Derivative of Swish activation function                           |
//| Parameter 'value' current point (result of feed forward).         |
//|           'value_input' Weighted sum of initial data              |
//|           'b' affects the nonlinearity of the function            |
//+------------------------------------------------------------------+
__kernel void SwishDerivative(__global TYPE* outputs,
                              __global TYPE* output_gr,
                              __global TYPE* input_gr,
                              const TYPE b,
                              __global TYPE* inputs)
  {
   size_t i = get_global_id(0);
   TYPE by = b * outputs[i];
   input_gr[i] = (by + (1 - by) / (1 + exp(-b * inputs[i]))) * output_gr[i];
  }
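//--- Derivation sketch: for y = x * sigma(b * x) computed in SwishActivation,
//---    dy/dx = b * y + sigma(b * x) * (1 - b * y);
//--- the kernel evaluates exactly this expression with by = b * y and
//--- multiplies it by the incoming gradient output_gr.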
//+------------------------------------------------------------------+
//| Transfer 4 elements of TYPE vector to TYPE4                       |
//| Parameter 'array' source array of data                            |
//|           'start' first position to copy                          |
//|           'step' step between elements to copy                    |
//|           'size' Size of source array                             |
//|           'shift' Shift in source array to the 1-st copied element|
//+------------------------------------------------------------------+
TYPE4 ToVect4(__global TYPE *array, int start, int step, int size, int shift)
  {
   TYPE4 result = (TYPE4)(0, 0, 0, 0);
   step = max(1, step);
   int st = start * step + shift;
   if(st < size)
     {
      // number of elements remaining from position 'start' (keeps tail reads in bounds)
      int k = (size - shift + step - 1) / step - start;
      switch(k)
        {
         case 0:
            break;
         case 1:
            result = (TYPE4)(array[st], 0, 0, 0);
            break;
         case 2:
            result = (TYPE4)(array[st], array[st + step], 0, 0);
            break;
         case 3:
            result = (TYPE4)(array[st], array[st + step], array[st + 2 * step], 0);
            break;
         default:
            result = (TYPE4)(array[st], array[st + step], array[st + 2 * step], array[st + 3 * step]);
            break;
        }
     }
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
TYPE Max4(TYPE4 vect, TYPE value)
  {
   TYPE result = fmax(vect.s0, value);
   result = fmax(vect.s1, result);
   result = fmax(vect.s2, result);
   return(fmax(vect.s3, result));
  }
//+------------------------------------------------------------------+
//| Transfer TYPE4 to 4 elements of TYPE vector                       |
//| Parameter 'array' target array of data                            |
//|           'value' source TYPE4 vector                             |
//|           'start' first position to copy in target array          |
//|           'step' step between elements in target array            |
//|           'size' Size of target array                             |
//|           'shift' Shift in target array to the 1-st copied element|
//+------------------------------------------------------------------+
void D4ToArray(__global TYPE *array, TYPE4 value, int start, int step, int size, int shift)
  {
   step = max(1, step);
   int st = start * step + shift;
   if(st < size)
     {
      int k = (size - shift) % step;
      k = (size - shift - k) / step - start + (k > 0 ? 1 : 0);
      switch(k)
        {
         case 3:
            array[st + 2 * step] = value.s2;
         case 2:
            array[st + step] = value.s1;
         case 1:
            array[st] = value.s0;
            break;
         default:
            array[st + 3 * step] = value.s3;
            array[st + 2 * step] = value.s2;
            array[st + step] = value.s1;
            array[st] = value.s0;
            break;
        }
     }
   return;
  }
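//--- Worked example (illustrative values): with size = 10, shift = 0,
//--- step = 1 and start = 8 only two tail elements remain, so k = 2;
//--- ToVect4 returns (array[8], array[9], 0, 0) and D4ToArray writes only
//--- value.s1 and value.s0 (the case 2 fall-through), leaving out-of-range
//--- slots untouched. This is how both helpers keep partial reads and
//--- writes inside the buffer bounds.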
//+------------------------------------------------------------------+
//| Kernel Feed Forward of perceptron                                 |
//| Parameter 'inputs' vector of inputs data                          |
//|           'weights' matrix of weights                             |
//|           'outputs' output data                                   |
//|           'inputs_total' size of inputs vector                    |
//+------------------------------------------------------------------+
__kernel void PerceptronFeedForward(__global TYPE *inputs,
                                    __global TYPE *weights,
                                    __global TYPE *outputs,
                                    int inputs_total)
  {
   const int n = get_global_id(0);
   const int weights_total = get_global_size(0) * (inputs_total + 1);
   int shift = n * (inputs_total + 1);
   TYPE s = weights[shift + inputs_total];
   for(int i = 0; i < inputs_total; i += 4)
      s += dot(ToVect4(inputs, i, 1, inputs_total, 0), ToVect4(weights, i, 1, weights_total, shift));
   outputs[n] = s;
  }
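//--- Layout note (derived from the kernel above): row n of 'weights' holds
//--- inputs_total link weights followed by the bias, so
//---    outputs[n] = weights[n][inputs_total] + sum_i weights[n][i] * inputs[i],
//--- with the row starting at offset n * (inputs_total + 1). The loop
//--- accumulates the sum four elements at a time via ToVect4/dot.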
//+------------------------------------------------------------------+
//| Kernel for calculating output gradients                           |
//| Parameter 'target' vector of target data                          |
//|           'outputs' vector of previous FF outputs data            |
//|           'gradients' vector of gradients                         |
//|           'loss_function' type of loss function                   |
//+------------------------------------------------------------------+
__kernel void CalcOutputGradient(__global TYPE *target,
                                 __global TYPE *outputs,
                                 __global TYPE *gradients,
                                 int loss_function)
  {
   const int n = get_global_id(0);
   switch(loss_function)
     {
      case 0:
         gradients[n] = target[n] - outputs[n];
         break;
      case 1:
         gradients[n] = 2 * (target[n] - outputs[n]);
         break;
      case 2:
         gradients[n] = -target[n] / (outputs[n] + 1e-37f) * log(outputs[n] + 1e-37f);
         break;
      case 3:
         gradients[n] = (target[n] - outputs[n]) / (outputs[n] * (outputs[n] - 1) + 1e-37f);
         break;
      default:
         gradients[n] = target[n] - outputs[n];
         break;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradient(__global TYPE *gradient_inputs,
                                 __global TYPE *weights,
                                 __global TYPE *gradients,
                                 int outputs_total)
  {
   const int n = get_global_id(0);
   const int inputs_total = get_global_size(0);
   int weights_total = (inputs_total + 1) * outputs_total;
//---
   TYPE grad = 0;
   for(int o = 0; o < outputs_total; o += 4)
      grad += dot(ToVect4(gradients, o, 1, outputs_total, 0), ToVect4(weights, o, (inputs_total + 1), weights_total, n));
   gradient_inputs[n] = grad;
  }
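//--- Reading note: passing step = inputs_total + 1 and shift = n to ToVect4
//--- walks down column n of the weight matrix, so the kernel computes
//---    gradient_inputs[n] = sum_o gradients[o] * weights[o][n],
//--- i.e. the error gradient propagated through the transposed weights.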
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void CalcDeltaWeights(__global TYPE *inputs,
                               __global TYPE *delta_weights,
                               __global TYPE *gradients)
  {
   const int n = get_global_id(0);
   const int outputs_total = get_global_size(0);
   const int i = get_global_id(1);
   const int inputs_total = get_global_size(1);
//---
   TYPE grad = gradients[n];
   int shift = n * (inputs_total + 1);
   delta_weights[shift + i] = inputs[i] * grad + delta_weights[shift + i];
   if(i == 0)
      delta_weights[shift + inputs_total] += grad;
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void SGDUpdate(__global TYPE *delta_weights,
                        __global TYPE *weights,
                        int total,
                        int batch_size,
                        TYPE learningRate,
                        TYPE Lambda1,
                        TYPE Lambda2
                       )
  {
   int start = 4 * get_global_id(0);
   TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0);
   TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
   TYPE lr = learningRate / ((TYPE)batch_size);
   weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
   weights4 += (TYPE4)(lr) * delta4;
   D4ToArray(weights, weights4, start, 1, total, 0);
   D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
  }
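//--- Update rule implemented above (per weight w, accumulated delta d):
//---    w <- w - (Lambda1 + Lambda2 * w) + (learningRate / batch_size) * d,
//--- where Lambda1 and Lambda2 are the regularization parameters; the
//--- accumulated deltas are reset to zero once they have been applied.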
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void MomentumUpdate(__global TYPE *delta_weights,
                             __global TYPE *weights,
                             __global TYPE *momentum,
                             int total, int batch_size,
                             TYPE learningRate,
                             TYPE beta,
                             TYPE Lambda1, TYPE Lambda2)
  {
   int start = 4 * get_global_id(0);
//---
   TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
   TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
   TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
   weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
   momentum4 = (TYPE4)(learningRate) * delta4 + (TYPE4)(beta) * momentum4;
   weights4 += momentum4;
   D4ToArray(weights, weights4, start, 1, total, 0);
   D4ToArray(momentum, momentum4, start, 1, total, 0);
   D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void AdaGradUpdate(__global TYPE *delta_weights,
                            __global TYPE *weights,
                            __global TYPE *momentum,
                            int total, int batch_size,
                            TYPE learningRate,
                            TYPE Lambda1, TYPE Lambda2)
  {
   int start = 4 * get_global_id(0);
//---
   TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
   TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
   TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
//---
   weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
   momentum4 = momentum4 + pow(delta4, 2);
   // AdaGrad step: scale the averaged gradient by the accumulated squared gradients
   weights4 += learningRate * delta4 / sqrt(momentum4 + 1.0e-37f);
   D4ToArray(weights, weights4, start, 1, total, 0);
   D4ToArray(momentum, momentum4, start, 1, total, 0);
   D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void RMSPropUpdate(__global TYPE *delta_weights,
                            __global TYPE *weights,
                            __global TYPE *momentum,
                            int total, int batch_size,
                            TYPE learningRate,
                            TYPE beta,
                            TYPE Lambda1, TYPE Lambda2)
  {
   int start = 4 * get_global_id(0);
//---
   TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
   TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
   TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
//---
   weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
   momentum4 = beta * momentum4 + (1 - beta) * pow(delta4, 2);
   weights4 += delta4 * learningRate / (sqrt(momentum4) + 1.0e-37f);
   D4ToArray(weights, weights4, start, 1, total, 0);
   D4ToArray(momentum, momentum4, start, 1, total, 0);
   D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void AdaDeltaUpdate(__global TYPE *delta_weights,
                             __global TYPE *weights,
                             __global TYPE *momentumW,
                             __global TYPE *momentumG,
                             int total, int batch_size,
                             TYPE beta1, TYPE beta2,
                             TYPE Lambda1, TYPE Lambda2)
  {
   int start = 4 * get_global_id(0);
//---
   TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
   TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
   TYPE4 momentumW4 = ToVect4(momentumW, start, 1, total, 0);
   TYPE4 momentumG4 = ToVect4(momentumG, start, 1, total, 0);
//---
   weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
   momentumW4 = beta1 * momentumW4 + (1 - beta1) * pow(weights4, 2);
   momentumG4 = beta2 * momentumG4 + (1 - beta2) * pow(delta4, 2);
   weights4 += delta4 * sqrt(momentumW4) / (sqrt(momentumG4) + 1.0e-37f);
   D4ToArray(weights, weights4, start, 1, total, 0);
   D4ToArray(momentumW, momentumW4, start, 1, total, 0);
   D4ToArray(momentumG, momentumG4, start, 1, total, 0);
   D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void AdamUpdate(__global TYPE *delta_weights,
                         __global TYPE *weights,
                         __global TYPE *momentumM,
                         __global TYPE *momentumV,
                         int total, int batch_size,
                         TYPE learningRate,
                         TYPE beta1, TYPE beta2,
                         TYPE Lambda1, TYPE Lambda2)
  {
   int start = 4 * get_global_id(0);
//---
   TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
   TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
   TYPE4 momentumM4 = ToVect4(momentumM, start, 1, total, 0);
   TYPE4 momentumV4 = ToVect4(momentumV, start, 1, total, 0);
//---
   momentumM4 = beta1 * momentumM4 + (1 - beta1) * delta4;
   momentumV4 = beta2 * momentumV4 + (1 - beta2) * pow(delta4, 2);
   TYPE4 m = momentumM4 / (1 - beta1);
   TYPE4 v = momentumV4 / (1 - beta2);
   weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
   weights4 += learningRate * m / (sqrt(v) + 1.0e-37f);
   D4ToArray(weights, weights4, start, 1, total, 0);
   D4ToArray(momentumM, momentumM4, start, 1, total, 0);
   D4ToArray(momentumV, momentumV4, start, 1, total, 0);
   D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
  }
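//--- Update rule implemented above (per weight, d = averaged delta):
//---    m <- beta1 * m + (1 - beta1) * d
//---    v <- beta2 * v + (1 - beta2) * d^2
//---    w <- w - (Lambda1 + Lambda2 * w) + learningRate * m_hat / (sqrt(v_hat) + eps)
//--- where m_hat = m / (1 - beta1) and v_hat = v / (1 - beta2) are the
//--- bias-corrected moment estimates (a simplified correction without the
//--- power of the iteration number).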
//+------------------------------------------------------------------+
//| Feed-forward kernel of the convolutional layer                    |
//| Parameters: 'inputs' input data vector                            |
//|             'weights' weight matrix                               |
//|             'outputs' vector of results                           |
//|             'inputs_total' size of input data vector              |
//|             'window' size of input data analysis window           |
//|             'step' window moving step                             |
//|             'window_out' number of filters                        |
//+------------------------------------------------------------------+
__kernel void ConvolutionFeedForward(__global TYPE *inputs,
                                     __global TYPE *weights,
                                     __global TYPE *outputs,
                                     int inputs_total,
                                     int window,
                                     int step,
                                     int window_out,
                                     int transposed_out)
  {
   const int n = get_global_id(0);
   const int neurons = get_global_size(0);
   const int weights_total = (window + 1) * window_out;
   int shift = n * step;
   for(int w = 0; w < window_out; w++)
     {
      int out = (transposed_out == 1 ? w + n * window_out : w * neurons + n);
      int shift_weights = w * (window + 1);
      if((shift_weights + window) >= weights_total)
         break;
      TYPE s = weights[shift_weights + window];
      for(int i = 0; i < window; i += 4)
         s += dot(ToVect4(inputs, i, 1, inputs_total, shift),
                  ToVect4(weights, i, 1, shift_weights + window, shift_weights));
      outputs[out] = s;
     }
  }
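//--- Layout note (derived from the indexing above): each filter w owns
//--- 'window' weights plus a bias at offset w * (window + 1). With
//--- transposed_out == 1 the results are stored element-major
//--- (outputs[n * window_out + w]); otherwise they are stored filter-major
//--- (outputs[w * neurons + n]).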
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void ConvolutionCalcHiddenGradient(__global TYPE *gradient_inputs,
                                            __global TYPE *weights,
                                            __global TYPE *gradients,
                                            int window,
                                            int step,
                                            int window_out,
                                            int neurons,
                                            int transposed_out)
  {
   const int n = get_global_id(0);
   const int inputs_total = get_global_size(0);
   int weights_total = (window + 1) * window_out;
//---
   TYPE grad = 0;
   int w_start = n % step;
   int r_start = max((n - window + step) / step, 0);
   int total = (window - w_start + step - 1) / step;
   total = min((n + step) / step, total);
   for(int i = 0; i < total; i++)
     {
      int row = r_start + i;
      if(row >= neurons)
         break;
      for(int wo = 0; wo < window_out; wo++)
        {
         int shift_g = (transposed_out == 1 ? row * window_out + wo : row + wo * neurons);
         int shift_w = w_start + (total - i - 1) * step + wo * (window + 1);
         grad += gradients[shift_g] * weights[shift_w];
        }
     }
   gradient_inputs[n] = grad;
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void ConvolutionCalcDeltaWeights(__global TYPE *inputs,
                                          __global TYPE *delta_weights,
                                          __global TYPE *gradients,
                                          int inputs_total,
                                          int step,
                                          int neurons,
                                          int transposed_out)
  {
   const int inp_w = get_global_id(0);
   const int w = get_global_id(1);
   const int window = get_global_size(0) - 1;
   const int window_out = get_global_size(1);
//---
   int shift_delt = w * (window + 1) + inp_w;
   TYPE value = 0;
   if(inp_w == window)
     {
      for(int n = 0; n < neurons; n++)
         value += gradients[transposed_out == 1 ? w + n * window_out : w * neurons + n];
     }
   else
      for(int n = 0; n < neurons; n++)
        {
         int shift_inp = n * step + inp_w;
         if(shift_inp >= inputs_total)
            break;
         value += inputs[shift_inp] * gradients[transposed_out == 1 ? w + n * window_out : w * neurons + n];
        }
   delta_weights[shift_delt] += value;
  }
//+------------------------------------------------------------------+
//| Feed-forward kernel of the pooling layer                          |
//| Parameters: 'inputs' input data vector                            |
//|             'outputs' vector of results                           |
//|             'inputs_total' size of input data vector              |
//|             'input_neurons' vector size of 1st input data filter  |
//|             'window' size of input data analysis window           |
//|             'step' window moving step                             |
//|             'activation' type of activation function              |
//+------------------------------------------------------------------+
__kernel void ProofFeedForward(__global TYPE *inputs,
                               __global TYPE *outputs,
                               int inputs_total,
                               int input_neurons,
                               int window,
                               int step,
                               int activation)
  {
   const int n = get_global_id(0);
   const int w = get_global_id(1);
   const int neurons = get_global_size(0);
   const int window_out = get_global_size(1);
   int shift = n * step;
   int out = w * neurons + n;
   int shift_inp = w * input_neurons;
   TYPE s = 0;
   TYPE k = (TYPE)1 / (TYPE)window;
   TYPE4 k4 = (TYPE4)(k);
   for(int i = 0; i < window; i += 4)
      switch(activation)
        {
         case 0:
            s += dot(ToVect4(inputs, i, 1, min(shift_inp + input_neurons, inputs_total), shift_inp + shift),
                     k4);
            break;
         case 1:
            s = Max4(ToVect4(inputs, i, 1, min(shift_inp + input_neurons, inputs_total), shift_inp + shift), s);
            break;
         default:
            break;
        }
   outputs[out] = s;
  }
//+------------------------------------------------------------------+
//| Backpropagation kernel of the pooling layer                       |
//| Parameters: 'inputs' input data vector                            |
//|             'gradient_inputs' previous layer gradients vector     |
//|             'outputs' vector of results                           |
//|             'gradients' current layer gradients vector            |
//|             'inputs_total' size of input data vector              |
//|             'outputs_total' size of outputs vector                |
//|             'window' size of input data analysis window           |
//|             'step' window moving step                             |
//|             'neurons' vector size of 1st filter of outputs        |
//|             'activation' type of activation function              |
//+------------------------------------------------------------------+
__kernel void ProofCalcHiddenGradient(__global TYPE *inputs,
                                      __global TYPE *gradient_inputs,
                                      __global TYPE *outputs,
                                      __global TYPE *gradients,
                                      int inputs_total,
                                      int outputs_total,
                                      int window,
                                      int step,
                                      int neurons,
                                      int activation)
  {
   const int n = get_global_id(0);
   const int w = get_global_id(1);
   const int input_neurons = get_global_size(0);
   const int window_out = get_global_size(1);
//---
   int start = max((n - window + step) / step, 0);
   int stop = min((n + step - 1) / step + 1, neurons);
   TYPE grad = 0;
   int shift_inp = w * input_neurons + n;
   if(shift_inp >= inputs_total)
      return;
   TYPE inp = inputs[shift_inp];
   int shift_out = w * neurons;
   for(int o = start; o < stop; o++)
     {
      int shift_g = shift_out + o;
      if(shift_g >= outputs_total)
         break;
      switch(activation)
        {
         case 0:
            grad += gradients[shift_g] / (TYPE)window;
            break;
         case 1:
            grad += (outputs[shift_g] == inp ? gradients[shift_g] : 0);
            break;
         default:
            break;
        }
     }
   gradient_inputs[shift_inp] = grad;
  }
//+------------------------------------------------------------------+
//| LSTM block feed-forward kernel                                    |
//| Parameters: 'forgetgate' forget gate                              |
//|             'inputgate' input gate                                |
//|             'outputgate' output gate                              |
//|             'newcontent' new content                              |
//|             'memory' memory stream                                |
//|             'hiddenstate' hidden state stream                     |
//|             'outputs_total' number of elements in data stream     |
//+------------------------------------------------------------------+
__kernel void LSTMFeedForward(__global TYPE *forgetgate,
                              __global TYPE *inputgate,
                              __global TYPE *outputgate,
                              __global TYPE *newcontent,
                              __global TYPE *memory,
                              __global TYPE *hiddenstate,
                              int outputs_total)
  {
   const int n = get_global_id(0);
   const int shift = n * 4;
   TYPE4 fg = ToVect4(forgetgate, shift, 1, outputs_total, 0);
   TYPE4 ig = ToVect4(inputgate, shift, 1, outputs_total, 0);
   TYPE4 og = ToVect4(outputgate, shift, 1, outputs_total, 0);
   TYPE4 nc = ToVect4(newcontent, shift, 1, outputs_total, 0);
   TYPE4 mem = ToVect4(memory, shift, 1, outputs_total, 0);
//---
   TYPE4 temp = mem * fg;
   temp += ig * nc;
   D4ToArray(memory, temp, shift, 1, outputs_total, 0);
   temp = tanh(temp) * og;
   D4ToArray(hiddenstate, temp, shift, 1, outputs_total, 0);
  }
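//--- Cell equations implemented above (element-wise):
//---    memory      <- memory * forgetgate + inputgate * newcontent
//---    hiddenstate <- tanh(memory) * outputgate
//--- The gate and new-content vectors are assumed to arrive already
//--- activated by the inner layers before this kernel is called.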
//+------------------------------------------------------------------+
//| LSTM block backpropagation kernel                                 |
//| Parameters: 'outputs' vector of outputs                           |
//|             'gradients' current layer gradients vector            |
//|             'inputgate' input gate                                |
//|             'outputgate' output gate                              |
//|             'newcontent' new content                              |
//|             'memory' memory stream                                |
//|             'fg_gradients' forget gate gradient                   |
//|             'ig_gradients' input gate gradient                    |
//|             'og_gradients' output gate gradient                   |
//|             'nc_gradients' new content gradient                   |
//|             'outputs_total' size of outputs vector                |
//+------------------------------------------------------------------+
__kernel void LSTMCalcHiddenGradient(__global TYPE *outputs,
                                     __global TYPE *gradients,
                                     __global TYPE *inputgate,
                                     __global TYPE *outputgate,
                                     __global TYPE *newcontent,
                                     __global TYPE *memory,
                                     __global TYPE *fg_gradients,
                                     __global TYPE *ig_gradients,
                                     __global TYPE *og_gradients,
                                     __global TYPE *nc_gradients,
                                     int outputs_total)
  {
   const int n = get_global_id(0);
   int shift = n * 4;
//---
   TYPE4 out = ToVect4(outputs, shift, 1, outputs_total, 0);
   TYPE4 grad = ToVect4(gradients, shift, 1, outputs_total, 0);
   TYPE4 ig = ToVect4(inputgate, shift, 1, outputs_total, 0);
   TYPE4 og = ToVect4(outputgate, shift, 1, outputs_total, 0);
   TYPE4 nc = ToVect4(newcontent, shift, 1, outputs_total, 0);
   TYPE4 mem = ToVect4(memory, shift, 1, outputs_total, 0);
//---
   TYPE4 m = out / (og + 1.0e-37f);
//--- OutputGate gradient
   TYPE4 temp = grad * m;
   D4ToArray(og_gradients, temp, shift, 1, outputs_total, 0);
//--- Adjust the memory gradient by the derivative of TANH
   grad = grad * og * (1 - pow(m, 2));
//--- InputGate gradient
   temp = grad * nc;
   D4ToArray(ig_gradients, temp, shift, 1, outputs_total, 0);
//--- NewContent gradient
   temp = grad * ig;
   D4ToArray(nc_gradients, temp, shift, 1, outputs_total, 0);
//--- ForgetGates gradient
   temp = grad * mem;
   D4ToArray(fg_gradients, temp, shift, 1, outputs_total, 0);
  }
//+------------------------------------------------------------------+
//| Kernel calculating Self-Attention block dependency coefficients   |
//| Parameters: 'querys' tensor of queries                            |
//|             'keys' tensor of keys                                 |
//|             'values' tensor of values                             |
//|             'scores' matrix of dependency coefficients            |
//|             'inputs' tensor of input data                         |
//|             'outputs' tensor of output data                       |
//|             'window' size of input data analysis window           |
//|             'key_size' size of key vector for one element         |
//+------------------------------------------------------------------+
__kernel void AttentionFeedForward(__global TYPE *querys,
                                   __global TYPE *keys,
                                   __global TYPE *scores,
                                   __global TYPE *values,
                                   __global TYPE *outputs,
                                   int window,
                                   int key_size,
                                   int mask)
  {
   const int q = get_global_id(0);
   const int units = get_global_size(0);
   const int h = get_global_id(1);
   const int heads = get_global_size(1);
   int shift_query = key_size * (q * heads + h);
   int shift_scores = units * (q * heads + h);
   TYPE summ = 0;
   for(int s = 0; s < units; s++)
     {
      TYPE score = 0;
      if(mask > 0 && s > q)
        {
         scores[shift_scores + s] = score;
         continue;
        }
      int shift_key = key_size * (s * heads + h);
      for(int k = 0; k < key_size; k++)
         score += querys[shift_query + k] * keys[shift_key + k];
      score = exp(score / sqrt((TYPE)key_size));
      summ += score;
      scores[shift_scores + s] = score;
     }
   for(int s = 0; s < units; s++)
      scores[shift_scores + s] /= summ;
//---
   shift_query = window * (q * heads + h);
   for(int i = 0; i < window; i++)
     {
      TYPE query = 0;
      for(int v = 0; v < units; v++)
         query += values[window * (v * heads + h) + i] * scores[shift_scores + v];
      outputs[shift_query + i] = query;
     }
  }
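//--- The kernel above evaluates, per query q and head h, the standard
//--- scaled dot-product attention
//---    Score(q, :) = Softmax(Q_q * K^T / sqrt(key_size)),
//---    Out(q, :)   = Score(q, :) * V,
//--- with an optional causal mask (mask > 0) that zeroes the coefficients
//--- of elements located after the query position.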
//+------------------------------------------------------------------+
//| Kernel to propagate gradient inside the Self-Attention block      |
//| up to the level of dependency coefficient matrix Score            |
//| Parameters: 'values' tensor of values                             |
//|             'values_grad' tensor of gradients at level of values  |
//|             'scores' matrix of dependency coefficients            |
//|             'scores_grad' gradients of the dependency coefficient |
//|                           matrix                                  |
//|             'outputs' tensor of output data                       |
//|             'outputs_grad' tensor of gradients at output level    |
//|             'window' size of input data analysis window           |
//+------------------------------------------------------------------+
__kernel void AttentionCalcScoreGradient(__global TYPE *scores,
                                         __global TYPE *scores_grad,
                                         __global TYPE *values,
                                         __global TYPE *values_grad,
                                         __global TYPE *outputs_grad,
                                         __global TYPE *scores_temp,
                                         int window)
  {
   const int q = get_global_id(0);
   const int units = get_global_size(0);
   const int h = get_global_id(1);
   const int heads = get_global_size(1);
   int shift_value = window * (q * heads + h);
   int shift_score = units * (q * heads + h);
//--- Gradient propagation to Values
   for(int i = 0; i < window; i++)
     {
      TYPE grad = 0;
      for(int g = 0; g < units; g++)
         grad += scores[units * (g * heads + h) + q] * outputs_grad[window * (g * heads + h) + i];
      values_grad[shift_value + i] = grad;
     }
//--- Gradient propagation to Score
   for(int k = 0; k < units; k++)
     {
      TYPE grad = 0;
      for(int i = 0; i < window; i++)
         grad += outputs_grad[shift_value + i] * values[window * (k * heads + h) + i];
      scores_temp[shift_score + k] = grad;
     }
//--- Adjust by Softmax derivative
   for(int k = 0; k < units; k++)
     {
      TYPE grad = 0;
      TYPE score = scores[shift_score + k];
      for(int i = 0; i < units; i++)
         grad += scores[shift_score + i] * ((int)(i == k) - score) * scores_temp[shift_score + i];
      scores_grad[shift_score + k] = grad;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void AttentionCalcHiddenGradient(__global TYPE *querys,
                                          __global TYPE *querys_grad,
                                          __global TYPE *keys,
                                          __global TYPE *keys_grad,
                                          __global TYPE *scores_grad,
                                          int key_size)
  {
   const int q = get_global_id(0);
   const int units = get_global_size(0);
   const int h = get_global_id(1);
   const int heads = get_global_size(1);
   int shift_query = key_size * (q * heads + h);
   int shift_score = units * (q * heads + h);
//--- Gradient propagation to Querys and Keys
   const TYPE k = 1 / sqrt((TYPE)key_size);
//---
   for(int i = 0; i < key_size; i++)
     {
      TYPE grad_q = 0;
      TYPE grad_k = 0;
      for(int s = 0; s < units; s++)
        {
         grad_q += keys[key_size * (s * heads + h) + i] * scores_grad[shift_score + s];
         grad_k += querys[key_size * (s * heads + h) + i] * scores_grad[units * (s * heads + h) + q];
        }
      querys_grad[shift_query + i] = grad_q * k;
      keys_grad[shift_query + i] = grad_k * k;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void GPTFeedForward(__global TYPE *querys,
                             __global TYPE *keys,
                             __global TYPE *scores,
                             __global TYPE *values,
                             __global TYPE *outputs,
                             int key_size,
                             int units,
                             int current)
  {
   const int h = get_global_id(0);
   const int heads = get_global_size(0);
   int shift_query = key_size * h;
   int shift_scores = units * h;
   TYPE summ = 0;
   for(int s = 0; s < units; s++)
     {
      TYPE score = 0;
      int shift_key = key_size * (s * heads + h);
      for(int k = 0; k < key_size; k++)
        {
         if(s == current)
            keys[shift_key + k] = querys[shift_query + k + heads * key_size];
         score += querys[shift_query + k] * keys[shift_key + k];
        }
      score = exp(score / sqrt((TYPE)key_size));
      summ += score;
      scores[shift_scores + s] = score;
     }
   for(int s = 0; s < units; s++)
      scores[shift_scores + s] /= summ;
//---
   shift_query = key_size * h;
   for(int i = 0; i < key_size; i++)
     {
      TYPE query = 0;
      for(int v = 0; v < units; v++)
        {
         if(v == current)
            values[key_size * (v * heads + h) + i] = querys[(2 * heads + h) * key_size + i];
         query += values[key_size * (v * heads + h) + i] * scores[shift_scores + v];
        }
      outputs[shift_query + i] = query;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void GPTCalcScoreGradient(__global TYPE *scores,
                                   __global TYPE *scores_grad,
                                   __global TYPE *values,
                                   __global TYPE *values_grad,
                                   __global TYPE *outputs_grad,
                                   __global TYPE *scores_temp,
                                   int window,
                                   int units,
                                   int current)
  {
   const int h = get_global_id(0);
   const int heads = get_global_size(0);
   int shift_value = window * (2 * heads + h);
   int shift_score = units * h;
//--- Gradient propagation to Values
   for(int i = 0; i < window; i++)
      values_grad[shift_value + i] = scores[units * h + current] * outputs_grad[window * h + i];
//--- Gradient propagation to Score
   for(int k = 0; k < units; k++)
     {
      TYPE grad = 0;
      for(int i = 0; i < window; i++)
         grad += outputs_grad[shift_value + i] * values[window * (k * heads + h) + i];
      scores_temp[shift_score + k] = grad;
     }
//--- Adjust by Softmax derivative
   for(int k = 0; k < units; k++)
     {
      TYPE grad = 0;
      TYPE score = scores[shift_score + k];
      for(int i = 0; i < units; i++)
         grad += scores[shift_score + i] * ((int)(i == k) - score) * scores_temp[shift_score + i];
      scores_grad[shift_score + k] = grad;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void GPTCalcHiddenGradient(__global TYPE *querys,
                                    __global TYPE *querys_grad,
                                    __global TYPE *keys,
                                    __global TYPE *scores_grad,
                                    int key_size,
                                    int units,
                                    int current)
  {
   const int h = get_global_id(0);
   const int heads = get_global_size(0);
   int shift_query = key_size * h;
   int shift_key = key_size * (heads + h);
   int shift_score = units * h;
//--- Gradient propagation to Querys and Keys
   const TYPE k = 1 / sqrt((TYPE)key_size);
//---
   for(int i = 0; i < key_size; i++)
     {
      TYPE grad_q = 0;
      TYPE grad_k = 0;
      for(int s = 0; s < units; s++)
        {
         grad_q += keys[key_size * (s * heads + h) + i] * scores_grad[shift_score + s];
         if(s == current)
            grad_k += querys[key_size * h + i] * scores_grad[units * h + current];
        }
      querys_grad[shift_query + i] = grad_q * k;
      querys_grad[shift_key + i] = grad_k * k;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void BatchNormFeedForward(__global TYPE *inputs,
                                   __global TYPE *options,
                                   __global TYPE *weights,
                                   __global TYPE *output,
                                   int batch,
                                   int total)
  {
   int n = get_global_id(0);
   if(batch <= 1)
     {
      D4ToArray(output, ToVect4(inputs, n * 4, 1, total, 0), n * 4, 1, total, 0);
      return;
     }
   int shift = n * 4;
   int shift_options = n * 3 * 4;
   int shift_weights = n * 2 * 4;
//---
   TYPE4 inp = ToVect4(inputs, shift, 1, total, 0);
   TYPE4 mean = ToVect4(options, shift, 3, total * 3, 0) * ((TYPE)batch - 1) + inp;
   if(options[shift_options] > 0 && options[shift_options + 1] > 0)
      mean /= (TYPE4)batch;
   TYPE4 delt = inp - mean;
   TYPE4 variance = ToVect4(options, shift, 3, total * 3, 1) * ((TYPE)batch - 1) + pow(delt, 2);
   if(options[shift_options + 1] > 0)
      variance /= (TYPE4)batch;
   TYPE4 nx = delt / sqrt(variance + 1e-37f);
//---
   if(weights[shift_weights] == 0)
      D4ToArray(weights, (TYPE4)1, shift, 2, total * 2, 0);
//---
   TYPE4 res = ToVect4(weights, shift, 2, total * 2, 0) * nx + ToVect4(weights, shift, 2, total * 2, 1);
//---
   D4ToArray(options, mean, shift, 3, total * 3, 0);
   D4ToArray(options, variance, shift, 3, total * 3, 1);
   D4ToArray(options, nx, shift, 3, total * 3, 2);
   D4ToArray(output, res, shift, 1, total, 0);
  }
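//--- Normalization implemented above (per element, exponential-window style;
//--- the divisions by batch are applied once the stored statistics become
//--- positive):
//---    mean     <- (mean * (batch - 1) + x) / batch
//---    variance <- (variance * (batch - 1) + (x - mean)^2) / batch
//---    nx       <- (x - mean) / sqrt(variance + eps)
//---    output   <- scale * nx + offset
//--- 'options' stores mean, variance and nx per element (stride 3), while
//--- 'weights' stores the trainable scale and offset pair (stride 2),
//--- commonly denoted gamma and beta.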
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void BatchNormCalcHiddenGradient(__global TYPE *options,
                                          __global TYPE *gradient,
                                          __global TYPE *inputs,
                                          __global TYPE *gradient_inputs,
                                          __global TYPE *weights,
                                          int batch,
                                          int total
                                         )
  {
   int n = get_global_id(0);
   int shift = n * 4;
   if(batch <= 1)
     {
      D4ToArray(gradient_inputs, ToVect4(gradient, shift, 1, total, 0), shift, 1, total, 0);
      return;
     }
//---
   TYPE4 inp = ToVect4(inputs, shift, 1, total, 0);
   TYPE4 gnx = ToVect4(gradient, shift, 1, total, 0) * ToVect4(weights, shift, 2, total * 2, 0);
   TYPE4 temp = 1 / sqrt(ToVect4(options, shift, 3, total * 3, 1) + 1e-37f);
   TYPE4 delt = inp - ToVect4(options, shift, 3, total * 3, 0);
   TYPE4 gvar = delt / (-2 * pow(ToVect4(options, shift, 3, total * 3, 1) + 1.0e-37f, 3.0f / 2.0f)) * gnx;
   TYPE4 gmu = (-temp) * gnx - gvar * 2 * delt / (TYPE4)batch;
   TYPE4 gx = temp * gnx + gmu / (TYPE4)batch + gvar * 2 * delt / (TYPE4)batch;
//---
   D4ToArray(gradient_inputs, gx, shift, 1, total, 0);
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void BatchNormCalcDeltaWeights(__global TYPE *options,
                                        __global TYPE *delta_weights,
                                        __global TYPE *gradients)
  {
   const int n = get_global_id(0);
   int shift_options = n * 3;
   int shift_weights = n * 2;
//---
   TYPE grad = gradients[n];
   delta_weights[shift_weights] += grad * options[shift_options + 2];
   delta_weights[shift_weights + 1] += grad;
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void MaskMult(__global TYPE *inputs,
                       __global TYPE *mask,
                       __global TYPE *outputs,
                       int outputs_total)
  {
   const int n = get_global_id(0) * 4;
//---
   TYPE4 out = ToVect4(inputs, n, 1, outputs_total, 0) * ToVect4(mask, n, 1, outputs_total, 0);
   D4ToArray(outputs, out, n, 1, outputs_total, 0);
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void Sum(__global TYPE *inputs1,
                  __global TYPE *inputs2,
                  __global TYPE *outputs)
  {
   const int n = get_global_id(0);
//---
   outputs[n] = inputs1[n] + inputs2[n];
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void LayerNormalize(__global TYPE* inputs,
                             __global TYPE* outputs,
                             __global TYPE* stds,
                             const int total,
                             const int std_shift)
  {
   uint i = (uint)get_global_id(0);
   uint l = (uint)get_local_id(0);
   uint ls = min((uint)get_local_size(0), (uint)LOCAL_SIZE);
   __local TYPE temp[LOCAL_SIZE];
//---
   uint count = 0;
   do
     {
      uint shift = count * ls + l;
      temp[l] = (count > 0 ? temp[l] : 0) + (shift < total ? inputs[shift] : 0);
      count++;
     }
   while((count * ls + l) < total);
   temp[l] /= (TYPE)total;
   barrier(CLK_LOCAL_MEM_FENCE);
   count = ls;
   do
     {
      count = (count + 1) / 2;
      temp[l] += (l < count ? temp[l + count] : 0);
      barrier(CLK_LOCAL_MEM_FENCE);
     }
   while(count > 1);
//---
   TYPE mean = (TYPE)temp[0];
//---
   count = 0;
   do
     {
      uint shift = count * ls + l;
      temp[l] = (count > 0 ? temp[l] : 0) + (shift < total ? (TYPE)pow(inputs[shift] - mean, 2) : 0);
      count++;
     }
   while((count * ls + l) < total);
   temp[l] /= (TYPE)total;
   barrier(CLK_LOCAL_MEM_FENCE);
   count = ls;
   do
     {
      count = (count + 1) / 2;
      temp[l] += (l < count ? temp[l + count] : 0);
      barrier(CLK_LOCAL_MEM_FENCE);
     }
   while(count > 1);
//---
   TYPE std = (TYPE)sqrt(temp[0]);
   if(l == 0)
      stds[std_shift] = std;
   count = 0;
   while((count * ls + l) < total)
     {
      uint shift = count * ls + l;
      outputs[shift] = (inputs[shift] - mean) / (std + 1e-37f);
      count++;
     }
  }
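//--- The kernel above performs layer normalization over the whole buffer:
//---    output[i] = (input[i] - mean) / (std + eps),
//--- where mean and std are computed across all 'total' elements through a
//--- two-pass reduction in local memory; the standard deviation is also
//--- stored in 'stds' at std_shift for reuse during backpropagation.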
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void LayerNormalizeGradient(__global TYPE* outputs,
                                     __global TYPE* out_gradient,
                                     __global TYPE* inp_gradient,
                                     __global TYPE* stds,
                                     const int total,
                                     const int std_shift)
  {
   uint i = (uint)get_global_id(0);
   uint l = (uint)get_local_id(0);
//---
   uint ls = min((uint)get_local_size(0), (uint)LOCAL_SIZE);
   __local TYPE dSTD[LOCAL_SIZE];
   __local TYPE dMean1[LOCAL_SIZE];
   __local TYPE dMean2[LOCAL_SIZE];
   uint count = 0;
   do
     {
      uint shift = count * ls + l;
      dSTD[l] = (count > 0 ? dSTD[l] : 0) - (shift < total ? out_gradient[shift] * outputs[shift] / (2 * (pow(stds[std_shift], (TYPE)2) + 1e-37f)) : 0);
      dMean1[l] = (count > 0 ? dMean1[l] : 0) - (shift < total ? out_gradient[shift] / (stds[std_shift] + 1e-37f) : 0);
      dMean2[l] = (count > 0 ? dMean2[l] : 0) - (shift < total ? 2 * outputs[shift] * stds[std_shift] / (TYPE)total : 0);
      count++;
     }
   while((count * ls + l) < total);
   barrier(CLK_LOCAL_MEM_FENCE);
   count = ls;
   do
     {
      count = (count + 1) / 2;
      dSTD[l] += (l < count ? dSTD[l + count] : 0);
      dMean1[l] += (l < count ? dMean1[l + count] : 0);
      dMean2[l] += (l < count ? dMean2[l + count] : 0);
      barrier(CLK_LOCAL_MEM_FENCE);
     }
   while(count > 1);
//---
   TYPE dstd = dSTD[0];
   TYPE dmean = dMean1[0] + dstd * dMean2[0];
//---
   count = 0;
   while((count * ls + l) < total)
     {
      uint shift = count * ls + l;
      inp_gradient[shift] = out_gradient[shift] / (stds[std_shift] + 1e-32f) + (2 * dstd * outputs[shift] * stds[std_shift] + dmean) / total;
      count++;
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void SoftMaxActivation(__global TYPE* inputs,
                                __global TYPE* outputs,
                                const ulong total)
  {
   uint i = (uint)get_global_id(0);
   uint l = (uint)get_local_id(0);
   uint h = (uint)get_global_id(1);
   uint ls = min((uint)get_local_size(0), (uint)LOCAL_SIZE);
//---
   __local TYPE temp[LOCAL_SIZE];
   uint count = 0;
   for(count = l; (count < total && l < ls); count += ls)
     {
      uint shift = h * total + count;
      temp[l] = (count > l ? temp[l] : 0) + exp(inputs[shift]);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
   count = ls;
   do
     {
      count = (count + 1) / 2;
      temp[l] += (l < count && (l + count) < ls ? temp[l + count] : 0);
      barrier(CLK_LOCAL_MEM_FENCE);
     }
   while(count > 1);
//---
   TYPE sum = temp[0];
   for(count = l; count < total; count += ls)
     {
      uint shift = h * total + count;
      outputs[shift] = exp(inputs[shift]) / (sum + 1e-37f);
     }
  }
//+------------------------------------------------------------------+
//| Derivative of SoftMax activation function                         |
//| Parameter 'outputs' vector of previous FF outputs data            |
//|           'gradients' vector of gradients                         |
//|           'outputs_total' size of outputs vector                  |
//+------------------------------------------------------------------+
__kernel void SoftMaxDerivative(__global TYPE* outputs,
                                __global TYPE* output_gr,
                                __global TYPE* input_gr)
  {
   size_t i = get_global_id(0);
   size_t outputs_total = get_global_size(0);
   size_t shift = get_global_id(1) * outputs_total;
   TYPE output = outputs[shift + i];
   TYPE result = 0;
   for(int j = 0; j < outputs_total; j++)
      result += output * (i == j ? 1 - output : -outputs[shift + j]) * output_gr[shift + j];
   input_gr[shift + i] = result;
  }
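//--- The loop above applies the Softmax Jacobian
//---    d y_i / d x_j = y_i * (delta_ij - y_j),
//--- so input_gr[i] = sum_j y_i * (delta_ij - y_j) * output_gr[j]; the same
//--- adjustment is used in the attention kernels when propagating the
//--- gradient through the Score matrix.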
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void Split(__global TYPE* source,
                    __global TYPE* target1,
                    __global TYPE* target2,
                    const int total_source,
                    const int total_target1,
                    const int total_target2
                   )
  {
   int n = get_global_id(0);
   int total = get_global_size(0);
   for(int i = n; i < total_source; i += total)
     {
      if(i < total_target1)
         target1[i] = source[i];
      else
        {
         int t2 = i - total_target1;
         if(t2 < total_target2)
            target2[t2] = source[i];
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                   |
//+------------------------------------------------------------------+
__kernel void Concatenate(__global TYPE* source1,
                          __global TYPE* source2,
                          __global TYPE* target,
                          const int total_source1,
                          const int total_source2,
                          const int total_target
                         )
  {
   int n = get_global_id(0);
   int total = get_global_size(0);
   for(int i = n; i < total_target; i += total)
     {
      if(i < total_source1)
         target[i] = source1[i];
      else
        {
         int t2 = i - total_source1;
         if(t2 < total_source2)
            target[i] = source2[t2];
        }
     }
  }
//+------------------------------------------------------------------+