//+------------------------------------------------------------------+
//| opencl_program.cl |
//| Copyright 2021, MetaQuotes Ltd. |
//| https://www.mql5.com |
//+------------------------------------------------------------------+
//--- by default, some GPUs do not support the double type
//--- the cl_khr_fp64 extension enables work with double-precision values
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
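//--- NOTE: TYPE and TYPE4 are placeholders substituted by the host
//--- application when the program is compiled; a minimal sketch of the
//--- assumed build options:
//---    #define TYPE   double
//---    #define TYPE4  double4
//--- (float/float4 can be used on devices without cl_khr_fp64)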
//+------------------------------------------------------------------+
//| Activation functions |
//+------------------------------------------------------------------+
//| Linear activation function |
//| Parameter 'value' Weighted sum of initial data |
//| 'a' defines the angle of inclination of the line |
//| 'b' - the vertical offset of the line |
//+------------------------------------------------------------------+
TYPE LineActivation(TYPE value, TYPE a, TYPE b)
{
return (a * value + b);
}
//+------------------------------------------------------------------+
//| Sigmoid activation function |
//| Parameter 'value' Weighted sum of initial data |
//| 'a' stretches the range of values of the function |
//| from '0' to 'a' |
//| 'b' shifts the resulting value |
//+------------------------------------------------------------------+
TYPE SigmoidActivation(TYPE value, TYPE a, TYPE b)
{
return (a / (1 + exp(-value)) - b);
}
//---
TYPE4 SigmoidActivation4(TYPE4 value, TYPE a, TYPE b)
{
return (a / (1 + exp(-value)) - b);
}
//+------------------------------------------------------------------+
//| Derivative of Sigmoid activation function |
//| Parameter 'value' current point (result of feed forward). |
//| 'a' stretches the range of values of the function from |
//| '0' to 'a' |
//| 'b' shifts the resulting value |
//+------------------------------------------------------------------+
TYPE4 SigmoidDerivative(TYPE4 value, TYPE a, TYPE b)
{
if(a == 0)
return (TYPE4)(0);
//---
TYPE4 z = max(min((TYPE4)a, value + b), (TYPE4)(0));
return (z * (1 - z / a));
}
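//--- For y = a / (1 + exp(-x)) - b the derivative can be written through
//--- the output itself: with z = y + b clamped to [0, a],
//--- dy/dx = z * (1 - z / a), which is exactly what the function computes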
//+------------------------------------------------------------------+
//| TANH activation function |
//| Parameter 'value' Weighted sum of initial data |
//+------------------------------------------------------------------+
TYPE TanhActivation(TYPE value)
{
return tanh(value);
}
//+------------------------------------------------------------------+
//| Derivative of TANH activation function |
//| Parameter 'value' current point (result of feed forward). |
//+------------------------------------------------------------------+
TYPE4 TanhDerivative(TYPE4 value)
{
return (1 - pow(value, 2));
}
//+------------------------------------------------------------------+
//| LReLU activation function |
//| Parameter 'value' current point (result of feed forward). |
//| 'a' leak parameter |
//+------------------------------------------------------------------+
TYPE LReLUActivation(TYPE value, TYPE a)
{
return (value > 0 ? value : a * value);
}
//+------------------------------------------------------------------+
//| Derivative of LReLU activation function |
//| Parameter 'value' current point (result of feed forward). |
//| 'a' leak parameter |
//+------------------------------------------------------------------+
TYPE4 LReLUDerivative(TYPE4 value, TYPE a)
{
TYPE4 result = (TYPE4)((value.s0 > 0 ? 1 : a), (value.s1 > 0 ? 1 : a), (value.s2 > 0 ? 1 : a), (value.s3 > 0 ? 1 : a));
return (result);
}
//+------------------------------------------------------------------+
//| Swish activation function |
//| Parameter 'value' Weighted sum of initial data |
//| 'b' affects the nonlinearity of the function |
//+------------------------------------------------------------------+
TYPE SwishActivation(TYPE value, TYPE b)
{
return value * SigmoidActivation(b * value, 1, 0);
}
//+------------------------------------------------------------------+
//| Derivative of Swish activation function |
//| Parameter 'value' current point (result of feed forward). |
//| 'input_value' Weighted sum of initial data |
//| 'b' affects the nonlinearity of the function |
//+------------------------------------------------------------------+
TYPE4 SwishDerivative(TYPE4 value, TYPE4 input_value, TYPE b)
{
//---
TYPE4 by = b * value;
return (by + SigmoidActivation4(b * input_value, 1, 0) * (1 - by));
}
//+------------------------------------------------------------------+
//| Transfer 4 elements of TYPE vector to TYPE4 |
//| Parameter 'array' source array of data |
//| 'start' first position to copy |
//| 'step' step between elements to copy |
//| 'size' Size of source array |
//| 'shift' Shift in source array to the 1-st copied element |
//+------------------------------------------------------------------+
TYPE4 ToVect4(__global TYPE *array, int start, int step, int size, int shift)
{
TYPE4 result = (TYPE4)(0, 0, 0, 0);
step = max(1, step);
int st = start * step + shift;
if(st < size)
{
int k = (size - shift) % step;
k = (size - shift - k) / step - start + (k > 0 ? 1 : 0);
switch(k)
{
case 0:
break;
case 1:
result = (TYPE4)(array[st], 0, 0, 0);
break;
case 2:
result = (TYPE4)(array[st], array[st + step], 0, 0);
break;
case 3:
result = (TYPE4)(array[st], array[st + step], array[st + 2 * step], 0);
break;
default:
result = (TYPE4)(array[st], array[st + step], array[st + 2 * step], array[st + 3 * step]);
break;
}
}
return result;
}
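//--- Usage example: ToVect4(array, 2, 3, 20, 1) reads the elements with
//--- indices 7, 10, 13, 16 (st = start * step + shift); positions that
//--- fall outside the array are filled with zeros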
//+------------------------------------------------------------------+
//| Derivative of SoftMax activation function |
//| Parameter 'outputs' vector of previous FF outputs data |
//| 'gradients' vector of gradients |
//| 'shift' shift to first position in vector |
//| 'outputs_total' size of outputs vector |
//+------------------------------------------------------------------+
TYPE4 SoftMaxDerivative(__global TYPE *outputs, __global TYPE *gradient, int shift, int outputs_total)
{
TYPE4 result = (TYPE4)0;
for(int j = 0; j < outputs_total; j++)
result += (TYPE4)(outputs[j] * gradient[j]) * ((TYPE4)(shift == j, (shift + 1) == j, (shift + 2) == j, (shift + 3) == j) - ToVect4(outputs, shift, 1, outputs_total, 0));
return result;
}
//+------------------------------------------------------------------+
//| Maximum of the components of a TYPE4 vector and a scalar value |
//+------------------------------------------------------------------+
TYPE Max4(TYPE4 vect, TYPE value)
{
TYPE result = fmax(vect.s0, value);
result = fmax(vect.s1, result);
result = fmax(vect.s2, result);
return(fmax(vect.s3, result));
}
//+------------------------------------------------------------------+
//| Transfer TYPE4 to 4 elements of TYPE vector |
//| Parameter 'array' target array of data |
//| 'value' source TYPE4 vector |
//| 'start' first position to copy in target array |
//| 'step' step between elements in target array |
//| 'size' Size of target array |
//| 'shift' Shift in target array to the 1-st copied element |
//+------------------------------------------------------------------+
void D4ToArray(__global TYPE *array, TYPE4 value, int start, int step, int size, int shift)
{
step = max(1, step);
int st = start * step + shift;
if(st < size)
{
int k = (size - shift) % step;
k = (size - shift - k) / step - start + (k > 0 ? 1 : 0);
switch(k)
{
case 3:
array[st + 2 * step] = value.s2;
case 2:
array[st + step] = value.s1;
case 1:
array[st] = value.s0;
break;
default:
array[st + 3 * step] = value.s3;
array[st + 2 * step] = value.s2;
array[st + step] = value.s1;
array[st] = value.s0;
break;
}
}
return;
}
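//--- NOTE: the switch above relies on intentional fall-through: case 3
//--- writes value.s2 and continues through cases 2 and 1, so only the
//--- elements that still fit into the target array are stored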
//+------------------------------------------------------------------+
//| Kernel Feed Forward of perceptron |
//| Parameter 'inputs' vector of inputs data |
//| 'weights' matrix of weights |
//| 'sums' vector of weighted sums of inputs by weights |
//| 'outputs' output data after activation function |
//| 'inputs_total' size of inputs vector |
//| 'activation' type of activation function |
//| 'act_param_a' 1st parameter of activation function |
//| 'act_param_b' 2nd parameter of activation function |
//+------------------------------------------------------------------+
__kernel void PerceptronFeedForward(__global TYPE *inputs,
__global TYPE *weights,
__global TYPE *sums,
__global TYPE *outputs,
int inputs_total, int activation,
TYPE act_param_a,
TYPE act_param_b)
{
const int n = get_global_id(0);
const int weights_total = get_global_size(0) * (inputs_total + 1);
int shift = n * (inputs_total + 1);
TYPE s = weights[shift + inputs_total];
for(int i = 0; i < inputs_total; i += 4)
s += dot(ToVect4(inputs, i, 1, inputs_total, 0), ToVect4(weights, i, 1, weights_total, shift));
switch(activation)
{
case 0:
outputs[n] = LineActivation(s, act_param_a, act_param_b);
break;
case 1:
outputs[n] = SigmoidActivation(s, act_param_a, act_param_b);
break;
case 2:
outputs[n] = TanhActivation(s);
break;
case 3:
outputs[n] = LReLUActivation(s, act_param_a);
break;
case 4:
outputs[n] = exp(s);
break;
case 5:
sums[n] = s;
outputs[n] = SwishActivation(s, act_param_a);
break;
default:
outputs[n] = s;
break;
}
}
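//--- Activation codes used throughout this program:
//--- 0 - linear, 1 - sigmoid, 2 - tanh, 3 - LReLU,
//--- 4 - exp (SoftMax numerator, presumably normalized afterwards with
//--- the Normalize kernel), 5 - Swish (the pre-activation sum is kept
//--- in 'sums' for the backward pass)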
//+------------------------------------------------------------------+
//| Kernel dividing a vector by a constant value |
//| Parameter 'inputs' vector of inputs data |
//| 'outputs' vector of output data |
//| 'inputs_total' size of inputs vector |
//| 'const_value' constant value |
//+------------------------------------------------------------------+
__kernel void Normalize(__global TYPE *inputs,
__global TYPE *outputs,
int inputs_total,
TYPE const_value)
{
const int n = get_global_id(0);
int shift = n * 4;
TYPE4 result = ToVect4(inputs, shift, 1, inputs_total, 0) / (TYPE4)(const_value);
D4ToArray(outputs, result, shift, 1, inputs_total, 0);
}
//+------------------------------------------------------------------+
//| Kernel calculating gradients at the output layer |
//| Parameter 'target' vector of target data |
//| 'outputs' vector of previous FF outputs data |
//| 'gradients' vector of gradients |
//| 'outputs_total' size of outputs vector |
//+------------------------------------------------------------------+
__kernel void CalcOutputGradient(__global TYPE *target,
__global TYPE *outputs,
__global TYPE *gradients,
int outputs_total)
{
const int n = get_global_id(0);
TYPE4 result = ToVect4(target, n * 4, 1, outputs_total, 0) - ToVect4(outputs, n * 4, 1, outputs_total, 0);
D4ToArray(gradients, result, n * 4, 1, outputs_total, 0);
}
//+------------------------------------------------------------------+
//| Kernel adjusting the gradient by the activation function derivative|
//+------------------------------------------------------------------+
__kernel void DeActivateGradient(__global TYPE *sums,
__global TYPE *outputs,
__global TYPE *gradients,
int outputs_total, int activation,
TYPE act_param_a,
TYPE act_param_b)
{
const int n = get_global_id(0);
int shift = n * 4;
TYPE4 grad = ToVect4(gradients, shift, 1, outputs_total, 0);
TYPE4 out = ToVect4(outputs, shift, 1, outputs_total, 0);
TYPE4 s = ToVect4(sums, shift, 1, outputs_total, 0);
switch(activation)
{
case 0:
grad = grad * act_param_a;
break;
case 1:
grad = grad * SigmoidDerivative(out, act_param_a, act_param_b);
break;
case 2:
grad = grad * TanhDerivative(out);
break;
case 3:
grad = grad * LReLUDerivative(out, act_param_a);
break;
case 4:
grad = SoftMaxDerivative(outputs, sums, shift, outputs_total);
break;
case 5:
grad = grad * SwishDerivative(out, s, act_param_a);
break;
default:
break;
}
D4ToArray(gradients, grad, shift, 1, outputs_total, 0);
}
//+------------------------------------------------------------------+
//| Kernel propagating the gradient to the previous (hidden) layer |
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradient(__global TYPE *gradient_inputs,
__global TYPE *weights,
__global TYPE *gradients,
int outputs_total)
{
const int n = get_global_id(0);
const int inputs_total = get_global_size(0);
int weights_total = (inputs_total + 1) * outputs_total;
//---
TYPE grad = 0;
for(int o = 0; o < outputs_total; o += 4)
grad += dot(ToVect4(gradients, o, 1, outputs_total, 0), ToVect4(weights, o, (inputs_total + 1), weights_total, n));
gradient_inputs[n] = grad;
}
//+------------------------------------------------------------------+
//| Kernel accumulating error gradients at the weights level |
//+------------------------------------------------------------------+
__kernel void CalcDeltaWeights(__global TYPE *inputs,
__global TYPE *delta_weights,
__global TYPE *gradients)
{
const int n = get_global_id(0);
const int outputs_total = get_global_size(0);
const int i = get_global_id(1);
const int inputs_total = get_global_size(1);
//---
TYPE grad = gradients[n];
int shift = n * (inputs_total + 1);
delta_weights[shift + i] = inputs[i] * grad + delta_weights[shift + i];
if(i == 0)
delta_weights[shift + inputs_total] += grad;
}
//+------------------------------------------------------------------+
//| Kernel of weights update by stochastic gradient descent (SGD) |
//+------------------------------------------------------------------+
__kernel void SGDUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
int total,
int batch_size,
TYPE learningRate,
TYPE Lambda1,
TYPE Lambda2
)
{
int start = 4 * get_global_id(0);
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0);
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE lr = learningRate / ((TYPE)batch_size);
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
weights4 += (TYPE4)(lr) * delta4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
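//--- As implemented, the SGD step is
//--- w = w - (Lambda1 + Lambda2 * w) + (learningRate / batch_size) * delta;
//--- Lambda1/Lambda2 act as L1/L2 regularization (a strict L1 penalty
//--- would subtract Lambda1 * sign(w))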
//+------------------------------------------------------------------+
//| Kernel of weights update by the momentum method |
//+------------------------------------------------------------------+
__kernel void MomentumUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentum,
int total, int batch_size,
TYPE learningRate,
TYPE beta,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
momentum4 = (TYPE4)(learningRate) * delta4 + (TYPE4)(beta) * momentum4;
weights4 += momentum4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentum, momentum4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of weights update by the AdaGrad method |
//+------------------------------------------------------------------+
__kernel void AdaGradUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentum,
int total, int batch_size,
TYPE learningRate,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
//---
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
TYPE4 G = momentum4 + pow(delta4, 2);
weights4 += learningRate / sqrt(G + 1.0e-10) * delta4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentum, G, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
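//--- AdaGrad accumulates squared gradients without decay:
//--- G(t) = G(t-1) + delta^2, w += learningRate * delta / sqrt(G + eps)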
//+------------------------------------------------------------------+
//| Kernel of weights update by the RMSProp method |
//+------------------------------------------------------------------+
__kernel void RMSPropUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentum,
int total, int batch_size,
TYPE learningRate,
TYPE beta,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
//---
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
TYPE4 G = beta * momentum4 + (1 - beta) * pow(delta4, 2);
weights4 += learningRate / (sqrt(G) + 1.0e-10) * delta4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentum, G, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
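//--- RMSProp replaces the AdaGrad sum with an exponential moving average:
//--- G(t) = beta * G(t-1) + (1 - beta) * delta^2,
//--- w += learningRate * delta / (sqrt(G) + eps)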
//+------------------------------------------------------------------+
//| Kernel of weights update by the AdaDelta method |
//+------------------------------------------------------------------+
__kernel void AdaDeltaUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentumW,
__global TYPE *momentumG,
int total, int batch_size,
TYPE beta1, TYPE beta2,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentumW4 = ToVect4(momentumW, start, 1, total, 0);
TYPE4 momentumG4 = ToVect4(momentumG, start, 1, total, 0);
//---
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
TYPE4 W = beta1 * momentumW4 + (1 - beta1) * pow(weights4, 2);
TYPE4 G = beta2 * momentumG4 + (1 - beta2) * pow(delta4, 2);
weights4 += sqrt(W) / (sqrt(G) + 1.0e-10) * delta4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentumW, W, start, 1, total, 0);
D4ToArray(momentumG, G, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
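//--- AdaDelta drops the explicit learning rate and scales the step by
//--- sqrt(W) / (sqrt(G) + eps); here W is an EMA of squared weights,
//--- while the original paper tracks an EMA of squared parameter updates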
//+------------------------------------------------------------------+
//| Kernel of weights update by the Adam method |
//+------------------------------------------------------------------+
__kernel void AdamUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentumM,
__global TYPE *momentumV,
int total, int batch_size,
TYPE learningRate,
TYPE beta1, TYPE beta2,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentumM4 = ToVect4(momentumM, start, 1, total, 0);
TYPE4 momentumV4 = ToVect4(momentumV, start, 1, total, 0);
//---
TYPE4 M = beta1 * momentumM4 + (1 - beta1) * delta4;
TYPE4 V = beta2 * momentumV4 + (1 - beta2) * pow(delta4, 2);
TYPE4 m = M / (1 - beta1);
TYPE4 v = V / (1 - beta2);
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
weights4 += learningRate * m / (sqrt(v) + 1.0e-10);
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentumM, M, start, 1, total, 0);
D4ToArray(momentumV, V, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
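//--- Adam keeps EMAs of the gradient (M) and of its square (V):
//--- M(t) = beta1 * M + (1 - beta1) * delta,
//--- V(t) = beta2 * V + (1 - beta2) * delta^2,
//--- w += learningRate * m / (sqrt(v) + eps);
//--- NOTE: the bias correction uses constant factors (1 - beta), while
//--- the original paper uses the step-dependent (1 - beta^t)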
//+------------------------------------------------------------------+
//| Kernel Feed Forward of convolutional layer |
//| Parameter 'inputs' vector of inputs data |
//| 'weights' matrix of weights |
//| 'sums' vector of weighted sums before activation |
//| 'outputs' vector of output data |
//| 'inputs_total' size of inputs vector |
//| 'window' size of the input data window |
//| 'step' step of the window |
//| 'window_out' number of output filters |
//| 'activation' type of activation function |
//| 'act_param_a' 1st parameter of activation function |
//| 'act_param_b' 2nd parameter of activation function |
//| 'transposed_out' flag of transposed output layout |
//+------------------------------------------------------------------+
__kernel void ConvolutionFeedForward(__global TYPE *inputs,
__global TYPE *weights,
__global TYPE *sums,
__global TYPE *outputs,
int inputs_total,
int window,
int step,
int window_out,
int activation,
TYPE act_param_a,
TYPE act_param_b,
int transposed_out)
{
const int n = get_global_id(0);
const int neurons = get_global_size(0);
const int weights_total = (window + 1) * window_out;
int shift = n * step;
for(int w = 0; w < window_out; w++)
{
int out = (transposed_out == 1 ? w + n * window_out : w * neurons + n);
int shift_weights = w * (window + 1) ;
if((shift_weights + window) >= weights_total)
break;
TYPE s = weights[shift_weights + window];
for(int i = 0; i < window; i += 4)
s += dot(ToVect4(inputs, i, 1, inputs_total, shift),
ToVect4(weights, i, 1, weights_total, shift_weights));
switch(activation)
{
case 0:
outputs[out] = LineActivation(s, act_param_a, act_param_b);
break;
case 1:
outputs[out] = SigmoidActivation(s, act_param_a, act_param_b);
break;
case 2:
outputs[out] = TanhActivation(s);
break;
case 3:
outputs[out] = LReLUActivation(s, act_param_a);
break;
case 4:
outputs[out] = exp(s);
break;
case 5:
sums[out] = s;
outputs[out] = SwishActivation(s, act_param_a);
break;
default:
outputs[out] = s;
break;
}
}
}
//+------------------------------------------------------------------+
//| Kernel propagating the gradient through the convolutional layer |
//+------------------------------------------------------------------+
__kernel void ConvolutionCalcHiddenGradient(__global TYPE *gradient_inputs,
__global TYPE *weights,
__global TYPE *gradients,
int outputs_total,
int window,
int step,
int window_out,
int neurons,
int transposed_out)
{
const int n = get_global_id(0);
const int inputs_total = get_global_size(0);
int weights_total = (window + 1) * window_out;
//---
TYPE grad = 0;
int start = n - window + step;
start = max((start - start % step) / step, 0);
int stop = min((n - n % step) / step + 1, neurons);
for(int w = 0; w < window_out; w++)
{
int shift_w = w * (window + 1);
if((shift_w + window) > weights_total)
break;
for(int o = start; o < stop; o ++)
{
int shift_wl = (stop - o - 1) * step + n % step + shift_w;
int shift_gl = (transposed_out == 1 ? w + o * window_out : w * neurons + o);
grad += gradients[shift_gl] * weights[shift_wl];
}
}
gradient_inputs[n] = grad;
}
//+------------------------------------------------------------------+
//| Kernel accumulating weight deltas of the convolutional layer |
//+------------------------------------------------------------------+
__kernel void ConcolutionCalcDeltaWeights(__global TYPE *inputs,
__global TYPE *delta_weights,
__global TYPE *gradients,
int inputs_total,
int step,
int neurons,
int transposed_out)
{
const int inp_w = get_global_id(0);
const int w = get_global_id(1);
const int window = get_global_size(0) - 1;
const int window_out = get_global_size(1);
// int weights_total = (window + 1) * window_out;
//---
int shift_delt = w * (window + 1) + inp_w;
TYPE value = 0;
if(inp_w == window)
{
for(int n = 0; n < neurons; n ++)
value += gradients[transposed_out == 1 ? w + n*window_out : w * neurons + n];
}
else
for(int n = 0; n < neurons; n ++)
{
int shift_inp = n * step + inp_w;
if(shift_inp >= inputs_total)
break;
value += inputs[shift_inp] * gradients[transposed_out == 1 ? w + n*window_out : w * neurons + n];
}
delta_weights[shift_delt] += value;
}
//+------------------------------------------------------------------+
//| Kernel Feed Forward of pooling (subsampling) layer |
//| Parameter 'inputs' vector of inputs data |
//| 'outputs' vector of output data |
//| 'inputs_total' size of inputs vector |
//| 'input_neurons' number of neurons in one filter of inputs |
//| 'window' size of the input data window |
//| 'step' step of the window |
//| 'window_out' number of filters |
//| 'activation' type of pooling operation |
//+------------------------------------------------------------------+
__kernel void ProofFeedForward(__global TYPE *inputs,
__global TYPE *outputs,
int inputs_total,
int input_neurons,
int window,
int step,
int window_out,
int activation)
{
const int n = get_global_id(0);
const int neurons = get_global_size(0);
int shift = n * step;
for(int w = 0; w < window_out; w++)
{
int out = w * neurons + n;
int shift_inp = w * input_neurons;
TYPE s = 0;
TYPE k = 1.0 / (TYPE)window;
TYPE4 k4 = (TYPE4)(k);
for(int i = 0; i < window; i += 4)
switch(activation)
{
case 6:
s += dot(ToVect4(inputs, i, 1, min(shift_inp + input_neurons, inputs_total), shift_inp + shift),
k4);
break;
case 7:
s = Max4(ToVect4(inputs, i, 1, min(shift_inp + input_neurons, inputs_total), shift_inp + shift), s);
break;
default:
break;
}
outputs[out] = s;
}
}
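//--- In the pooling kernel the 'activation' code selects the operation:
//--- 6 - average pooling (dot product with 1/window),
//--- 7 - max pooling (Max4 over the window)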
//+------------------------------------------------------------------+
//| Kernel of gradient backpropagation through pooling layer |
//| Parameter 'inputs' vector of inputs data |
//| 'gradient_inputs' vector of gradients at the inputs level |
//| 'outputs' vector of output data |
//| 'gradients' vector of gradients at the outputs level |
//| 'inputs_total' size of inputs vector |
//| 'outputs_total' size of outputs vector |
//| 'window' size of the input data window |
//| 'step' step of the window |
//| 'window_out' number of filters |
//| 'neurons' number of neurons in one output filter |
//| 'activation' type of pooling operation |
//+------------------------------------------------------------------+
__kernel void ProofCalcHiddenGradient(__global TYPE *inputs,
__global TYPE *gradient_inputs,
__global TYPE *outputs,
__global TYPE *gradients,
int inputs_total,
int outputs_total,
int window,
int step,
int window_out,
int neurons,
int activation)
{
const int n = get_global_id(0);
const int input_neurons = get_global_size(0);
//---
int start = n - window + step;
start = max((start - start % step) / step, 0);
int stop = min((n - n % step) / step + 1, neurons);
for(int w = 0; w < window_out; w++)
{
TYPE grad = 0;
int shift_inp = w * input_neurons + n;
if(shift_inp >= inputs_total)
break;
TYPE inp = inputs[shift_inp];
for(int o = start; o < stop; o ++)
{
int shift_g = w * neurons + o;
if(shift_g >= outputs_total)
break;
switch(activation)
{
case 6:
grad += gradients[shift_g] / (TYPE)window;
break;
case 7:
grad += (outputs[shift_g] == inp ? gradients[shift_g] : 0);
break;
default:
break;
}
}
gradient_inputs[shift_inp] = grad;
}
}
//+------------------------------------------------------------------+
//| Kernel Feed Forward of LSTM block |
//| Parameter 'forgetgate' forget gate vector |
//| 'inputgate' input gate vector |
//| 'outputgate' output gate vector |
//| 'newcontent' new content vector |
//| 'memory' memory (cell state) vector |
//| 'hiddenstate' hidden state vector |
//| 'outputs_total' number of elements in memory and hidden state |
//+------------------------------------------------------------------+
__kernel void LSTMFeedForward(__global TYPE *forgetgate,
__global TYPE *inputgate,
__global TYPE *outputgate,
__global TYPE *newcontent,
__global TYPE *memory,
__global TYPE *hiddenstate,
int outputs_total)
{
const int n = get_global_id(0);
const int shift = n * 4;
TYPE4 fg = ToVect4(forgetgate, shift, 1, outputs_total, 0);
TYPE4 ig = ToVect4(inputgate, shift, 1, outputs_total, 0);
TYPE4 og = ToVect4(outputgate, shift, 1, outputs_total, 0);
TYPE4 nc = ToVect4(newcontent, shift, 1, outputs_total, 0);
TYPE4 mem = ToVect4(memory, shift, 1, outputs_total, 0);
//---
TYPE4 temp = mem * fg;
temp += ig * nc;
D4ToArray(memory, temp, shift, 1, outputs_total, 0);
temp = tanh(temp) * og;
D4ToArray(hiddenstate, temp, shift, 1, outputs_total, 0);
}
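//--- The kernel implements the standard LSTM state update:
//--- C(t) = fg * C(t-1) + ig * nc, h(t) = og * tanh(C(t))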
//+------------------------------------------------------------------+
//| Kernel of gradient backpropagation through LSTM block |
//| Parameter 'outputs' vector of output data |
//| 'gradients' vector of gradients at the outputs level |
//| 'inputgate' input gate vector |
//| 'outputgate' output gate vector |
//| 'newcontent' new content vector |
//| 'memory' memory (cell state) vector |
//| 'fg_gradients' gradients of forget gate |
//| 'ig_gradients' gradients of input gate |
//| 'og_gradients' gradients of output gate |
//| 'nc_gradients' gradients of new content |
//| 'outputs_total' size of outputs vector |
//+------------------------------------------------------------------+
__kernel void LSTMCalcHiddenGradient(__global TYPE *outputs,
__global TYPE *gradients,
__global TYPE *inputgate,
__global TYPE *outputgate,
__global TYPE *newcontent,
__global TYPE *memory,
__global TYPE *fg_gradients,
__global TYPE *ig_gradients,
__global TYPE *og_gradients,
__global TYPE *nc_gradients,
int outputs_total)
{
const int n = get_global_id(0);
int shift = n * 4;
//---
TYPE4 out = ToVect4(outputs, shift, 1, outputs_total, 0);
TYPE4 grad = ToVect4(gradients, shift, 1, outputs_total, 0);
TYPE4 ig = ToVect4(inputgate, shift, 1, outputs_total, 0);
TYPE4 og = ToVect4(outputgate, shift, 1, outputs_total, 0);
TYPE4 nc = ToVect4(newcontent, shift, 1, outputs_total, 0);
TYPE4 mem = ToVect4(memory, shift, 1, outputs_total, 0);
//---
TYPE4 m = out / (og + 1.0e-8);
//--- OutputGate gradient
TYPE4 temp = grad * m;
D4ToArray(og_gradients, temp, shift, 1, outputs_total, 0);
//--- Adjust the gradient for the TANH derivative
grad = grad * og * (1 - pow(m, 2));
//--- InputGate gradient
temp = grad * nc;
D4ToArray(ig_gradients, temp, shift, 1, outputs_total, 0);
//--- NewContent gradient
temp = grad * ig;
D4ToArray(nc_gradients, temp, shift, 1, outputs_total, 0);
//--- ForgetGate gradient
temp = grad * mem;
D4ToArray(fg_gradients, temp, shift, 1, outputs_total, 0);
}
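//--- NOTE: tanh(C(t)) is recovered as out / (og + 1e-8) instead of being
//--- saved during the feed forward pass; the small constant protects
//--- against division by zero when the output gate is fully closed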
//+------------------------------------------------------------------+
//| Kernel Feed Forward of Self-Attention block |
//| Parameter 'querys' Query tensor |
//| 'keys' Key tensor |
//| 'scores' matrix of dependency coefficients (Score) |
//| 'values' Value tensor |
//| 'outputs' vector of output data |
//| 'window' size of the description vector of one element |
//| 'key_size' size of the key vector of one element |
//| 'mask' flag of masking out elements above the diagonal |
//+------------------------------------------------------------------+
__kernel void AttentionFeedForward(__global TYPE *querys,
__global TYPE *keys,
__global TYPE *scores,
__global TYPE *values,
__global TYPE *outputs,
int window,
int key_size,
int mask)
{
const int q = get_global_id(0);
const int units = get_global_size(0);
const int h = get_global_id(1);
const int heads = get_global_size(1);
int shift_query = key_size * (q * heads + h);
int shift_scores = units * (q * heads + h);
TYPE summ = 0;
for(int s = 0; s < units; s++)
{
TYPE score = 0;
if(mask > 0 && s > q)
{
scores[shift_scores + s] = score;
continue;
}
int shift_key = key_size * (s * heads + h);
for(int k = 0; k < key_size; k ++)
score += querys[shift_query + k] * keys[shift_key + k];
score = exp(score / sqrt((TYPE)key_size));
summ += score;
scores[shift_scores + s] = score;
}
for(int s = 0; s < units; s++)
scores[shift_scores + s] /= summ;
//---
shift_query = window * (q * heads + h);
for(int i = 0; i < window; i++)
{
TYPE query = 0;
for(int v = 0; v < units; v++)
query += values[window * (v * heads + h) + i] * scores[shift_scores + v];
outputs[shift_query + i] = query;
}
}
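//--- The kernel computes one row of scaled dot-product attention:
//--- Score = SoftMax(Q * K^T / sqrt(key_size)), Out = Score * V;
//--- with mask > 0 the elements above the diagonal (s > q) are zeroed,
//--- which corresponds to causal (decoder-style) attention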
//+------------------------------------------------------------------+
//| Kernel of gradient backpropagation through Self-Attention block |
//| down to the level of the Score matrix |
//| Parameter 'scores' matrix of dependency coefficients (Score) |
//| 'scores_grad' matrix of gradients at the Score level |
//| 'values' Value tensor |
//| 'values_grad' tensor of gradients at the Values level |
//| 'outputs_grad' vector of gradients at the outputs level |
//| 'scores_temp' buffer of intermediate gradients |
//| 'window' size of the description vector of one element |
//+------------------------------------------------------------------+
__kernel void AttentionCalcScoreGradient(__global TYPE *scores,
__global TYPE *scores_grad,
__global TYPE *values,
__global TYPE *values_grad,
__global TYPE *outputs_grad,
__global TYPE *scores_temp,
int window)
{
const int q = get_global_id(0);
const int units = get_global_size(0);
const int h = get_global_id(1);
const int heads = get_global_size(1);
int shift_value = window * (q * heads + h);
int shift_score = units * (q * heads + h);
//--- Distribute the gradient to Values
for(int i = 0; i < window; i ++)
{
TYPE grad = 0;
for(int g = 0; g < units; g++)
grad += scores[units * (g * heads + h) + q] * outputs_grad[window * (g * heads + h) + i];
values_grad[shift_value + i] = grad;
}
//--- Distribute the gradient to Score
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
for(int i = 0; i < window; i++)
grad += outputs_grad[shift_value + i] * values[window * (k * heads + h) + i];
scores_temp[shift_score + k] = grad;
}
//--- Adjust for the Softmax derivative
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
TYPE score = scores[shift_score + k];
for(int i = 0; i < units; i++)
grad += scores[shift_score + i] * ((int)(i == k) - score) * scores_temp[shift_score + i];
scores_grad[shift_score + k] = grad;
}
}
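//--- The last loop applies the SoftMax Jacobian: for outputs sigma and
//--- pre-activations z, dL/dz(k) = sum_i sigma(i) * (delta(i,k) - sigma(k))
//--- * dL/dsigma(i), where delta(i,k) is the Kronecker delta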
//+------------------------------------------------------------------+
//| Kernel propagating the gradient to the Querys and Keys tensors |
//+------------------------------------------------------------------+
__kernel void AttentionCalcHiddenGradient(__global TYPE *querys,
__global TYPE *querys_grad,
__global TYPE *keys,
__global TYPE *keys_grad,
__global TYPE *scores_grad,
int key_size)
{
const int q = get_global_id(0);
const int units = get_global_size(0);
const int h = get_global_id(1);
const int heads = get_global_size(1);
int shift_query = key_size * (q * heads + h);
int shift_score = units * (q * heads + h);
//--- Distribute the gradients to Querys and Keys
const TYPE k = 1 / sqrt((TYPE)key_size);
//---
for(int i = 0; i < key_size; i++)
{
TYPE grad_q = 0;
TYPE grad_k = 0;
for(int s = 0; s < units; s++)
{
grad_q += keys[key_size * (s * heads + h) + i] * scores_grad[shift_score + s];
grad_k += querys[key_size * (s * heads + h) + i] * scores_grad[units * (s * heads + h) + q];
}
querys_grad[shift_query + i] = grad_q * k;
keys_grad[shift_query + i] = grad_k * k;
}
}
//+------------------------------------------------------------------+
//| Kernel Feed Forward of the GPT block |
//+------------------------------------------------------------------+
__kernel void GPTFeedForward(__global TYPE *querys,
__global TYPE *keys,
__global TYPE *scores,
__global TYPE *values,
__global TYPE *outputs,
int key_size,
int units)
{
const int h = get_global_id(0);
const int heads = get_global_size(0);
int shift_query = key_size * h;
int shift_scores = units * h;
TYPE summ = 0;
for(int s = 0; s < units; s++)
{
TYPE score = 0;
int shift_key = key_size * (s * heads + h);
for(int k = 0; k < key_size; k ++)
score += querys[shift_query + k] * keys[shift_key + k];
score = exp(score / sqrt((TYPE)key_size));
summ += score;
scores[shift_scores + s] = score;
}
for(int s = 0; s < units; s++)
scores[shift_scores + s] /= summ;
//---
shift_query = key_size * h;
for(int i = 0; i < key_size; i++)
{
TYPE query = 0;
for(int v = 0; v < units; v++)
query += values[key_size * (v * heads + h) + i] * scores[shift_scores + v];
outputs[shift_query + i] = query;
}
}
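//--- Unlike AttentionFeedForward, this kernel scores a single current
//--- query against 'units' cached keys and values, so the global work
//--- size equals the number of attention heads, not the sequence length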
//+------------------------------------------------------------------+
//| Kernel propagating the gradient of the GPT block to the Score level|
//+------------------------------------------------------------------+
__kernel void GPTCalcScoreGradient(__global TYPE *scores,
__global TYPE *scores_grad,
__global TYPE *values,
__global TYPE *values_grad,
__global TYPE *outputs_grad,
__global TYPE *scores_temp,
int window,
int units,
int current)
{
const int h = get_global_id(0);
const int heads = get_global_size(0);
int shift_value = window * (2 * heads + h);
int shift_score = units * h;
//--- Distribute the gradient to Values
for(int i = 0; i < window; i ++)
values_grad[shift_value + i] = scores[units * h + current] * outputs_grad[window * h + i];
//--- Distribute the gradient to Score
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
for(int i = 0; i < window; i++)
grad += outputs_grad[shift_value + i] * values[window * (k * heads + h) + i];
scores_temp[shift_score + k] = grad;
}
//--- Adjust for the Softmax derivative
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
TYPE score = scores[shift_score + k];
for(int i = 0; i < units; i++)
grad += scores[shift_score + i] * ((int)(i == k) - score) * scores_temp[shift_score + i];
scores_grad[shift_score + k] = grad;
}
}
//+------------------------------------------------------------------+
//| Kernel propagating the gradient of the GPT block to Querys and Keys|
//+------------------------------------------------------------------+
__kernel void GPTCalcHiddenGradient(__global TYPE *querys,
__global TYPE *querys_grad,
__global TYPE *keys,
__global TYPE *scores_grad,
int key_size,
int units,
int current)
{
const int h = get_global_id(0);
const int heads = get_global_size(0);
int shift_query = key_size * h;
int shift_key = key_size * (heads + h);
int shift_score = units * h;
//--- Distribute the gradients to Querys and Keys
const TYPE k = 1 / sqrt((TYPE)key_size);
//---
for(int i = 0; i < key_size; i++)
{
TYPE grad_q = 0;
TYPE grad_k = 0;
for(int s = 0; s < units; s++)
{
grad_q += keys[key_size * (s * heads + h) + i] * scores_grad[shift_score + s];
if(s == current)
grad_k += querys[key_size * h + i] * scores_grad[units * h + current];
}
querys_grad[shift_query + i] = grad_q * k;
querys_grad[shift_key + i] = grad_k * k;
}
}
//+------------------------------------------------------------------+
//| Kernel Feed Forward of the batch normalization layer |
//+------------------------------------------------------------------+
__kernel void BatchNormFeedForward(__global TYPE *inputs,
__global TYPE *options,
__global TYPE *weights,
__global TYPE *output,
int batch,
int total)
{
int n = get_global_id(0);
if(batch <= 1)
{
D4ToArray(output, ToVect4(inputs, n * 4, 1, total, 0), n * 4, 1, total, 0);
return;
}
int shift = n * 4;
int shift_options = n * 3 * 4;
int shift_weights = n * 2 * 4;
//---
TYPE4 inp = ToVect4(inputs, shift, 1, total, 0);
TYPE4 mean = ToVect4(options, shift, 3, total * 3, 0) * ((TYPE)batch - 1.0) + inp ;
if(options[shift_options ] > 0 && options[shift_options + 1] > 0)
mean /= (TYPE4)batch;
TYPE4 delt = inp - mean;
TYPE4 variance = ToVect4(options, shift, 3, total * 3, 1) * ((TYPE)batch - 1.0) + pow(delt, 2);
if(options[shift_options + 1] > 0)
variance /= (TYPE4)batch;
TYPE4 nx = delt / sqrt(variance + 1e-6);
//---
if(weights[shift_weights] == 0)
D4ToArray(weights, (TYPE4)1.0, shift, 2, total * 2, 0);
//---
TYPE4 res = ToVect4(weights, shift, 2, total * 2, 0) * nx + ToVect4(weights, shift, 2, total * 2, 1);
//---
D4ToArray(options, mean, shift, 3, total * 3, 0);
D4ToArray(options, variance, shift, 3, total * 3, 1);
D4ToArray(options, nx, shift, 3, total * 3, 2);
D4ToArray(output, res, shift, 1, total, 0);
}
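//--- Per-neuron layout used above: 'options' holds triples
//--- [mean, variance, normalized x], 'weights' holds pairs [gamma, beta];
//--- the result is res = gamma * x_hat + beta with
//--- x_hat = (x - mean) / sqrt(variance + 1e-6)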
//+------------------------------------------------------------------+
//| Kernel propagating the gradient through batch normalization |
//+------------------------------------------------------------------+
__kernel void BatchNormCalcHiddenGradient(__global TYPE *options,
__global TYPE *gradient,
__global TYPE *inputs,
__global TYPE *gradient_inputs,
__global TYPE *weights,
int batch,
int total
)
{
int n = get_global_id(0);
int shift = n * 4;
if(batch <= 1)
{
D4ToArray(gradient_inputs, ToVect4(gradient, shift, 1, total, 0), shift, 1, total, 0);
return;
}
//---
TYPE4 inp = ToVect4(inputs, shift, 1, total, 0);
TYPE4 gnx = ToVect4(gradient, shift, 1, total, 0) * ToVect4(weights, shift, 2, total * 2, 0);
TYPE4 temp = 1 / sqrt(ToVect4(options, shift, 3, total * 3, 1) + 1e-6);
TYPE4 delt = inp - ToVect4(options, shift, 3, total * 3, 0);
TYPE4 gvar = delt / (-2 * pow(ToVect4(options, shift, 3, total * 3, 1) + 1.0e-6, 3.0 / 2.0)) * gnx;
TYPE4 gmu = (-temp) * gnx - gvar * 2 * delt / (TYPE4)batch;
TYPE4 gx = temp * gnx + gmu / (TYPE4)batch + gvar * 2 * delt / (TYPE4)batch;
//---
D4ToArray(gradient_inputs, gx, shift, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel accumulating deltas of the batch normalization parameters |
//+------------------------------------------------------------------+
__kernel void BatchNormCalcDeltaWeights(__global TYPE *options,
__global TYPE *delta_weights,
__global TYPE *gradients)
{
const int n = get_global_id(0);
int shift_options = n * 3;
int shift_weights = n * 2;
//---
TYPE grad = gradients[n];
delta_weights[shift_weights] += grad * options[shift_options + 2];
delta_weights[shift_weights + 1] += grad;
}
//+------------------------------------------------------------------+
//| Kernel of element-wise multiplication of a vector by a mask |
//+------------------------------------------------------------------+
__kernel void MaskMult(__global TYPE *inputs,
__global TYPE *mask,
__global TYPE *outputs,
int outputs_total)
{
const int n = get_global_id(0) * 4;
//---
TYPE4 out = ToVect4(inputs, n, 1, outputs_total, 0) * ToVect4(mask, n, 1, outputs_total, 0);
D4ToArray(outputs, out, n, 1, outputs_total, 0);
}
//+------------------------------------------------------------------+