//--- NN_in_Trading/Experts/NeuroNet_DNG/NeuroNet.cl
//--- Snapshot: 2026-03-16 15:49:23 +02:00 (13756 lines, 539 KiB)
//--- Language: OpenCL C (kernel library)
/// \file
/// \brief NeuroNet.cl
/// Library consist OpenCL kernels
/// \author <A HREF="https://www.mql5.com/en/users/dng"> DNG </A>
/// \copyright Copyright 2019, DNG
//---
//--- by default some GPUs don't support double precision
//--- the cl_khr_fp64 extension is used to enable work with doubles
// #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define l1 1.0e-4f
#define l2 1.0e-4f
#define MAX_GRAD 1.0e-2f
#define LOCAL_ARRAY_SIZE 64
#define MAX_VALUE 3.4e37f
#define MIN_VALUE -MAX_VALUE
//--- Activation Functions
#define ActFunc_None -1
#define ActFunc_TANH 0
#define ActFunc_SIGMOID 1
#define ActFunc_LReLU 2
#define ActFunc_SoftPlus 3
#define ActFunc_GELU 4
#define ActFunc_MinusSoftPlus 5
#define ActFunc_ELU 6
//---
#define BarrierLoc barrier(CLK_LOCAL_MEM_FENCE);
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Sanitize a scalar: substitute `def_value` for NaN or infinite inputs.
inline float IsNaNOrInf(const float value, const float def_value)
  {
   return (isnan(value) || isinf(value)) ? def_value : value;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Sanitize a float2: if ANY component is NaN/Inf, the whole vector is
/// replaced by `def_value` (component-wise replacement is IsNaNOrInf4).
inline float2 IsNaNOrInf2(const float2 value, const float2 def_value)
  {
   const int bad_x = isnan(value.x) || isinf(value.x);
   const int bad_y = isnan(value.y) || isinf(value.y);
   return (bad_x || bad_y) ? def_value : value;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Sanitize a float4 component-wise: every NaN/Inf lane is replaced
/// with the scalar `def_value`; finite lanes pass through unchanged.
inline float4 IsNaNOrInf4(const float4 value, const float def_value)
  {
   return (float4)(IsNaNOrInf(value.s0, def_value),
                   IsNaNOrInf(value.s1, def_value),
                   IsNaNOrInf(value.s2, def_value),
                   IsNaNOrInf(value.s3, def_value));
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Apply the activation function selected by `function` (ActFunc_*) to
/// `value`. NaN/Inf inputs are sanitized to 0 before activation.
/// Fix: ActFunc_MinusSoftPlus now forwards the named constant
/// ActFunc_SoftPlus instead of the magic number 3.
float fActivation(const float value, const int function)
  {
   float result = IsNaNOrInf(value, 0);
   switch(function)
     {
      case ActFunc_TANH:
         //--- clamp the argument so exp() inside tanh cannot overflow
         result = tanh(clamp(result, -20.0f, 20.0f));
         break;
      case ActFunc_SIGMOID: //Sigmoid
         result = 1 / (1 + exp(clamp(-result, -20.0f, 20.0f)));
         break;
      case ActFunc_LReLU: //LReLU
         if(result < 0)
            result *= 0.01f;
         break;
      case ActFunc_SoftPlus: //SoftPlus
         //--- for large x, SoftPlus(x) ~ x; avoids overflow in exp()
         result = (result >= 20.0f ? result : IsNaNOrInf(log(1 + exp(result)), 0));
         break;
      case ActFunc_GELU: //GELU
         //--- sigmoid approximation of GELU: x * sigmoid(1.702 * x)
         result = result / (1 + exp(clamp(-1.702f * result, -20.0f, 20.0f)));
         break;
      case ActFunc_MinusSoftPlus: // -SoftPlus
         result = -fActivation(result, ActFunc_SoftPlus);
         break;
      case ActFunc_ELU: //ELU
         if(result < 0)
            result = IsNaNOrInf(exp(result), 0) - 1;
         break;
      default:
         //--- ActFunc_None and unknown codes: identity
         break;
     }
//---
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Convert an output-space gradient `grad` into an input-space gradient
/// for the given activation. `inp_value` is the ACTIVATED output of the
/// neuron — derivatives are expressed through the output value y.
/// Result is clipped to [-MAX_GRAD, MAX_GRAD]; NaN/Inf collapse to 0.
/// Fix: the GELU branch now uses the named constant ActFunc_SIGMOID
/// instead of the magic number 1.
float Deactivation(const float grad, const float inp_value, const int function)
  {
   float result = IsNaNOrInf(grad, 0);
//---
   if(isnan(inp_value) || isinf(inp_value))
      result = 0;
   else
      switch(function)
        {
         case ActFunc_TANH: //TANH
            //--- clamp target into the codomain, then d(tanh) = 1 - y^2
            result = clamp(grad + inp_value, -1.0f, 1.0f) - inp_value;
            result *= 1.0f - inp_value * inp_value;
            break;
         case ActFunc_SIGMOID: //Sigmoid
            //--- clamp target into [0,1], then d(sigmoid) = y * (1 - y)
            result = clamp(grad + inp_value, 0.0f, 1.0f) - inp_value;
            result *= inp_value * (1.0f - inp_value);
            break;
         case ActFunc_LReLU: //LReLU
            if(inp_value < 0)
               result *= 0.01f;
            break;
         case ActFunc_SoftPlus: //SoftPlus
            //--- d(SoftPlus)/dx = sigmoid(x) = 1 - exp(-y)
            result *= (1.0f - exp(-inp_value));
            break;
         case ActFunc_GELU: //GELU
            if(inp_value < 0.9f)
               result *= fActivation(5 * inp_value, ActFunc_SIGMOID);
            break;
         case ActFunc_MinusSoftPlus: // -SoftPlus
            //--- chain rule through y = -SoftPlus(x): negate grad and y
            result = Deactivation(-result, -inp_value, ActFunc_SoftPlus);
            break;
         case ActFunc_ELU: //ELU
            //--- for x < 0: y = exp(x) - 1, so dy/dx = exp(x) = y + 1
            if(inp_value < 0)
               result *= inp_value + 1;
            break;
         default:
            break;
        }
//---
   return clamp(IsNaNOrInf(result, 0), -MAX_GRAD, MAX_GRAD);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Flatten a (variable, row, col) coordinate into a linear buffer index
/// for a tensor laid out as [variable][total_rows][total_cols].
inline int RCtoFlat(const int row,
                    const int col,
                    const int total_rows,
                    const int total_cols,
                    const int variable)
  {
   const int row_index = variable * total_rows + row;
   return row_index * total_cols + col;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Work-group maximum reduction of `value` along local dimension `loc`.
/// `Temp` must be a __local scratch buffer of LOCAL_ARRAY_SIZE floats.
/// Contains barriers — every work-item of the group must call it.
/// NaN/Inf inputs are treated as MIN_VALUE (i.e. ignored by the max).
float LocalMax(const float value, const int loc, __local float* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   //--- at most LOCAL_ARRAY_SIZE slots take part in the tree reduction
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
   float val = IsNaNOrInf(value, MIN_VALUE);
//--- Look Max
   //--- first ls work-items publish their value
   if(id < ls)
      Temp[id] = val;
   BarrierLoc
   //--- fold work-items beyond the first ls into the ls slots
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls) &&
         (Temp[id - d] < val))
         Temp[id - d] = val;
      BarrierLoc
     }
//---
   //--- pairwise tree reduction of the ls slots down to Temp[0]
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls && Temp[id] < Temp[id + count])
         Temp[id] = Temp[id + count];
      BarrierLoc
     }
   while(count > 1);
//---
   return Temp[0];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Work-group minimum reduction of `value` along local dimension `loc`.
/// Mirror of LocalMax: `Temp` is __local scratch of LOCAL_ARRAY_SIZE
/// floats, barriers inside require uniform participation by the group.
/// NaN/Inf inputs are treated as MAX_VALUE (ignored by the min).
float LocalMin(const float value, const int loc, __local float* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   //--- at most LOCAL_ARRAY_SIZE slots take part in the tree reduction
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
   float val = IsNaNOrInf(value, MAX_VALUE);
//--- Look Min
   //--- first ls work-items publish their value
   if(id < ls)
      Temp[id] = val;
   BarrierLoc
   //--- fold work-items beyond the first ls into the ls slots
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls) &&
         (Temp[id - d] > val))
         Temp[id - d] = val;
      BarrierLoc
     }
//---
   //--- pairwise tree reduction of the ls slots down to Temp[0]
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls && Temp[id] > Temp[id + count])
         Temp[id] = Temp[id + count];
      BarrierLoc
     }
   while(count > 1);
//---
   return Temp[0];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Work-group sum reduction of `value` along local dimension `loc`.
/// `Temp` is __local scratch of LOCAL_ARRAY_SIZE floats. Contains
/// barriers, so every work-item of the group must call it (the early
/// return below is uniform: total is the same for the whole group).
/// NaN/Inf contributions are replaced with 0.
float LocalSum(const float value, const int loc, __local float* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   //--- single work-item along this dimension: nothing to reduce
   if(total <= 1)
      return IsNaNOrInf(value, 0.0f);
//---
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
//--- Sum
   float result = IsNaNOrInf(value, 0);
   //--- first ls work-items publish their value
   if(id < ls)
      Temp[id] = result;
   BarrierLoc
   //--- fold work-items beyond the first ls into the ls slots
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls))
         Temp[id - d] = Temp[id - d] + result;
      BarrierLoc
     }
//---
   //--- pairwise tree reduction of the ls slots down to Temp[0]
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls)
        {
         Temp[id] += Temp[id + count];
         Temp[id + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
   result = IsNaNOrInf(Temp[0], 0);
//---
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Vectorized work-group sum reduction: same algorithm as LocalSum but
/// over float4 lanes (each lane is reduced independently).
/// `Temp` is __local scratch of LOCAL_ARRAY_SIZE float4s. Contains
/// barriers — uniform participation required; the early return is
/// uniform because total is identical across the group.
float4 LocalSum4(const float4 value, const int loc, __local float4* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   //--- single work-item along this dimension: nothing to reduce
   if(total <= 1)
      return IsNaNOrInf4(value, 0.0f);
//---
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
//---
   float4 result = IsNaNOrInf4(value, 0.0f);
   //--- first ls work-items publish their value
   if(id < ls)
      Temp[id] = result;
   BarrierLoc
   //--- fold work-items beyond the first ls into the ls slots
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls))
         Temp[id - d] = Temp[id - d] + result;
      BarrierLoc
     }
//---
   //--- pairwise tree reduction of the ls slots down to Temp[0]
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls)
        {
         Temp[id] += Temp[id + count];
         Temp[id + count] = (float4)0.0f;
        }
      BarrierLoc
     }
   while(count > 1);
//---
   result = IsNaNOrInf4(Temp[0], 0.0f);
//---
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Numerically stable work-group SoftMax of `value` along local
/// dimension `loc` (subtracts the group max before exponentiation).
/// Contains barriers via LocalMax/LocalSum — every work-item of the
/// group must call it. Returns this work-item's normalized probability.
float LocalSoftMax(const float value, const int loc, __local float* Temp)
  {
//--- Look Max
   float max = LocalMax(value, loc, Temp);
   //--- max == MIN_VALUE means every input was NaN/Inf: nothing to do
   if(max == MIN_VALUE)
      return 0.0f;
//--- SoftMax
   //--- MIN_VALUE marks a masked/invalid entry; it contributes 0
   float result = (value == MIN_VALUE ? 0.0f : IsNaNOrInf(exp(value - max), 0.0f));
   const float sum = LocalSum(result, loc, Temp);
   if(sum == 0.0f)
      result = 0;
   else
      result = IsNaNOrInf(result / sum, 0.0f);
//---
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Back-propagate a gradient through a work-group SoftMax.
/// `value` is this item's SoftMax output y_i, `grad` its incoming
/// gradient g_i. Contains a barrier via LocalSum — must be called by
/// every work-item of the group.
float LocalSoftMaxGrad(const float value, const float grad, const int loc, __local float* Temp)
  {
   //--- sanitize both operands before the group reduction
   const float y = IsNaNOrInf(value, 0.0f);
   const float g = IsNaNOrInf(grad, 0.0f);
   //--- group-wide sum of y_j * g_j
   const float weighted_sum = LocalSum(y * g, loc, Temp);
//--- d_i = y_i * (g_i - sum_j(y_j * g_j))
   return IsNaNOrInf(y * (g - weighted_sum), 0.0f);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_ff Feed forward process kernel
/// Describes the forward path process for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8435#para41">the link.</A>
//+------------------------------------------------------------------+
/// Dense-layer forward pass. Global dim 0 enumerates output neurons;
/// local dim 1 work-items cooperate on one neuron's dot product, each
/// striding over the inputs, then combine partial sums via LocalSum.
/// The weight row for neuron i holds `inputs` weights plus a trailing
/// bias at index `inputs`.
__kernel void FeedForward(__global const float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number
                                                          ///< of neurons in layer and n - number of outputs
                                                          ///< (neurons in next layer)
                          __global const float *matrix_i, ///<[in] Inputs tensor
                          __global float *matrix_o,       ///<[out] Output tensor
                          const int inputs,               ///< Number of inputs
                          const int activation            ///< Activation type (#ENUM_ACTIVATION)
                         )
  {
   const int i = get_global_id(0);
   const int total_out = get_global_size(0);
   const int loc = get_local_id(1);
   const int total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   float sum = 0;
   float inp;
   //--- start of neuron i's weight row (row length is inputs + 1)
   int shift = RCtoFlat(i, 0, total_out, (inputs + 1), 0);
   //--- strided partial dot product over this work-item's share
   for(int k = loc; k < inputs; k += total_loc)
     {
      inp = IsNaNOrInf(matrix_i[k], 0.0f);
      //--- zero input contributes nothing; skip the multiply
      if(inp == 0.0f)
         continue;
      sum += IsNaNOrInf(inp * matrix_w[shift + k], 0.0f);
     }
   //--- bias term, added once by the first local work-item
   if(loc == 0)
      sum += IsNaNOrInf(matrix_w[shift + inputs], 0.0f);
   //--- combine partial sums (LocalSum contains barriers; this branch
   //--- is uniform across the group)
   if(total_loc > 1)
      sum = LocalSum(sum, 1, Temp);
//---
   if(loc == 0)
      matrix_o[i] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr Neuron Base Output Gradients Calculation kernel
/// Describes the process of output gradients calculation for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8435#para42">the link.</A>
//+------------------------------------------------------------------+
/// Output-layer gradient: one work-item per neuron computes
/// Deactivation(target - output) and stores it. A NaN/Inf output
/// yields a zero gradient. The `error` parameter is accepted for
/// interface compatibility but is not used here.
__kernel void CalcOutputGradient(__global float *matrix_t,  ///<[in] Target tensor
                                 __global float *matrix_o,  ///<[in] Output tensor
                                 __global float *matrix_ig, ///<[out] Tensor of gradients
                                 int activation,            ///< Activation type (#ENUM_ACTIVATION)
                                 float error)
  {
   const int i = get_global_id(0);
   const float out = matrix_o[i];
   float gradient = 0;
   if(!(isnan(out) || isinf(out)))
      gradient = Deactivation(matrix_t[i] - out, out, activation);
   matrix_ig[i] = gradient;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr Neuron Base Hidden Gradients Calculation kernel
/// Describes the process of hidden gradients calculation for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8435#para42">the link.</A>
//+------------------------------------------------------------------+
/// Hidden-layer gradient. Global dim 0 enumerates previous-layer
/// neurons (inputs); local dim 1 work-items cooperate, each taking
/// 4 outputs at a time via float4 dot products. The switch handles the
/// 1/2/3-element tail when `outputs` is not a multiple of the stride.
/// Weight layout: output k's row starts at k * (inputs + 1).
__kernel void CalcHiddenGradient(__global float *matrix_w,  ///<[in] Weights matrix (m+1)*n, where m - number
                                                            ///< of neurons in previous layer and n - number
                                                            ///< of neurons in current layer
                                 __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                 __global float *matrix_o,  ///<[in] Previous layer Output tensor
                                 __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                 int outputs,               ///< Number of outputs
                                 int activation             ///< Activation type (#ENUM_ACTIVATION)
                                )
  {
   const int i = get_global_id(0);
   const int inputs = get_global_size(0);
   const int loc = get_local_id(1);
   const int total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   float sum = 0;
   float out = matrix_o[i];
   float4 grad, weight;
   //--- each local work-item strides over outputs in chunks of 4
   for(int k = 4 * loc; k < outputs; k += 4 * total_loc)
     {
      //--- pack up to 4 gradients/weights; pad the tail with zeros
      switch(outputs - k)
        {
         case 1:
            weight = (float4)(matrix_w[k * (inputs + 1) + i], 0, 0, 0);
            grad = (float4)(matrix_g[k], 0, 0, 0);
            break;
         case 2:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], 0, 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i],
                              matrix_w[(k + 1) * (inputs + 1) + i], 0, 0);
            break;
         case 3:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i],
                              matrix_w[(k + 1) * (inputs + 1) + i],
                              matrix_w[(k + 2) * (inputs + 1) + i], 0);
            break;
         default:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2],
                            matrix_g[k + 3]);
            weight = (float4)(matrix_w[k * (inputs + 1) + i],
                              matrix_w[(k + 1) * (inputs + 1) + i],
                              matrix_w[(k + 2) * (inputs + 1) + i],
                              matrix_w[(k + 3) * (inputs + 1) + i]);
            break;
        }
//---
      weight = IsNaNOrInf4(weight, 0);
      grad = IsNaNOrInf4(grad, 0);
//---
      sum += dot(grad, weight);
     }
   //--- combine partial sums (contains barriers; branch is uniform)
   if(total_loc > 1)
      sum = LocalSum(sum, 1, Temp);
//---
   matrix_ig[i] = Deactivation(sum, out, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base SGD Updating Weights Calculation kernel
/// Describes the process of SGD optimization weights for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8435#para43">the link.</A>
//+------------------------------------------------------------------+
/// SGD-with-momentum weight update: one work-item per weight.
/// Global dim 0 = current-layer neuron, dim 1 = input index
/// (j == inputs addresses the bias, which uses a virtual input of 1).
/// delta = lr * clipped_grad * input + momentum * previous_delta.
__kernel void UpdateWeightsMomentum(__global float *matrix_w,  ///<[in,out] Weights matrix (m+1)*n, where m -
                                                               ///< number of neurons in previous layer and n -
                                                               ///< number of neurons in current layer
                                    __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                    __global float *matrix_i,  ///<[in] Inputs tensor
                                    __global float *matrix_dw, ///<[in,out] Matrix of delta weights in last correction
                                    int inputs,                ///< Number of inputs
                                    float learning_rates,      ///< Learning rates
                                    float momentum             ///< Momentum multiplier
                                   )
  {
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int wi = i * (inputs + 1) + j;
   //--- clip the neuron's gradient before scaling
   const float grad = clamp(matrix_g[i], -MAX_GRAD, MAX_GRAD);
   //--- bias slot (j == inputs) behaves like an input of 1
   const float input_val = (j < inputs ? matrix_i[j] : 1);
   const float delta = IsNaNOrInf(learning_rates * grad * input_val, 0) +
                       IsNaNOrInf(momentum * matrix_dw[wi], 0);
   matrix_dw[wi] = delta;
   if(fabs(delta) > 0)
      matrix_w[wi] = IsNaNOrInf(matrix_w[wi] + delta, 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base Adam Updating Weights Calculation
/// kernel
/// Describes the process of Adam optimization weights for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8598#para31">the link.</A>
//+------------------------------------------------------------------+
/// Adam weight update: one work-item per weight. Global dim 0 =
/// current-layer neuron, dim 1 = input index (j == inputs is the bias).
/// Applies l1/l2 regularization (file-level constants) inside the step.
/// NOTE(review): no bias-correction of mt/vt is performed here —
/// presumably intentional for this library; confirm against the host
/// code before changing.
__kernel void UpdateWeightsAdam(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m -
                                                                ///< number of neurons in previous layer and n -
                                                                ///< number of neurons in current layer
                                __global const float
                                *matrix_g,                      ///<[in] Tensor of gradients at current layer
                                __global const float *matrix_i, ///<[in] Inputs tensor
                                __global float *matrix_m,       ///<[in,out] Matrix of first momentum
                                __global float *matrix_v,       ///<[in,out] Matrix of second momentum
                                const int inputs,               ///< Number of inputs
                                const float l,                  ///< Learning rates
                                const float b1,                 ///< First momentum multiplier
                                const float b2                  ///< Second momentum multiplier
                               )
  {
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int wi = i * (inputs + 1) + j;
   float m, v, weight, inp;
   //--- bias slot (j == inputs) uses a virtual input of 1
   inp = IsNaNOrInf((j == inputs ? 1.0f : matrix_i[j]), 0);
   weight = IsNaNOrInf(matrix_w[wi], 0);
   m = IsNaNOrInf(matrix_m[wi], 0);
   v = IsNaNOrInf(matrix_v[wi], 0);
//---
   //--- per-weight gradient, clipped
   float g = clamp(IsNaNOrInf(matrix_g[i] * inp, 0), -MAX_GRAD, MAX_GRAD);
   //--- exponential moving averages of gradient and squared gradient
   float mt = IsNaNOrInf(b1 * m + (1 - b1) * g, 0);
   float vt = IsNaNOrInf(b2 * v + (1 - b2) * (g * g), 0);
   //--- Adam step with l1 (sign) and l2 (weight decay) regularization;
   //--- 1.0e-37f guards against division by zero
   float delta =
      IsNaNOrInf(l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight)), 0);
   if(fabs(delta) > 0)
      matrix_w[wi] = IsNaNOrInf(matrix_w[wi] + delta, 0);
   matrix_m[wi] = mt;
   matrix_v[wi] = vt;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base Least Squares Updating Weights
/// Calculation kernel
/// Describes the process of Least Squares optimization weights for the Neuron
/// Base (#CNeuronBaseOCL).
//\details Detailed description on <A
// HREF="https://www.mql5.com/ru/articles/8598#para31">the link.</A>
//+------------------------------------------------------------------+
/// Least-squares weight update: each work-item handles 4 consecutive
/// weights of neuron i (dim 1 index j addresses weights j*4 .. j*4+3).
/// In accumulation mode (update == 0) it adds x*g and x*x sums; when
/// `update` is set it applies delta = l * sum(xg) / sum(xx) and resets
/// the accumulators. The second switch intentionally falls through so
/// each reachable lane is processed exactly once.
/// NOTE(review): case 0 of the first switch (j*4 == inputs + 1) makes
/// wi point past neuron i's row — presumably the host never launches
/// that configuration; verify the global work size against (inputs+1)/4.
__kernel void UpdateWeightsLS(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m -
                                                              ///< number of neurons in previous layer and n -
                                                              ///< number of neurons in current layer
                              __global const float
                              *matrix_g,                      ///<[in] Tensor of gradients at current layer
                              __global const float *matrix_i, ///<[in] Inputs tensor
                              __global float *matrix_xg,      ///<[in,out] Matrix of summ x*g
                              __global float *matrix_xx,      ///<[in,out] Matrix of summ x*x
                              const int inputs,               ///< Number of inputs
                              const float l,                  ///< Learning rates
                              const int update                ///< Update flag
                             )
  {
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int wi = i * (inputs + 1) + j * 4;
   float4 xg, xx, weight, inp;
   //--- pack up to 4 inputs; the lane after the last real input is the
   //--- bias (virtual input 1), remaining lanes are zero-padded
   switch(inputs + 1 - j * 4)
     {
      case 0:
         inp = (float4)(1, 0, 0, 0);
         weight = (float4)(matrix_w[wi], 0, 0, 0);
         break;
      case 1:
         inp = (float4)(matrix_i[j * 4], 1, 0, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
         break;
      case 2:
         inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], 1, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
         break;
      case 3:
         inp =
            (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], 1);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2],
                           matrix_w[wi + 3]);
         break;
      default:
         inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2],
                        matrix_i[j * 4 + 3]);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2],
                           matrix_w[wi + 3]);
         break;
     }
   //--- this step's contributions: x*g and x*x per lane
   xg = (float4)(matrix_g[i]) * inp;
   xx = inp * inp;
   //--- process lanes high-to-low; every case falls through to case 0
   switch(min(inputs + 1 - j * 4, 3))
     {
      case 3:
         if(update)
           {
            matrix_w[wi + 3] =
               matrix_w[wi + 3] + l * (matrix_xg[wi + 3] + xg.s3) /
               (matrix_xx[wi + 3] + xx.s3 + 1.0e-37f);
            matrix_xg[wi + 3] = 0;
            matrix_xx[wi + 3] = 0;
           }
         else
           {
            matrix_xg[wi + 3] += xg.s3;
            matrix_xx[wi + 3] += xx.s3;
           }
      /* fallthrough */
      case 2:
         if(update)
           {
            matrix_w[wi + 2] =
               matrix_w[wi + 2] + l * (matrix_xg[wi + 2] + xg.s2) /
               (matrix_xx[wi + 2] + xx.s2 + 1.0e-37f);
            matrix_xg[wi + 2] = 0;
            matrix_xx[wi + 2] = 0;
           }
         else
           {
            matrix_xg[wi + 2] += xg.s2;
            matrix_xx[wi + 2] += xx.s2;
           }
      /* fallthrough */
      case 1:
         if(update)
           {
            matrix_w[wi + 1] =
               matrix_w[wi + 1] + l * (matrix_xg[wi + 1] + xg.s1) /
               (matrix_xx[wi + 1] + xx.s1 + 1.0e-37f);
            matrix_xg[wi + 1] = 0;
            matrix_xx[wi + 1] = 0;
           }
         else
           {
            matrix_xg[wi + 1] += xg.s1;
            matrix_xx[wi + 1] += xx.s1;
           }
      /* fallthrough */
      case 0:
         if(update)
           {
            matrix_w[wi] = matrix_w[wi] + l * (matrix_xg[wi] + xg.s0) /
                           (matrix_xx[wi] + xx.s0 + 1.0e-37f);
            matrix_xg[wi] = 0;
            matrix_xx[wi] = 0;
           }
         else
           {
            matrix_xg[wi] += xg.s0;
            matrix_xx[wi] += xx.s0;
           }
         break;
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_proof_ff
/// Kernel of the Pooling neuron for Feed forward process (#CNeuronProofOCL)
//+------------------------------------------------------------------+
/// Max-pooling forward pass: one work-item per output takes the
/// maximum of a `window`-wide slice starting at i * step, clipped to
/// the input length.
__kernel void FeedForwardProof(__global float *matrix_i, ///<[in] Inputs tensor
                               __global float *matrix_o, ///<[out] Output tensor
                               int inputs,               ///< Number of inputs
                               int window,               ///< Size of input window
                               int step                  ///< Step size
                              )
  {
   const int i = get_global_id(0);
   const int start = i * step;
   //--- first element seeds the running maximum
   float best = matrix_i[start];
   //--- window end, clipped so we never read past the input buffer
   const int last = min(start + window, inputs);
   for(int pos = start + 1; pos < last; pos++)
      best = max(best, matrix_i[pos]);
   matrix_o[i] = best;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_proof_gr
/// Kernel of the Pooling neuron to transfer gradient to previous layer
/// (#CNeuronProofOCL)
//+------------------------------------------------------------------+
/// Max-pooling backward pass: one work-item per input element. It sums
/// the gradients of every pooling window whose maximum equals this
/// input's value (ties credit every matching window). start/stop bound
/// the range of output windows that can contain input i.
__kernel void CalcInputGradientProof(__global float *matrix_i,  ///<[in] Inputs tensor
                                     __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                     __global float *matrix_o,  ///<[in] Output tensor
                                     __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                     int outputs,               ///< Number of outputs
                                     int window,                ///< Size of input window
                                     int step                   ///< Step size
                                    )
  {
   int i = get_global_id(0);
   float prev_gradient = 0;
   float value = matrix_i[i];
   //--- first window index whose span [out*step, out*step+window) can
   //--- still cover input i
   int start = i - window + step;
   start = (start - start % step) / step;
   //--- one past the last window starting at or before i
   int stop = (i - i % step) / step + 1;
   for(int out = max(0, start); out < min(outputs, stop); out++)
     {
      //--- this input was the window's max: it receives the gradient
      if(value == matrix_o[out])
         prev_gradient += matrix_g[out];
     }
   matrix_ig[i] = prev_gradient;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_ff
/// Kernel of the Convolution neuron for Feed forward process (#CNeuronConvOCL)
//+------------------------------------------------------------------+
/// Convolution forward pass. Global dim 0 = window position, dim 1 =
/// output filter, dim 2 = variable (independent channel). Each
/// work-item computes one filter's dot product over one input window
/// (truncated at the end of the input) plus the trailing bias weight.
/// Fix: removed a stray double semicolon on the output store.
__kernel void FeedForwardConv(__global const float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input
                                                              ///< window and n - output window
                              __global const float *matrix_i, ///<[in] Inputs tensor
                              __global float *matrix_o,       ///<[out] Output tensor
                              const int inputs,               ///< Number of inputs
                              const int step,                 ///< Step size
                              const int window_in,            ///< Size of input window
                              const int window_out,           ///< Size of output window
                              const int activation            ///< Activation type (#ENUM_ACTIVATION)
                             )
  {
   const size_t i = get_global_id(0);
   const int out = get_global_id(1);
   const size_t v = get_global_id(2);
   const size_t outputs = get_global_size(0);
//---
   const int shift_out = window_out * i;
   const int shift_in = step * i;
//--- per-variable offsets into the input, output and weight buffers
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * window_out * outputs;
   const int shift_var_w = v * window_out * (window_in + 1);
//---
   float sum = 0;
   float inp;
//--- filter `out` owns a row of window_in weights plus a bias
   int shift = (window_in + 1) * out;
   //--- truncate the window at the end of the input buffer
   int stop = (window_in <= (inputs - shift_in) ? window_in : (inputs - shift_in));
   for(int k = 0; k < stop; k ++)
     {
      inp = IsNaNOrInf(matrix_i[shift_var_in + shift_in + k], 0.0f);
      //--- zero input contributes nothing; skip the multiply
      if(inp == 0.0f)
         continue;
      sum += IsNaNOrInf(inp * matrix_w[shift_var_w + shift + k], 0.0f);
     }
   //--- bias term at the end of the filter's weight row
   sum += IsNaNOrInf(matrix_w[shift_var_w + shift + window_in], 0.0f);
//---
   matrix_o[shift_var_out + out + shift_out] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_gr
/// Kernel of the Convolution neuron to transfer gradient
/// to previous layer (#CNeuronConvOCL)
//+------------------------------------------------------------------+
/// Convolution backward pass to the previous layer. Global dim 0 =
/// input element, dim 1 = variable. For each input element it sums
/// grad * weight over every (window position, filter) pair that read
/// this element in the forward pass.
/// NOTE(review): the start/stop/shift_w index arithmetic below is
/// unusual (stop depends on both i and w_start, and shift_w counts
/// back from stop) — presumably matched to FeedForwardConv's layout;
/// verify against the host-side launch configuration before changing.
__kernel void CalcHiddenGradientConv(__global float *matrix_w,  ///<[in] Weights matrix (m+1)*n, where m - input
                                                                ///< window and n - output window
                                     __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                     __global float *matrix_o,  ///<[in] Output tensor
                                     __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                     const int outputs,         ///< Number of outputs
                                     const int step,            ///< Step size
                                     const int window_in,       ///< Size of input window
                                     const int window_out,      ///< Size of output window
                                     const int activation,      ///< Activation type (#ENUM_ACTIVATION)
                                     const int shift_out        ///< Shift in output and gradient buffer
                                    )
  {
   const size_t i = get_global_id(0);
   const size_t inputs = get_global_size(0);
   const size_t v = get_global_id(1);
//--- per-variable offsets
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * outputs;
   const int shift_var_w = v * window_out * (window_in + 1);
//---
   float sum = 0;
   float out = matrix_o[shift_var_in + i];
   //--- range of window positions whose span covers input i
   const int w_start = i % step;
   const int start = max((int)((i - window_in + step) / step), 0);
   int stop = (w_start + step - 1) / step;
   stop = min((int)((i + step - 1) / step + 1), stop) + start;
   if(stop > (outputs / window_out))
      stop = outputs / window_out;
   for(int h = 0; h < window_out; h ++)
     {
      for(int k = start; k < stop; k++)
        {
         //--- gradient index of filter h at window position k
         int shift_g = k * window_out + h;
         //--- weight index of input i inside filter h for position k
         int shift_w = (stop - k - 1) * step + i % step + h * (window_in + 1);
         if(shift_g >= outputs || shift_w >= (window_in + 1) * window_out)
            break;
         float grad = IsNaNOrInf(matrix_g[shift_out + shift_g + shift_var_out], 0.0f);
         if(fabs(grad) > 0.0f)
            sum += IsNaNOrInf(grad * matrix_w[shift_w + shift_var_w], 0.0f);
        }
     }
//---
   matrix_ig[shift_var_in + i] = Deactivation(sum, out, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron SGD optimization Updating Weights
/// Calculation kernel
/// Describes the process of SGD optimization weights for the Convolution Neuron
/// (#CNeuronConvOCL).
//+------------------------------------------------------------------+
/// SGD-with-momentum update for convolution weights: one work-item per
/// weight. The flat index i decomposes into variable v, filter
/// (shift_out) and position within the filter row (shift; shift ==
/// window_in is the bias). The gradient is accumulated over all
/// convolution positions before the momentum step.
/// Fixes (aligned with the UpdateWeightsConvAdam twin kernel):
/// - shift_out now subtracts v * window_out, not v; the old form was
///   only correct for window_out == 1.
/// - total now counts all convolution positions; the old formula lost
///   one position and yielded zero when inputs == window_in.
__kernel void UpdateWeightsConvMomentum(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
                                                                  ///< input window and n - output window
                                        __global float *matrix_g, ///<[in] Tensor of gradients at current layer
                                        __global float *matrix_i, ///<[in] Inputs tensor
                                        __global float
                                        *matrix_dw,               ///<[in,out] Matrix of delta weights in last correction
                                        int inputs,               ///< Number of inputs
                                        float learning_rates,     ///< Learning rates
                                        float momentum,           ///< Momentum multiplier
                                        int window_in,            ///< Size of input window
                                        int window_out,           ///< Size of output window
                                        int step                  ///< Step size
                                       )
  {
   const size_t i = get_global_id(0);
//--- decompose the flat weight index
   const int v = i / ((window_in + 1) * window_out);
   const int shift = i % (window_in + 1);
   const int shift_out = i / (window_in + 1) - v * window_out;
   const int total = (inputs - (window_in - step) + (step - 1)) / step;
//--- per-variable offsets
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * total * window_out;
//---
   float grad = 0;
//--- accumulate gradient over all convolution positions
   for(int t = 0; t < total; t++)
     {
      if(shift != window_in && (shift + t * window_in) >= inputs)
         break;
      //--- bias weight (shift == window_in) uses a virtual input of 1
      grad += IsNaNOrInf(matrix_g[t * window_out + shift_out + shift_var_out] *
                         (shift == window_in ? 1 : matrix_i[shift + t * step + shift_var_in]),
                         0.0f);
     }
   float delta = IsNaNOrInf(learning_rates * grad, 0) + momentum * matrix_dw[i];
   if(!isnan(delta))
     {
      matrix_dw[i] = delta;
      if(fabs(delta) > 0)
         matrix_w[i] = IsNaNOrInf(matrix_w[i] + delta, 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating
/// Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Convolution
/// Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
/// Adam update for convolution weights: one work-item per weight.
/// The flat index i decomposes into variable v, filter (shift_out) and
/// position within the filter row (shift; shift == window_in is the
/// bias). The gradient is accumulated over all convolution positions,
/// clipped, then folded into the Adam moments.
/// NOTE(review): unlike the base UpdateWeightsAdam, no l1/l2
/// regularization is applied here, and vt's NaN fallback is 1.0f —
/// presumably deliberate; confirm against the host code.
__kernel void UpdateWeightsConvAdam(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m -
                                                                    ///< input window and n - output window
                                    __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                    __global const float *matrix_i, ///<[in] Inputs tensor
                                    __global float *matrix_m,       ///<[in] Matrix of first momentum
                                    __global float *matrix_v,       ///<[in] Matrix of second momentum
                                    const int inputs,               ///< Number of inputs
                                    const float l,                  ///< Learning rates
                                    const float b1,                 ///< First momentum multiplier
                                    const float b2,                 ///< Second momentum multiplier
                                    int window_in,                  ///< Size of input window
                                    int window_out,                 ///< Size of output window
                                    int step                        ///< Step size
                                   )
  {
   const size_t i = get_global_id(0);
//--- decompose the flat weight index
   const int v = i / ((window_in + 1) * window_out);
   const int shift = i % (window_in + 1);
   const int shift_out = i / (window_in + 1) - v * window_out;
   const int total = (inputs - (window_in - step) + (step - 1)) / step;
//--- per-variable offsets
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * total * window_out;
//---
   float grad = 0;
//--- accumulate gradient over all convolution positions
   for(int t = 0; t < total; t++)
     {
      if(shift != window_in && (shift + t * window_in) >= inputs)
         break;
      //--- bias weight (shift == window_in) uses a virtual input of 1
      grad += IsNaNOrInf(matrix_g[t * window_out + shift_out + shift_var_out] *
                         (shift == window_in ? 1 : matrix_i[shift + t * step + shift_var_in]), 0);
     }
   grad = clamp(IsNaNOrInf(grad, 0), -MAX_GRAD, MAX_GRAD);
   //--- exponential moving averages of gradient and squared gradient
   float mt = IsNaNOrInf(b1 * matrix_m[i] + (1 - b1) * grad, 0);
   float vt = IsNaNOrInf(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0f);
   float weight = IsNaNOrInf(matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0), 0);
   matrix_w[i] = weight;
   matrix_m[i] = mt;
   matrix_v[i] = vt;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Least Squares optimization
/// Updating Weights Calculation kernel
/// Describes the process of Least Squares optimization weights for the
/// Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
/// Least-squares update for convolution weights: one work-item per
/// position i in the filter row (i == window_in is the bias). Filters
/// are processed in groups of 4 when more than 4 remain, otherwise one
/// at a time. Accumulation mode (update == 0) sums x*g and x*x;
/// update mode applies delta = l * sum(xg) / sum(xx) and resets.
/// NOTE(review): the chained xx assignment in the accumulate branch
/// sets four slots to the same value — this is consistent only because
/// x*x does not depend on the output filter, so all four slots always
/// hold identical sums; do not "fix" it without accounting for that.
__kernel void UpdateWeightsConvLS(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m -
                                                                  ///< input window and n - output window
                                  __global const float
                                  *matrix_g,                      ///<[in] Tensor of gradients at current layer
                                  __global const float *matrix_i, ///<[in] Inputs tensor
                                  __global float *matrix_xg,      ///<[in] Matrix of summ x*g
                                  __global float *matrix_xx,      ///<[in] Matrix of summ x*x
                                  const int inputs,               ///< Number of inputs
                                  const float l,                  ///< Learning rates
                                  const int update,               ///< Update flag
                                  int window_in,                  ///< Size of input window
                                  int window_out,                 ///< Size of output window
                                  int step                        ///< Step size
                                 )
  {
   const int i = get_global_id(0);
   //--- indices past the bias slot have no weight to update
   if(i > window_in)
      return;
//--- number of convolution positions (rounded up for a partial window)
   int total = (inputs - (window_in - step)) % step;
   total = (inputs - (window_in - step) - total) / step + (total > 0 ? 1 : 0);
//---
   for(int out = 0; out < window_out; out++)
     {
      //--- vectorized path: 4 output filters at once
      if((window_out - out) > 4)
        {
         float4 xg = {0, 0, 0, 0};
         float x2 = 0;
         int shift_w = i + out * (window_in + 1);
         for(int t = 0; t < total; t++)
           {
            if(i != window_in && (i + t * window_in) >= inputs)
               break;
            //--- bias position (i == window_in) uses a virtual input 1
            xg += (float4)(matrix_g[t * window_out + out],
                           matrix_g[t * window_out + out + 1],
                           matrix_g[t * window_out + out + 2],
                           matrix_g[t * window_out + out + 3]) *
                  (i == window_in ? 1 : matrix_i[i + t * step]);
            //--- x*x is shared by all 4 filters (depends only on input)
            x2 += (i == window_in ? 1 : matrix_i[i + t * step] * matrix_i[i + t * step]);
           }
         if(update)
           {
            //--- fold stored accumulators into this step's sums
            xg = (float4)(matrix_xg[shift_w], matrix_xg[shift_w + window_in + 1],
                          matrix_xg[shift_w + 2 * (window_in + 1)],
                          matrix_xg[shift_w + 3 * (window_in + 1)]) +
                 xg;
            float4 xx =
               (float4)(matrix_xx[shift_w], matrix_xx[shift_w + window_in + 1],
                        matrix_xx[shift_w + 2 * (window_in + 1)],
                        matrix_xx[shift_w + 3 * (window_in + 1)]) +
               x2;
            //--- 1.0e-37f guards against division by zero
            float4 delta = l * xg / (xx + 1.0e-37f);
            float4 weight =
               (float4)(matrix_w[shift_w], matrix_w[shift_w + (window_in + 1)],
                        matrix_w[shift_w + 2 * (window_in + 1)],
                        matrix_w[shift_w + 3 * (window_in + 1)]) +
               delta;
            matrix_w[shift_w] = weight.s0;
            matrix_w[shift_w + (window_in + 1)] = weight.s1;
            matrix_w[shift_w + 2 * (window_in + 1)] = weight.s2;
            matrix_w[shift_w + 3 * (window_in + 1)] = weight.s3;
            //--- reset accumulators for the next batch
            matrix_xg[shift_w] = 0;
            matrix_xg[shift_w + (window_in + 1)] = 0;
            matrix_xg[shift_w + 2 * (window_in + 1)] = 0;
            matrix_xg[shift_w + 3 * (window_in + 1)] = 0;
            matrix_xx[shift_w] = 0;
            matrix_xx[shift_w + (window_in + 1)] = 0;
            matrix_xx[shift_w + 2 * (window_in + 1)] = 0;
            matrix_xx[shift_w + 3 * (window_in + 1)] = 0;
           }
         else
           {
            matrix_xg[shift_w] += xg.s0;
            matrix_xg[shift_w + (window_in + 1)] += xg.s1;
            matrix_xg[shift_w + 2 * (window_in + 1)] += xg.s2;
            matrix_xg[shift_w + 3 * (window_in + 1)] += xg.s3;
            //--- all four xx slots stay equal (see NOTE above)
            matrix_xx[shift_w] = matrix_xx[shift_w + (window_in + 1)] =
                                    matrix_xx[shift_w + 2 * (window_in + 1)] =
                                       matrix_xx[shift_w + 3 * (window_in + 1)] += x2;
           }
         //--- loop increment adds 1 more: 4 filters consumed in total
         out += 3;
        }
      else
        {
         //--- scalar path: one output filter
         float xg = 0;
         float xx = 0;
         int shift_w = i + out * (window_in + 1);
         for(int t = 0; t < total; t++)
           {
            if(i != window_in && (i + t * window_in) >= inputs)
               break;
            xg += matrix_g[t * window_out + out] *
                  (i == window_in ? 1 : matrix_i[i + t * step]);
            xx += (i == window_in ? 1 : matrix_i[i + t * step] * matrix_i[i + t * step]);
           }
         if(update)
           {
            xg = matrix_xg[shift_w] + xg;
            xx = matrix_xx[shift_w] + xx;
            float delta = l * xg / (xx + 1.0e-37f);
            matrix_w[shift_w] = matrix_w[shift_w] + delta;
            matrix_xg[shift_w] = 0;
            matrix_xx[shift_w] = 0;
           }
         else
           {
            matrix_xg[shift_w] += xg;
            matrix_xx[shift_w] += xx;
           }
        }
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Score calculation kernel |
/// Describes the Score calculation process for the Neuron of attention layer
/// (#CNeuronAttentionOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8765#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void AttentionScore(__global float *querys, ///<[in] Matrix of Querys
                             __global float *keys,   ///<[in] Matrix of Keys
                             __global float *score,  ///<[out] Matrix of Scores
                             int dimension,          ///< Dimension of Key
                             int mask                ///< 1 - calc only previous units, 0 - calc all
                            )
  {
//--- One work-item per query row: builds one softmax-normalized score row.
   const int q = get_global_id(0);
   const int units = get_global_size(0);
   const int shift_q = q * dimension;
   const int shift_s = q * units;
//--- Scale by sqrt(dimension) as in scaled dot-product attention; floor at 1.
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- Raw scores exp(Q.K / koef); causally-masked positions are zeroed.
   float sum = 0;
   for(int k = 0; k < units; k++)
     {
      if(mask > 0 && k > q)
        {
         score[shift_s + k] = 0;
         continue;
        }
      const int shift_k = k * dimension;
      float dot_qk = 0;
      for(int i = 0; i < dimension; i++)
         dot_qk += querys[shift_q + i] * keys[shift_k + i];
      float e = IsNaNOrInf(exp(dot_qk / koef), 0);
      score[shift_s + k] = e;
      sum += e;
     }
//--- Normalize the row into a probability distribution when possible.
   if(sum > 0)
      for(int k = 0; k < units; k++)
         score[shift_s + k] /= sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Out calculation kernel
/// Describes the Attention out calculation process for the Neuron of attention
/// layer (#CNeuronAttentionOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8765#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void AttentionOut(__global float *scores, ///<[in] Matrix of Scores
                           __global float *values, ///<[in] Matrix of Values
                           __global float *inputs, ///<[in] Inputs tensor
                           __global float *out     ///<[out] Output tensor
                          )
  {
//--- Weighted sum of Value rows by the score row, plus a residual add of
//--- the original input element.
   const int units = get_global_size(0);
   const int u = get_global_id(0);
   const int d = get_global_id(1);
   const int dimension = get_global_size(1);
   const int shift = u * dimension + d;
//---
   float acc = 0;
   for(int v = 0; v < units; v++)
      acc += IsNaNOrInf(scores[u * units + v], 0) * IsNaNOrInf(values[v * dimension + d], 0);
   out[shift] = IsNaNOrInf(acc, 0) + inputs[shift];
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculation Sum of 2 matrixs with
/// multiplyer.
/// Describes the calculation Sum of 2 matrixs.
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8765#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void SumMatrix(__global float *matrix1,    ///<[in] First matrix
                        __global float *matrix2,    ///<[in] Second matrix
                        __global float *matrix_out, ///<[out] Output matrix
                        int dimension,              ///< Dimension of matrix
                        float multiplyer,           ///< Multiplyer for output
                        int shift_in1,              ///< Shift for input 1
                        int shift_in2,              ///< Shift for input 2
                        int shift_out               ///< Shift for output
                       )
  {
//--- Element-wise (matrix1 + matrix2) * multiplyer for one row of
//--- `dimension` elements; each buffer gets its own per-row extra offset.
//--- (Removed unused local `step` that was never read.)
   const int i = get_global_id(0);
//---
   for(int k = 0; k < dimension; k++)
     {
      int index = i * dimension + k;
      matrix_out[i * shift_out + index] =
         IsNaNOrInf((matrix1[i * shift_in1 + index] + matrix2[i * shift_in2 + index]) * multiplyer, 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculation Sum of 4 matrixs with
/// multiplyer.
/// Describes the calculation Sum of 4 matrixs.
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8909#para53">the link.</A>
//+------------------------------------------------------------------+
__kernel void Sum5Matrix(__global float *matrix1,    ///<[in] First matrix
                         __global float *matrix2,    ///<[in] Second matrix
                         __global float *matrix3,    ///<[in] Third matrix
                         __global float *matrix4,    ///<[in] Fourth matrix
                         __global float *matrix5,    ///<[in] Fifth matrix
                         __global float *matrix_out, ///<[out] Output matrix
                         int dimension,              ///< Dimension of matrix
                         float multiplyer            ///< Multiplyer for output
                        )
  {
//--- Element-wise sum of five matrices, scaled by `multiplyer`.
   const int base = get_global_id(0) * dimension;
//---
   for(int k = 0; k < dimension; k++)
     {
      float total = matrix1[base + k] + matrix2[base + k] + matrix3[base + k] +
                    matrix4[base + k] + matrix5[base + k];
      matrix_out[base + k] = total * multiplyer;
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation
/// kernel
/// Describes the gradients calculation process for the Neuron of attention
/// layer (#CNeuronAttentionOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8765#para44">the link.</A>
/// @param[in] querys Matrix of Querys
/// @param[out] querys_g Matrix of Querys' Gradients
/// @param[in] keys Matrix of Keys
/// @param[out] keys_g Matrix of Keys' Gradients
/// @param[in] values Matrix of Values
/// @param[out] values_g Matrix of Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] gradient Matrix of Gradients from previous iteration
//+------------------------------------------------------------------+
__kernel void AttentionInsideGradients(__global float *querys, __global float *querys_g,
                                       __global float *keys, __global float *keys_g,
                                       __global float *values, __global float *values_g,
                                       __global float *scores, __global float *gradient)
  {
//--- One work-item per (unit u, vector component d). Accumulates the
//--- gradients for the Value, Query and Key matrices of one attention layer.
   int u = get_global_id(0);
   int d = get_global_id(1);
   int units = get_global_size(0);
   int dimension = get_global_size(1);
//--- sqrt(dimension) scaling used in the forward pass, floored at 1.
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   float vg = 0;
   float qg = 0;
   float kg = 0;
//---
   for(int iu = 0; iu < units; iu++)
     {
      float g = gradient[iu * dimension + d];
      float sc = scores[iu * units + u];
      //--- Value gradient: transpose(scores) x gradient.
      vg += sc * g;
      //--- sqg/skg: dot products of value rows with output gradients,
      //--- feeding the softmax derivative terms below.
      float sqg = 0;
      float skg = 0;
      for(int id = 0; id < dimension; id++)
        {
         sqg += values[iu * dimension + id] * gradient[u * dimension + id];
         skg += values[u * dimension + id] * gradient[iu * dimension + id];
        }
      //--- Query gradient: diagonal softmax derivative s*(1-s); the 0.0001f
      //--- floor keeps a small gradient flowing at saturated scores.
      qg += (scores[u * units + iu] == 0 || scores[u * units + iu] == 1
             ? 0.0001f
             : scores[u * units + iu] * (1 - scores[u * units + iu])) *
            sqg * keys[iu * dimension + d] / koef;
      //--- Key gradient: same form with the score matrix transposed.
      kg += (scores[iu * units + u] == 0 || scores[iu * units + u] == 1
             ? 0.0001f
             : scores[iu * units + u] * (1 - scores[iu * units + u])) *
            skg * querys[iu * dimension + d] / koef;
     }
//--- Clamp to [-1, 1] for numerical stability of the backward pass.
   int shift = u * dimension + d;
   values_g[shift] = clamp(IsNaNOrInf(vg, 0.0f), -1.0f, 1.0f);
   querys_g[shift] = clamp(IsNaNOrInf(qg, 0.0f), -1.0f, 1.0f);
   keys_g[shift] = clamp(IsNaNOrInf(kg, 0.0f), -1.0f, 1.0f);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of matrix normalization process
/// Describes the process of matrix normalization.
///\details Detailed description on <A
/// HREF="https://arxiv.org/abs/1607.06450">the link.</A>
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void Normalize(__global float *buffer, int dimension)
  {
//--- Normalizes one row of `dimension` elements in place using Welford's
//--- one-pass mean/variance algorithm.
   int n = get_global_id(0);
   int shift = n * dimension;
   if(dimension < 1)
      return;
//---
   float mean = 0;
   float M2 = 0;
   float variance = 0;
//--- Welford update: numerically stable running mean and sum of squares.
//--- `delta` is float, not double: fp64 (cl_khr_fp64) is deliberately
//--- disabled at the top of this file and double would not compile on
//--- devices without the extension.
   for(int i = 0; i < dimension; i++)
     {
      float val = IsNaNOrInf(buffer[shift + i], 0);
      float delta = val - mean;
      mean += delta / (i + 1);
      M2 += delta * (val - mean);
     }
//--- Sample variance; guard the dimension == 1 case (was a division by 0).
   variance = (dimension > 1 ? M2 / (dimension - 1) : 0);
//--- Only divide by the variance when it would shrink values.
   for(int i = 0; i < dimension; i++)
      if(variance > 1)
         buffer[shift + i] =
            IsNaNOrInf((buffer[shift + i] - mean) / variance, 0);
      else
         buffer[shift + i] =
            IsNaNOrInf(buffer[shift + i] - mean, 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of weights matrix normalization process
/// Describes the process of weights matrix normalization.
///\details Detailed description on <A
/// HREF="https://arxiv.org/abs/1607.06450">the link.</A>
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void NormalizeWeights(__global float *buffer, int dimension)
  {
//--- Rescales one weight row so its RMS norm does not exceed 1. The
//--- pre-scale `k` is grown 10x until the mean-of-squares accumulator fits
//--- into a finite float.
//--- Fixes two defects in the previous version: the inner loop condition
//--- was `!(isnan(sum) || !isinf(sum))`, which is false for any finite sum,
//--- so the loop never executed; and `sum =` overwrote instead of
//--- accumulating — together they made the kernel a no-op.
   int n = get_global_id(0);
   int shift = n * dimension;
   float sum = 0;
   float k = 1;
//---
   do
     {
      sum = 0;   // restart the accumulation for each candidate scale k
      for(int i = 0; (i < dimension && !(isnan(sum) || isinf(sum))); i++)
        {
         float normalized = IsNaNOrInf(buffer[shift + i], 0) / k;
         sum += normalized * normalized / dimension;
        }
      if(isnan(sum) || isinf(sum))
         k *= 10;
     }
   while(isnan(sum) || isinf(sum));
   sum = sqrt(sum);
//--- Only shrink the weights; rows already within the norm are untouched.
   if(k * sum > 1)
      for(int i = 0; i < dimension; i++)
         buffer[shift + i] = IsNaNOrInf(buffer[shift + i], 0) / (k * sum);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff
/// Describes the process of concatenate 4 matrices.
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8909#para52">the link.</A>
/// @param[in] input1, input2, input3, input4 Input buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[out] output Output buffer
//+------------------------------------------------------------------+
__kernel void ConcatenateBuffers(__global float *input1, int window1,
                                 __global float *input2, int window2,
                                 __global float *input3, int window3,
                                 __global float *input4, int window4,
                                 __global float *output)
  {
//--- Concatenates one row from each of four input buffers into a single
//--- output row of width window1 + window2 + window3 + window4.
   const int n = get_global_id(0);
   int out_pos = n * (window1 + window2 + window3 + window4);
//--- Buffer 1
   for(int i = 0; i < window1; i++)
      output[out_pos + i] = IsNaNOrInf(input1[n * window1 + i], 0);
   out_pos += window1;
//--- Buffer 2
   for(int i = 0; i < window2; i++)
      output[out_pos + i] = IsNaNOrInf(input2[n * window2 + i], 0);
   out_pos += window2;
//--- Buffer 3
   for(int i = 0; i < window3; i++)
      output[out_pos + i] = IsNaNOrInf(input3[n * window3 + i], 0);
   out_pos += window3;
//--- Buffer 4
   for(int i = 0; i < window4; i++)
      output[out_pos + i] = IsNaNOrInf(input4[n * window4 + i], 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr
/// Describes the process of deconcatenate matrix.
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/8909#para53">the link.</A>
/// @param[in] output1, output2, output3, output4 Output buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[out] inputs Input buffer
//+------------------------------------------------------------------+
__kernel void DeconcatenateBuffers(__global float *output1, int window1,
                                   __global float *output2, int window2,
                                   __global float *output3, int window3,
                                   __global float *output4, int window4,
                                   __global float *inputs)
  {
//--- Splits one concatenated input row back into up to four head buffers
//--- (inverse of ConcatenateBuffers).
   const int n = get_global_id(0);
   int in_pos = n * (window1 + window2 + window3 + window4);
//--- Head 1
   for(int i = 0; i < window1; i++)
      output1[n * window1 + i] = IsNaNOrInf(inputs[in_pos + i], 0);
   in_pos += window1;
//--- Head 2
   for(int i = 0; i < window2; i++)
      output2[n * window2 + i] = IsNaNOrInf(inputs[in_pos + i], 0);
   in_pos += window2;
//--- Head 3 (optional)
   if(window3 > 0)
      for(int i = 0; i < window3; i++)
         output3[n * window3 + i] = IsNaNOrInf(inputs[in_pos + i], 0);
   in_pos += window3;
//--- Head 4 (optional)
   if(window4 > 0)
      for(int i = 0; i < window4; i++)
         output4[n * window4 + i] = IsNaNOrInf(inputs[in_pos + i], 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Multi-Heads Attention Neuron Score calculation
/// kernel
/// Describes the Score calculation process for the Neuron of multi-heads
/// attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9025#para42">the link.</A>
//+------------------------------------------------------------------+
__kernel void MHAttentionScore(__global float *qkv,   ///<[in] Matrix of Querys, Keys, Values
                               __global float *score, ///<[out] Matrix of Scores
                               int dimension,         ///< Dimension of Key
                               int mask               ///< 1 - calc only previous units, 0 - calc all
                              )
  {
//--- One work-item per (query unit q, head h): builds one softmax score row
//--- from the interleaved QKV tensor.
   int q = get_global_id(0);
   int h = get_global_id(1);
   int units = get_global_size(0);
   int heads = get_global_size(1);
//--- Offsets of the Q row and the score row for this (q, h).
   int shift_q = dimension * (h + 3 * q * heads);
   int shift_s = units * (h + q * heads);
//--- sqrt(dimension) scaling, floored at 1.
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   float sum = 0;
//---
   for(int k = 0; k < units; k++)
     {
      if(mask > 0 && k > q)
        {
         score[shift_s + k] = 0;   // causal mask: future units are ignored
         continue;
        }
      float result = 0;
      int shift_k = dimension * (h + heads * (3 * k + 1));
      //--- Q.K dot product, vectorized by 4 where possible.
      for(int i = 0; i < dimension; i++)
        {
         if((dimension - i) > 4)
           {
            result += dot(IsNaNOrInf4((float4)(qkv[shift_q + i], qkv[shift_q + i + 1],
                                               qkv[shift_q + i + 2], qkv[shift_q + i + 3]), 0),
                          IsNaNOrInf4((float4)(qkv[shift_k + i], qkv[shift_k + i + 1],
                                               qkv[shift_k + i + 2], qkv[shift_k + i + 3]), 0));
            i += 3;
           }
         else
            result += IsNaNOrInf(qkv[shift_q + i] * qkv[shift_k + i], 0);
        }
      result = exp(clamp(result / koef, -100.0f, 100.0f));
      if(isnan(result))
         result = 0;
      score[shift_s + k] = result;
      sum += result;
     }
//--- Softmax normalization: divide whenever the row sum is positive.
//--- (Was `sum > 1`, which left rows with sums in (0, 1] unnormalized and
//--- was inconsistent with the single-head AttentionScore kernel.)
   for(int k = 0; (k < units && sum > 0); k++)
      score[shift_s + k] /= sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Multi-heads Attention Neuron Out calculation kernel
/// Describes the Multi-heads Attention out calculation process for the Neuron
/// of multi-heads attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9025#para42">the link.</A>
//+------------------------------------------------------------------+
__kernel void MHAttentionOut(__global float *scores, ///<[in] Matrix of Scores
                             __global float *qkv,    ///<[in] Matrix of Values
                             __global float *out,    ///<[out] Output tensor
                             int dimension           ///< Dimension of Value
                            )
  {
//--- One work-item per (unit u, head h): out row = score row x Values.
//--- (Removed unused local `layer` that was computed but never read.)
   int u = get_global_id(0);
   int units = get_global_size(0);
   int h = get_global_id(1);
   int heads = get_global_size(1);
//---
   int shift_s = units * (h + heads * u);
   int shift_out = dimension * (h + heads * u);
//---
   for(int d = 0; d < dimension; d++)
     {
      float result = 0;
      for(int v = 0; v < units; v++)
        {
         //--- V element d of unit v / head h in the interleaved QKV tensor.
         int shift_v = dimension * (h + heads * (3 * v + 2)) + d;
         result += scores[shift_s + v] * qkv[shift_v];
        }
      out[shift_out + d] = result;
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation
/// kernel
/// Describes the gradients calculation process for the Neuron of attention
/// layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9025#para33">the link.</A>
/// @param[in] qkv Matrix of Querys, Keys and Values
/// @param[out] qkv_g Matrix of Querys', Keys' and Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] scores_g Matrix of Scores' Gradients
/// @param[in] gradient Matrix of Gradients from previous iteration
/// @param[in] dimension Dimension of Key vector
//+------------------------------------------------------------------+
__kernel void MHAttentionInsideGradients(__global float *qkv, __global float *qkv_g,
                                         __global float *scores,
                                         __global float *gradient)
  {
//--- One work-item per (unit u, head h, component d). Writes the Query,
//--- Key and Value gradients of the interleaved QKV tensor, propagating
//--- through the full softmax Jacobian ((i==j) - s_j).
   size_t u = get_global_id(0);
   size_t h = get_global_id(1);
   size_t d = get_global_id(2);
   size_t units = get_global_size(0);
   size_t heads = get_global_size(1);
   size_t dimension = get_global_size(2);
//--- sqrt(dimension) scaling used in the forward pass, floored at 1.
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- init: offsets of the Q, K, V rows of unit u / head h inside QKV, and
//--- of this unit's row in the output-gradient tensor.
   const int shift_q = dimension * (heads * 3 * u + h);
   const int shift_k = dimension * (heads * (3 * u + 1) + h);
   const int shift_v = dimension * (heads * (3 * u + 2) + h);
   const int shift_g = dimension * (heads * u + h);
   int shift_score = h * units;
   int step_score = units * heads;
//--- Calculating Value's gradients: transpose(scores) x gradient.
   float sum = 0;
//---
   for(int i = 0; i < units; i++)
      sum += gradient[(h + i * heads) * dimension + d] * scores[shift_score + u + i * step_score];
   qkv_g[shift_v + d] = sum;
//--- Calculating Query's gradients.
//--- NOTE(review): qkv[dimension * (heads * (3 * v + 2) + h)] always reads
//--- element 0 of the value row of unit v; a trailing `+ d` looks intended
//--- here and in the Key loop below — confirm against the reference math.
   shift_score = h * units + u * step_score;
   float grad = 0;
   float grad_out = gradient[shift_g + d];
//---
   for(int k = 0; k < units; k++)
     {
      float sc_g = 0;
      float sc = scores[shift_score + k];
      for(int v = 0; v < units; v++)
         sc_g += scores[shift_score + v] * qkv[dimension * (heads * (3 * v + 2) + h)] *
                 grad_out * ((k == v) - sc);
      grad += sc_g / koef * qkv[dimension * (heads * (3 * k + 1) + h) + d];
     }
   qkv_g[shift_q + d] = grad;
//--- Calculating Key's gradients: same Jacobian with the score rows
//--- transposed (unit u as the key index of every query q).
   grad = 0;
//---
   for(int q = 0; q < units; q++)
     {
      shift_score = h * units + q * step_score;
      float sc_g = 0;
      float sc = scores[shift_score + u];
      float grad_out = gradient[dimension * (heads * q + h) + d];
      for(int v = 0; v < units; v++)
         sc_g += scores[shift_score + v] * qkv[dimension * (heads * (3 * v + 2) + h)] *
                 grad_out * ((u == v) - sc);
      grad += sc_g / koef * qkv[dimension * (heads * 3 * q + h) + d];
     }
   qkv_g[shift_k + d] = grad;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_dropout Kernel for Dropout.
/// Describes the dropout method.
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9112#para32">the link.</A>
//+------------------------------------------------------------------+
__kernel void Dropout(__global const float *inputs, ///<[in] Input matrix
                      __global const float *map,    ///<[in] Dropout map matrix
                      __global float *out,          ///<[out] Output matrix
                      const int dimension           ///< Dimension of matrix
                     )
  {
//--- Each work-item handles 4 consecutive elements: out = inputs * map.
//--- Index fix: `k` already iterates absolute element indices starting at
//--- i = gid*4, so buffers are addressed by `k` alone. The previous
//--- `map[i + k]`/`inputs[i + k]`/`out[i + k]` double-counted the offset
//--- and read/wrote out of bounds for every work-item past the first.
   const int i = get_global_id(0) * 4;
//---
   for(int k = i; k < min(dimension, i + 4); k++)
     {
      float m = IsNaNOrInf(map[k], 0.0f);
      if(m == 0)
        {
         out[k] = 0;   // element dropped by the mask
         continue;
        }
      float inp = IsNaNOrInf(inputs[k], 0.0f);
      if(inp == 0)
        {
         out[k] = 0;
         continue;
        }
      out[k] = IsNaNOrInf(inp * m, 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of Batch normalization process
/// Describes the process of Batch normalization. (#CNeuronBatchNormOCL)
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9207#para42">the link.</A>
/// @param[in] inputs Input data tenzor
/// @param[in,out] options Tenzor of variables
/// @param[out] output Tenzor of output data
/// @param[in] batch Batch size
/// @param[in] optimization Optimization type
/// @param[in] activation Activation type
//+------------------------------------------------------------------+
__kernel void BatchFeedForward(__global float *inputs, __global float *options,
                               __global float *output, int batch,
                               int optimization, int activation)
  {
//--- Running batch-normalization forward pass for one neuron.
//--- options row layout: [mean, variance, nx, gamma, betta, momenta...],
//--- 7 slots for SGD (optimization == 0) and 9 for Adam.
   if(batch <= 1)
      return;
   const int n = get_global_id(0);
   const int shift = n * (optimization == 0 ? 7 : 9);
//--- Fold the new sample into the running mean and variance.
   const float inp = IsNaNOrInf(inputs[n], 0);
   const float denom = max((float)batch, 1.0f);
   const float prev_w = max((float)batch - 1.0f, 0.0f);
   const float mean = (IsNaNOrInf(options[shift], 0) * prev_w + inp) / denom;
   const float delt = inp - mean;
   float variance = (IsNaNOrInf(options[shift + 1] * prev_w, 0) + delt * delt) / denom;
   if(variance <= 0)
      variance = 1;
   const float nx = delt / sqrt(variance);
//--- Scale and shift; a degenerate zero gamma is reset to 1.
   float gamma = IsNaNOrInf(options[shift + 3], 1);
   if(gamma == 0)
     {
      options[shift + 3] = 1;
      gamma = 1;
     }
   const float betta = IsNaNOrInf(options[shift + 4], 0);
//--- Persist the updated statistics and emit the activated output.
   options[shift] = mean;
   options[shift + 1] = variance;
   options[shift + 2] = nx;
   output[n] = fActivation(gamma * nx + betta, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_gr
/// Kernel of the Batch neuron to transfer gradient to previous layer
/// (#CNeuronBatchNormOCL)
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9207#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientBatch(__global float *options,   ///<[in] Options matrix m*(7 or 9), where m - Number of neurons in previous layer
                                      __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                      __global float *matrix_i,  ///<[in] Tensor of previous layer output
                                      __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                      int activation,            ///< Activation type (#ENUM_ACTIVATION)
                                      int batch,                 ///< Batch size
                                      int optimization           ///< Optimization type
                                     )
  {
//--- Backpropagates the gradient through the batch-norm transform for one
//--- neuron. (Removed a stray duplicate `;` on the output statement.)
   if(batch <= 1)
      return;
//---
   int n = get_global_id(0);
   int shift = n * (optimization == 0 ? 7 : 9);
//---
   float variance = IsNaNOrInf(options[shift + 1], 1);
//--- gnx: gradient w.r.t. the normalized value (gradient scaled by gamma).
   float inp = IsNaNOrInf(matrix_i[n], 0);
   float gnx = IsNaNOrInf(matrix_g[n], 0) * IsNaNOrInf(options[shift + 3], 1);
   float temp = (variance > 0 ? 1.0f / sqrt(variance) : 0);
   float gmu = (-temp) * gnx;
   float gvar =
      (variance > 0
       ? (IsNaNOrInf(options[shift], 0) * inp) / (2 * pow(variance, 3.0f / 2.0f)) * gnx
       : 0);
   float batch_ratio = max((float)(batch - 1), 0.0f) / max((float)batch, 1.0f);
   float gx = temp * gnx + gmu / max(batch, 1) +
              gvar * 2 * inp / max(batch, 1) * batch_ratio * batch_ratio;
//--- Undo the previous layer's activation.
   matrix_ig[n] = Deactivation(gx, inp, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_opt Batch normalization Neuron SGD optimization Updating
/// options kernel
/// Describes the process of SGD optimization options for the Batch
/// normalization Neuron (#CNeuronBatchNormOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9207#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void UpdateBatchOptionsMomentum(__global float *options,  ///<[in,out] Options matrix m*7, where m - Number of neurons in previous layer
                                         __global float *matrix_g, ///<[in] Tensor of gradients at current layer
                                         float learning_rates,     ///< Learning rates
                                         float momentum            ///< Momentum multiplier
                                        )
  {
//--- SGD-with-momentum update of gamma (slot 3) and betta (slot 4); slots
//--- 5/6 hold their running momenta, slot 2 the normalized input nx.
   const int n = get_global_id(0);
   const int inputs = get_global_size(0);
   const int shift = n * 7;
   const float grad = clamp(IsNaNOrInf(matrix_g[n], 0), -MAX_GRAD, MAX_GRAD);
//--- gamma step: gradient scaled by nx, plus momentum of the previous step.
   float delta_gamma = IsNaNOrInf(learning_rates * grad * IsNaNOrInf(options[shift + 2], 0) +
                                  momentum * IsNaNOrInf(options[shift + 5], 0), 0);
//--- betta step: plain gradient, plus momentum.
   float delta_betta = IsNaNOrInf(learning_rates * grad +
                                  momentum * IsNaNOrInf(options[shift + 6], 0), 0);
//--- Apply both steps with L1/L2 weight decay.
   options[shift + 5] = delta_gamma;
   float value = IsNaNOrInf(options[shift + 3], 1);
   options[shift + 3] = value + delta_gamma - learning_rates * (l1 * sign(value) +
                        l2 * value / inputs);
//---
   options[shift + 6] = delta_betta;
   value = IsNaNOrInf(options[shift + 4], 0);
   options[shift + 4] = value + delta_betta - learning_rates * (l1 * sign(value) +
                        l2 * value / inputs);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_opt Batch normalization Neuron Adam optimization Updating
/// options kernel
/// Describes the process of Adam optimization options for the Batch
/// normalization Neuron (#CNeuronBatchNormOCL).
///\details Detailed description on <A
/// HREF="https://www.mql5.com/ru/articles/9207#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void UpdateBatchOptionsAdam(__global float *options,  ///<[in,out] Options matrix m*9, where m - Number of neurons in previous layer
                                     __global float *matrix_g, ///<[in] Tensor of gradients at current layer
                                     const float l,            ///< Learning rates
                                     const float b1,           ///< First momentum multiplier
                                     const float b2            ///< Second momentum multiplier
                                    )
  {
//--- Adam update of gamma (slot 3) and betta (slot 4); slots 5..8 hold the
//--- first/second momenta of both parameters.
   const int n = get_global_id(0);
   int inputs = get_global_size(0);
   const int shift = n * 9;
   float grad = clamp(IsNaNOrInf(matrix_g[n], 0), -MAX_GRAD, MAX_GRAD);
//---
   float nx = IsNaNOrInf(options[shift + 2], 0);
   float gamma = IsNaNOrInf(options[shift + 3], 1);
   if(gamma == 0)
      gamma = 1;
   float betta = IsNaNOrInf(options[shift + 4], 0);
//---
   float gamma_m1 = IsNaNOrInf(options[shift + 5], 0);
   float betta_m1 = IsNaNOrInf(options[shift + 6], 0);
   float gamma_m2 = IsNaNOrInf(options[shift + 7], 0);
   float betta_m2 = IsNaNOrInf(options[shift + 8], 0);
//--- Momenta: gamma's gradient is scaled by the normalized input nx.
   float2 mt = b1 * (float2)(gamma_m1, betta_m1) +
               (1 - b1) * (float2)(grad * nx, grad);
   float2 grad2 = (float2)(grad * nx, grad);
   float2 vt = b2 * (float2)(gamma_m2, betta_m2) +
               (1 - b2) * (grad2 * grad2);
   vt.s0 = IsNaNOrInf(vt.s0, 1);
   vt.s1 = IsNaNOrInf(vt.s1, 1);
   float2 delta = l * mt / sqrt(vt);
   delta.s0 = IsNaNOrInf(delta.s0, 0);
//--- Bug fix: sanitize betta's own delta. The previous code assigned
//--- IsNaNOrInf(delta.s0, 0) here, so betta silently reused gamma's step.
   delta.s1 = IsNaNOrInf(delta.s1, 0);
   float2 weight = delta -
                   (l1 * sign((float2)(gamma, betta)) +
                    l2 * (float2)(gamma, betta) / inputs);
//---
   options[shift + 3] = IsNaNOrInf(gamma + weight.s0, 1);
   options[shift + 4] = IsNaNOrInf(betta + weight.s1, 0);
   options[shift + 5] = IsNaNOrInf(mt.s0, 0);
   options[shift + 6] = IsNaNOrInf(mt.s1, 0);
   options[shift + 7] = vt.s0;
   options[shift + 8] = vt.s1;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void VAE_FeedForward(__global float *inputs, __global float *random,
                              __global float *outputs)
  {
//--- Reparameterization trick: z = mu + exp(log_var / 2) * eps, where the
//--- inputs buffer holds [mu (first half) | log-variance (second half)].
   uint i = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   float mu = IsNaNOrInf(inputs[i], 0);
   float sigma = IsNaNOrInf(exp(0.5f * inputs[i + total]), 0);
   outputs[i] = mu + sigma * random[i];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void VAE_CalcHiddenGradient(__global float *inputs,
                                     __global float *inp_grad,
                                     __global float *random,
                                     __global float *gradient,
                                     const float kld_mult)
  {
//--- Backprop through the VAE reparameterization plus the KL-divergence
//--- regularizer. inputs holds [mu | log-variance].
   uint i = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   float log_var = IsNaNOrInf(inputs[i + total], 0);
   float mean = IsNaNOrInf(inputs[i], 0);
   float var = exp(log_var);
//--- KL(q || N(0,1)) contribution scaled by kld_mult.
   float kld = kld_mult * 0.5f * (log_var - var - mean * mean + 1);
   float grad = clamp(IsNaNOrInf(gradient[i], 0), -MAX_GRAD, MAX_GRAD);
//--- Gradient for mu (first half) and for log-variance (second half).
   inp_grad[i] = IsNaNOrInf(grad / exp(0.5f * log_var) + kld * mean, 0);
   inp_grad[i + total] = IsNaNOrInf(0.5f * (grad * random[i] - kld * (1 - var)), 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LSTM_FeedForward(__global const float *inputs, int inputs_size,
                               __global const float *weights,
                               __global float *concatenated,
                               __global float *memory, __global float *output)
  {
//--- LSTM forward step. One work-item per (hidden unit id, gate id2 in
//--- 0..3, variable idv); the gate-0 thread then combines all four gate
//--- activations into the new memory cell and output.
   uint id = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   uint id2 = (uint)get_local_id(1);
   uint idv = (uint)get_global_id(2);
   uint total_v = (uint)get_global_size(2);
//---
   __local float Temp[4];
//---
   float sum = 0;
   uint shift_in = idv * inputs_size;
   uint shift_out = idv * total;
   uint shift = (inputs_size + total + 1) * (id2 + id);
//--- Recurrent part: previous output x weights, vectorized by 4.
   for(uint i = 0; i < total; i += 4)
     {
      if(total - i > 4)
         sum += IsNaNOrInf(
                   dot((float4)(output[shift_out + i], output[shift_out + i + 1], output[shift_out + i + 2], output[shift_out + i + 3]),
                       (float4)(weights[shift + i], weights[shift + i + 1],
                                weights[shift + i + 2], weights[shift + i + 3])), 0);
      else
         for(uint k = i; k < total; k++)
            sum += IsNaNOrInf(output[shift_out + k] * weights[shift + k], 0);
     }
//---
   shift += total;
//--- Input part. Bug fix: both the vector-path test and the scalar
//--- fallback bound must use inputs_size; the previous code compared and
//--- looped against `total`, mis-reading whenever inputs_size != total.
   for(uint i = 0; i < inputs_size; i += 4)
     {
      if(inputs_size - i > 4)
         sum += IsNaNOrInf(
                   dot((float4)(inputs[shift_in + i], inputs[shift_in + i + 1], inputs[shift_in + i + 2], inputs[shift_in + i + 3]),
                       (float4)(weights[shift + i], weights[shift + i + 1],
                                weights[shift + i + 2], weights[shift + i + 3])), 0);
      else
         for(uint k = i; k < inputs_size; k++)
            sum += IsNaNOrInf(inputs[shift_in + k] * weights[shift + k], 0);
     }
//--- Bias term.
   sum += IsNaNOrInf(weights[shift + inputs_size], 0);
//--- Gates 0..2 (forget/input/output) use sigmoid, gate 3 (new content) tanh.
   if(id2 < 3)
      sum = fActivation(sum, 1);
   else
      sum = fActivation(sum, 0);
   Temp[id2] = sum;
   concatenated[4 * shift_out + id2 * total + id] = sum;
//---
   BarrierLoc
   if(id2 == 0)
     {
      //--- Save the previous memory cell, then c = c*f + i*n, out = o*tanh(c).
      float mem = memory[shift_out + id + total_v * total] = memory[shift_out + id];
      float fg = Temp[0];
      float ig = Temp[1];
      float og = Temp[2];
      float nc = Temp[3];
      //---
      memory[shift_out + id] = mem = IsNaNOrInf(mem * fg + ig * nc, 0);
      output[shift_out + id] = IsNaNOrInf(og * fActivation(mem, 0), 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LSTM_ConcatenatedGradient(__global float *gradient,
                                        __global float *concatenated_gradient,
                                        __global float *memory,
                                        __global float *concatenated)
  {
//--- Distributes the output gradient onto the four concatenated LSTM gate
//--- planes: [forget | input | output | new content].
   uint id = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   uint idv = (uint)get_global_id(1);
   uint total_v = (uint)get_global_size(1);
//---
   uint shift_out = idv * total;
   uint base = 4 * shift_out + id;
   float grad_out = gradient[shift_out + id];
   float t = tanh(memory[shift_out + id]);
//--- Output gate: dL/d_og = grad * tanh(c).
   concatenated_gradient[base + 2 * total] = grad_out * t;
//--- Memory-cell gradient through out = og * tanh(c).
   float memory_gradient = grad_out * concatenated[base + 2 * total] * (1 - t * t);
//--- New content: dL/d_nc = dc * input_gate.
   concatenated_gradient[base + 3 * total] =
      memory_gradient * concatenated[base + total];
//--- Input gate: dL/d_ig = dc * new_content.
   concatenated_gradient[base + total] =
      memory_gradient * concatenated[base + 3 * total];
//--- Forget gate: dL/d_fg = dc * previous memory (saved in second plane).
   concatenated_gradient[base] =
      memory_gradient * memory[shift_out + id + total_v * total];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LSTM_HiddenGradient(__global float *concatenated_gradient, __global float *inputs_gradient,
                                  __global float *weights_gradient, __global float *hidden_state,
                                  __global float *inputs, __global float *weights, __global float *output,
                                  const int hidden_size, const int inputs_size)
  {
//--- Backpropagates the concatenated gate gradients to the layer inputs and
//--- accumulates weight/bias gradients, reducing across the variable axis
//--- (idv) through a LOCAL_ARRAY_SIZE-wide local-memory scratch buffer.
   uint id = get_global_id(0);
   uint total = get_global_size(0);
   uint idv = (uint)get_global_id(1);
   uint total_v = (uint)get_global_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   uint ls = min(total_v, (uint)LOCAL_ARRAY_SIZE);
//---
   uint shift_in = idv * inputs_size;
   uint shift_out = idv * total;
   uint weights_step = hidden_size + inputs_size + 1;
//--- Pass 1: every hidden/input column i of the weight matrix.
   for(int i = id; i < (hidden_size + inputs_size); i += total)
     {
      float inp = 0;
      if(i < hidden_size)
        {
         //--- Hidden part: roll the hidden state forward to the new output.
         inp = hidden_state[shift_out + i];
         hidden_state[shift_out + i] = output[shift_out + i];
        }
      else
        {
         //--- Input part: chain the gate gradients through the weights.
         //--- Gates 0..2 are sigmoid (derivative g*(1-g)), gate 3 is tanh
         //--- (derivative 1-g^2).
         inp = inputs[shift_in + i - hidden_size];
         float grad = 0;
         for(uint g = 0; g < 3 * hidden_size; g++)
           {
            float temp = concatenated_gradient[4 * shift_out + g];
            grad += temp * (1 - temp) * weights[i + g * weights_step];
           }
         for(uint g = 3 * hidden_size; g < 4 * hidden_size; g++)
           {
            float temp = concatenated_gradient[4 * shift_out + g];
            grad += temp * (1 - temp * temp) * weights[i + g * weights_step];
           }
         inputs_gradient[shift_in + i - hidden_size] = grad;
        }
      //--- Weight gradients for column i: sum over the variable axis in
      //--- ls-wide batches through local memory (sigmoid gates first).
      for(uint g = 0; g < 3 * hidden_size; g++)
        {
         float temp = concatenated_gradient[4 * shift_out + g];
         if(idv < ls)
            Temp[idv % ls] = 0;
         BarrierLoc
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp) * inp;
            BarrierLoc
           }
         if(idv == 0)
           {
            temp = Temp[0];
            for(int v = 1; v < ls; v++)
               temp += Temp[v];
            weights_gradient[i + g * weights_step] = temp;
           }
         BarrierLoc
        }
      //--- Same reduction for the tanh (new content) gate rows.
      for(uint g = 3 * hidden_size; g < 4 * hidden_size; g++)
        {
         float temp = concatenated_gradient[4 * shift_out + g];
         if(idv < ls)
            Temp[idv % ls] = 0;
         BarrierLoc
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp * temp) * inp;
            BarrierLoc
           }
         if(idv == 0)
           {
            temp = Temp[0];
            for(int v = 1; v < ls; v++)
               temp += Temp[v];
            weights_gradient[i + g * weights_step] = temp;
           }
         BarrierLoc
        }
     }
//--- Pass 2: bias gradients per gate row, reduced the same way and written
//--- to the last column of each row ((i + 1) * weights_step).
   for(int i = id; i < 4 * hidden_size; i += total)
     {
      if(idv < ls)
         Temp[idv % ls] = 0;
      BarrierLoc
      float temp = concatenated_gradient[4 * shift_out + i];
      if(i < 3 * hidden_size)
        {
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp);
            BarrierLoc
           }
        }
      else
        {
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += 1 - temp * temp;
            BarrierLoc
           }
        }
      if(idv == 0)
        {
         temp = Temp[0];
         for(int v = 1; v < ls; v++)
            temp += Temp[v];
         weights_gradient[(i + 1) * weights_step] = temp;
        }
      BarrierLoc
     }
  }
//+------------------------------------------------------------------+
///\ingroup LSTM_opt LSTM Adam Updating Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Neuron LSTM
/// (#CNeuronLSTMOCL).
//+------------------------------------------------------------------+
__kernel void LSTM_UpdateWeightsAdam(__global float *weights, ///<[in,out] Weights matrix (m+1)*n, where m -
                                     ///< number of neurons in previous layer and n -
                                     ///< number of neurons in current layer
                                     __global float
                                     *weights_gradient, ///<[in] Tensor of gradients at current layer
                                     __global float *matrix_m, ///<[in,out] Matrix of first momentum
                                     __global float *matrix_v, ///<[in,out] Matrix of second momentum
                                     const float l,  ///< Learning rates
                                     const float b1, ///< First momentum multiplier
                                     const float b2  ///< Second momentum multiplier
                                    )
  {
//--- One Adam optimization step for a single LSTM weight.
   const uint id = get_global_id(0);
   const uint total = get_global_size(0);
   const uint id1 = get_global_id(1);
   const uint wi = id1 * total + id;
//--- Clip the incoming gradient and update both momenta.
   const float g = clamp(IsNaNOrInf(weights_gradient[wi], 0), -MAX_GRAD, MAX_GRAD);
   const float mt = b1 * IsNaNOrInf(matrix_m[wi], 0) + (1 - b1) * g;
   const float vt = b2 * IsNaNOrInf(matrix_v[wi], 1) + (1 - b2) * (g * g);
//--- Weight step with L1/L2 regularization; epsilon avoids division by 0.
   const float w = IsNaNOrInf(weights[wi], 0);
   const float step = l * (mt / (sqrt(vt) + 1.0e-37f) -
                           (l1 * sign(w) + l2 * w / total));
   weights[wi] = IsNaNOrInf(w + step, 0);
   matrix_m[wi] = mt;
   matrix_v[wi] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SoftMax_FeedForward(__global float *inputs,
                                  __global float *outputs)
  {
//--- one work group per SoftMax head (dim 1), one item per element (dim 0)
   const uint n = (uint)get_local_size(0);
   const uint el = (uint)get_local_id(0);
   const uint head = (uint)get_global_id(1);
//--- scratch buffer for the work-group reduction inside LocalSoftMax
   __local float Temp[LOCAL_ARRAY_SIZE];
   const uint base = head * n;
//--- sanitize the input and normalize across the work group
   float in_val = IsNaNOrInf(inputs[base + el], MIN_VALUE);
   outputs[base + el] = LocalSoftMax(in_val, 0, Temp);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SoftMax_HiddenGradient(__global float *outputs,
                                     __global float *output_gr,
                                     __global float *input_gr)
  {
//--- one work group per head (dim 1), one item per element (dim 0)
   const size_t el = get_local_id(0);
   const size_t n = get_local_size(0);
   const size_t head = get_global_id(1);
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- sanitize the forward output and the incoming gradient
   const uint base = head * n;
   float out_val = IsNaNOrInf(outputs[base + el], 0);
   float out_grad = IsNaNOrInf(output_gr[base + el], 0);
//--- propagate the gradient through the SoftMax Jacobian
   input_gr[base + el] = LocalSoftMaxGrad(out_val, out_grad, 0, Temp);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SoftMax_OutputGradient(__global float *outputs,
                                     __global float *targets,
                                     __global float *output_gr)
  {
//--- cross-entropy-style gradient target/output, guarded against zero output
   const size_t i = get_global_id(0);
   float out_val = outputs[i];
   float grad = 0;
   if(out_val != 0)
      grad = targets[i] / out_val;
   output_gr[i] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FQF_Cosine(__global float *softmax, __global float *output)
  {
//--- one work item per quantile (dim 0) and per action (dim 1)
   const size_t i = get_global_id(0);
   const size_t total = get_global_size(0);
   const size_t action = get_global_id(1);
   const int shift = action * total;
//--- midpoint of the cumulative probability mass up to quantile i
   float tau = 0;
   for(int k = 0; k < i; k++)
      tau += softmax[shift + k];
   tau += softmax[shift + i] / 2.0f;
//--- cosine embedding of the quantile fraction
   output[shift + i] = cos(i * M_PI_F * tau);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FQF_Output(__global float *quantiles, __global float *delta_taus,
                         __global float *output, int total)
  {
//--- one work item per action: expectation over the quantile distribution
   const size_t action = get_global_id(0);
   const int shift = action * total;
//--- weighted sum of quantile values by their probability mass
   float expected = 0;
   for(int q = 0; q < total; q++)
      expected += quantiles[shift + q] * delta_taus[shift + q];
   output[action] = expected;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FQF_OutputGradient(__global float *quantiles,
                                 __global float *delta_taus,
                                 __global float *output_gr,
                                 __global float *quantiles_gr,
                                 __global float *taus_gr)
  {
//--- one work item per quantile (dim 0) and per action (dim 1)
   const size_t q = get_global_id(0);
   const size_t total = get_global_size(0);
   const size_t action = get_global_id(1);
   const int idx = action * total + q;
//--- product rule for output = sum(quantile * delta_tau)
   const float grad = output_gr[action];
   quantiles_gr[idx] = grad * delta_taus[idx];
   taus_gr[idx] = grad * quantiles[idx];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FQF_QuantileGradient(__global float *state_embeding,
                                   __global float *taus_embeding,
                                   __global float *quantiles_gr,
                                   __global float *state_gr,
                                   __global float *taus_gr)
  {
//--- one work item per element (dim 0) and per action (dim 1)
   const size_t e = get_global_id(0);
   const size_t total = get_global_size(0);
   const size_t action = get_global_id(1);
   const int idx = action * total + e;
//--- quantile = state_embedding * taus_embedding: apply the product rule
   const float grad = quantiles_gr[idx];
   state_gr[idx] = grad * taus_embeding[idx];
   taus_gr[idx] = grad * state_embeding[idx];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FQF_CosineGradient(__global float *softmax,
                                 __global float *output_gr,
                                 __global float *softmax_gr)
  {
// Backpropagates the gradient through the cosine quantile embedding
// (FQF_Cosine): output[j] = cos(j * pi * tau_j), where tau_j is the
// cumulative softmax mass before j plus half of element j. Element i
// influences tau_j for every j >= i, so its gradient sums contributions
// from its own quantile and all later ones of the same action.
   size_t i = get_global_id(0);
   size_t total = get_global_size(0);
   size_t action = get_global_id(1);
   int shift = action * total;
//--- cumulative softmax mass of the quantiles before i
   float cumul = 0;
//---
   for(int it = 0; it < i; it++)
      cumul += softmax[shift + it];
//--- own quantile: derivative of cos(i*pi*tau_i) w.r.t. this element
   float result = -M_PI_F * i *
                  sin(M_PI_F * i * (cumul + softmax[shift + i] / 2)) *
                  output_gr[shift + i];
//--- later quantiles: element i shifts every subsequent tau_it;
//--- the contribution is scaled by softmax[it]/tau_it (division by the
//--- running midpoint 'temp')
   for(int it = i + 1; it < total; it++)
     {
      cumul += softmax[shift + it - 1];
      float temp = cumul + softmax[shift + it] / 2;
      result += -M_PI_F * it * sin(M_PI_F * it * temp) * output_gr[shift + it] *
                softmax[shift + it] / temp;
     }
//--- accumulated (+=), not assigned: callers must clear the buffer first
   softmax_gr[shift + i] += result;
  }
//+------------------------------------------------------------------+
//| Sparse Attention |
//+------------------------------------------------------------------+
__kernel void MHSparseAttentionScore(__global float *qkv,   ///<[in] Matrix of Querys, Keys, Values
                                     __global float *score, ///<[out] Matrix of Scores
                                     int dimension,         ///< Dimension of Key
                                     float sparse           ///< less than 1.0 coefficient of sparse
                                    )
  {
// Computes one row of the multi-head attention score matrix and keeps only
// the highest-scoring keys (sparse attention): scores below an iteratively
// raised threshold are zeroed before the SoftMax-style normalization.
   int q = get_global_id(0);       // query index
   int h = get_global_id(1);       // head index
   int units = get_global_size(0); // sequence length
   int heads = get_global_size(1); // number of heads
//--- offsets of this query vector inside the packed QKV tensor and of its score row
   int shift_q = dimension * (h + 3 * q * heads);
   int shift_s = units * (h + q * heads);
//--- number of scores to keep: fraction 'sparse' of the row, but at least 3
   int active_units = (int)max((float)(units * sparse), min((float)units, 3.0f));
//--- scaling factor sqrt(d), clamped to at least 1
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   float sum = 0.0f;
   float min_s = 0.0f;
   float max_s = 0.0f;
//--- raw dot products Q*K for every key, tracking the row minimum and maximum
   for(int k = 0; k < units; k++)
     {
      float result = 0;
      int shift_k = dimension * (h + heads * (3 * k + 1));
      for(int i = 0; i < dimension; i++)
        {
         // vectorized dot product in chunks of 4 with a scalar tail
         if((dimension - i) > 4)
           {
            result += dot((float4)(qkv[shift_q + i], qkv[shift_q + i + 1],
                                   qkv[shift_q + i + 2], qkv[shift_q + i + 3]),
                          (float4)(qkv[shift_k + i], qkv[shift_k + i + 1],
                                   qkv[shift_k + i + 2], qkv[shift_k + i + 3]));
            i += 3;
           }
         else
            result += (qkv[shift_q + i] * qkv[shift_k + i]);
        }
      score[shift_s + k] = result;
      if(k == 0)
         min_s = max_s = result;
      else
        {
         max_s = max(max_s, result);
         min_s = min(min_s, result);
        }
     }
//--- raise the pruning threshold min_s step by step (to the smallest
//--- surviving score above it) until at most 'active_units' scores remain
   int count = units;
//---
   while(count > active_units && min_s < max_s)
     {
      count = 0;
      float temp = max_s;
      for(int k = 0; k < units; k++)
        {
         float value = score[shift_s + k];
         if(value < min_s)
            continue;
         count++;
         if(value < temp && value > min_s)
            temp = value;
        }
      if(count > active_units)
         min_s = temp;
     }
//--- avoid division by zero when scaling by the row maximum below
   if(max_s == 0.0f)
      max_s = 1.0f;
//--- exponentiate the surviving scores (scaled by row max and sqrt(d)),
//--- zero the pruned ones, and accumulate the normalizer
   for(int k = 0; k < units; k++)
     {
      float value = score[shift_s + k];
      if(value < min_s)
        {
         score[shift_s + k] = 0.0f;
         continue;
        }
      value = exp(value / max_s / koef);
      score[shift_s + k] = value;
      sum += value;
     }
//--- normalize to a probability distribution (skipped when sum <= 1)
   for(int k = 0; (k < units && sum > 1); k++)
     {
      float temp = score[shift_s + k];
      if(temp == 0.0f)
         continue;
      score[shift_s + k] = temp / sum;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHSparseAttentionOut(__global float *scores, ///<[in] Matrix of Scores
                                   __global float *qkv,    ///<[in] Matrix of Values
                                   __global float *out,    ///<[out] Output tensor
                                   int dimension           ///< Dimension of Value
                                  )
  {
//--- one work item per sequence unit (dim 0) and per attention head (dim 1)
   const int u = get_global_id(0);
   const int units = get_global_size(0);
   const int h = get_global_id(1);
   const int heads = get_global_size(1);
//--- offsets of this unit's score row and output vector
   const int shift_s = units * (h + heads * u);
   const int shift_out = dimension * (h + heads * u);
//--- weighted sum of Value vectors; entries pruned by sparsity (score == 0)
//--- are skipped entirely
   for(int d = 0; d < dimension; d++)
     {
      float sum = 0;
      for(int v = 0; v < units; v++)
        {
         float s = scores[shift_s + v];
         if(s != 0)
           {
            int value_idx = dimension * (h + heads * (3 * v + 2)) + d;
            sum += s * qkv[value_idx];
           }
        }
      out[shift_out + d] = sum;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FeedForwardMultiModels(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer)
                                     __global float *matrix_i, ///<[in] Inputs tensor
                                     __global float *matrix_o, ///<[out] Output tensor
                                     int inputs,    ///< Number of inputs
                                     int activation ///< Activation type (#ENUM_ACTIVATION)
                                    )
  {
// Fully connected forward pass for several independent models stored in one
// set of buffers: dim 0 - output neuron, dim 1 - model index. Each model
// owns a contiguous slice of inputs, weights and outputs.
   int i = get_global_id(0);
   int outputs = get_global_size(0);
   int m = get_global_id(1);
   int models = get_global_size(1);
//--- offsets of this neuron's weight row and of the model's input/output slices
   float sum = 0;
   float4 inp, weight;
   int shift = (inputs + 1) * (i + outputs * m);
   int shift_in = inputs * m;
   int shift_out = outputs * m;
//--- dot product in chunks of 4; the bias weight (index 'inputs') is handled
//--- by padding the tail with a constant input of 1
   for(int k = 0; k <= inputs; k = k + 4)
     {
      switch(inputs - k)
        {
         case 0:   // only the bias weight remains
            inp = (float4)(1, 0, 0, 0);
            weight = (float4)(matrix_w[shift + k], 0, 0, 0);
            break;
         case 1:   // one input plus the bias
            inp = (float4)(matrix_i[shift_in + k], 1, 0, 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
            break;
         case 2:   // two inputs plus the bias
            inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], 1, 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], 0);
            break;
         case 3:   // three inputs plus the bias
            inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1],
                           matrix_i[shift_in + k + 2], 1);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
            break;
         default:  // full chunk of four inputs
            inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1],
                           matrix_i[shift_in + k + 2], matrix_i[shift_in + k + 3]);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
            break;
        }
      // skip partial sums that would poison the accumulator with NaN
      float d = dot(inp, weight);
      if(isnan(sum + d))
         continue;
      sum += d;
     }
   if(isnan(sum))
      sum = 0;
//--- apply the requested activation function
   matrix_o[shift_out + i] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientMultiModels(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number
                                            ///< of neurons in previous layer and n - number
                                            ///< of neurons in current layer
                                            __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                            __global float *matrix_o,  ///<[in] Previous layer Output tensor
                                            __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                            int outputs,    ///< Number of outputs
                                            int activation, ///< Activation type (#ENUM_ACTIVATION),
                                            int model)
  {
// Backpropagates gradients to the previous layer for the multi-model
// fully connected layer: dim 0 - input neuron, dim 1 - model index.
// When 'model' >= 0 only that model receives gradients; all other models'
// input gradients are zeroed.
   int i = get_global_id(0);
   int inputs = get_global_size(0);
   int m = get_global_id(1);
   int models = get_global_size(1);
//--- selective training: zero out models other than the requested one
   int shift_in = inputs * m;
   if(model >= 0 && model != m)
     {
      matrix_ig[shift_in + i] = 0;
      return;
     }
//--- offsets of this model's gradient slice and weight block
   int shift_out = outputs * m;
   int shift_w = (inputs + 1) * outputs * m;
   float sum = 0;
   float out = matrix_o[shift_in + i];
   float4 grad, weight;
//--- dot product of the gradient vector with this input's weight column,
//--- vectorized in chunks of 4 with explicit tail cases
   for(int k = 0; k < outputs; k += 4)
     {
      switch(outputs - k)
        {
         case 1:
            weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], 0, 0, 0);
            grad = (float4)(matrix_g[shift_out + k], 0, 0, 0);
            break;
         case 2:
            grad =
               (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], 0, 0);
            weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i],
                              matrix_w[shift_w + (k + 1) * (inputs + 1) + i], 0, 0);
            break;
         case 3:
            grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1],
                            matrix_g[shift_out + k + 2], 0);
            weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i],
                              matrix_w[shift_w + (k + 1) * (inputs + 1) + i],
                              matrix_w[shift_w + (k + 2) * (inputs + 1) + i], 0);
            break;
         default:
            grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1],
                            matrix_g[shift_out + k + 2], matrix_g[shift_out + k + 3]);
            weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i],
                              matrix_w[shift_w + (k + 1) * (inputs + 1) + i],
                              matrix_w[shift_w + (k + 2) * (inputs + 1) + i],
                              matrix_w[shift_w + (k + 3) * (inputs + 1) + i]);
            break;
        }
      sum += dot(grad, weight);
     }
//--- multiply by the derivative of the previous layer's activation
   matrix_ig[shift_in + i] = Deactivation(sum, out, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdamMultiModels(
   __global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
   ///< number of neurons in previous layer and n -
   ///< number of neurons in current layer
   __global const float
   *matrix_g, ///<[in] Tensor of gradients at current layer
   __global const float *matrix_i, ///<[in] Inputs tensor
   __global float *matrix_m,       ///<[in,out] Matrix of first momentum
   __global float *matrix_v,       ///<[in,out] Matrix of second momentum
   const int inputs, ///< Number of inputs
   const float l,    ///< Learning rates
   const float b1,   ///< First momentum multiplier
   const float b2,   ///< Second momentum multiplier
   const int model)
  {
// Adam weight update for one selected model of the multi-model layer:
// dim 0 - output neuron, dim 1 - chunk of four weights in that neuron's row.
   const int outputs = get_global_size(0);
   const int i = get_global_id(0);
   const int j = get_global_id(1);
//--- start of this work item's 4-weight chunk inside the model's weight block
   const int wi = (i + outputs * model) * (inputs + 1) + j * 4;
   float4 m, v, weight, inp;
   int shift_in = j * 4 + inputs * model;
   if((inputs + 1 - j * 4) < 0)
      return;
//--- load up to four weights/moments; the tail pads the input with the
//--- constant 1 used for the bias weight
//--- NOTE(review): the case labels here treat label==0 as "bias only",
//--- mirroring FeedForwardMultiModels where the label is (inputs - k); but
//--- this switch uses (inputs + 1 - j*4), which is one larger - confirm the
//--- host launch geometry makes the tail cases line up with the row end
   switch(inputs + 1 - j * 4)
     {
      case 0:
         inp = (float4)(1, 0, 0, 0);
         weight = (float4)(matrix_w[wi], 0, 0, 0);
         m = (float4)(matrix_m[wi], 0, 0, 0);
         v = (float4)(matrix_v[wi], 0, 0, 0);
         break;
      case 1:
         inp = (float4)(matrix_i[shift_in], 1, 0, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], 0, 0);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], 0, 0);
         break;
      case 2:
         inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], 1, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], 0);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], 0);
         break;
      case 3:
         inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1],
                        matrix_i[shift_in + 2], 1);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2],
                           matrix_w[wi + 3]);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2],
                      matrix_m[wi + 3]);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2],
                      matrix_v[wi + 3]);
         break;
      default:
         inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1],
                        matrix_i[shift_in + 2], matrix_i[shift_in + 3]);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2],
                           matrix_w[wi + 3]);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2],
                      matrix_m[wi + 3]);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2],
                      matrix_v[wi + 3]);
         break;
     }
//--- NOTE(review): the gradient stride (outputs + 1) disagrees with
//--- CalcHiddenGradientMultiModels, which indexes gradients as
//--- outputs * model + neuron - verify which layout the host buffers use
   float4 g = (float4)(matrix_g[(outputs + 1) * model + i]) * inp;
   float4 mt = b1 * m + (1 - b1) * g;
   float4 vt = b2 * v + (1 - b2) * (g * g);
//--- Adam step with L1/L2 regularization
   float4 delta =
      l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
//--- intentional fallthrough: write back only the lanes that exist in the row
   switch(min(inputs + 1 - j * 4, 3))
     {
      case 3:
         if(fabs(delta.s3) > 0)
            matrix_w[wi + 3] = matrix_w[wi + 3] + delta.s3;
         matrix_m[wi + 3] = mt.s3;
         matrix_v[wi + 3] = vt.s3;
      case 2:
         if(fabs(delta.s2) > 0)
            matrix_w[wi + 2] = matrix_w[wi + 2] + delta.s2;
         matrix_m[wi + 2] = mt.s2;
         matrix_v[wi + 2] = vt.s2;
      case 1:
         if(fabs(delta.s1) > 0)
            matrix_w[wi + 1] = matrix_w[wi + 1] + delta.s1;
         matrix_m[wi + 1] = mt.s1;
         matrix_v[wi + 1] = vt.s1;
      case 0:
         if(fabs(delta.s0) > 0)
            matrix_w[wi] = matrix_w[wi] + delta.s0;
         matrix_m[wi] = mt.s0;
         matrix_v[wi] = vt.s0;
         break;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void Concat_FeedForward(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number
                                 ///< of neurons in layer and n - number of outputs
                                 ///< (neurons in next layer)
                                 __global float *matrix_i1, ///<[in] Inputs 1 tensor
                                 __global float *matrix_i2, ///<[in] Inputs 2 tensor
                                 __global float *matrix_o,  ///<[out] Output tensor
                                 int inputs1,   ///< Number of inputs
                                 int inputs2,   ///< Number of inputs
                                 int activation ///< Activation type (#ENUM_ACTIVATION)
                                )
  {
// Fully connected forward pass over the concatenation of two input tensors:
// one work item per output neuron. Each weight row holds inputs1 weights for
// the first tensor, inputs2 for the second, and a trailing bias weight.
   int i = get_global_id(0);
   float sum = 0;
   float4 inp, weight;
   int shift = (inputs1 + inputs2 + 1) * i;
//--- first input tensor, vectorized in chunks of 4 with explicit tail cases
   for(int k = 0; k < inputs1; k += 4)
     {
      switch(inputs1 - k)
        {
         case 1:
            inp = (float4)(matrix_i1[k], 0, 0, 0);
            weight = (float4)(matrix_w[shift + k], 0, 0, 0);
            break;
         case 2:
            inp = (float4)(matrix_i1[k], matrix_i1[k + 1], 0, 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
            break;
         case 3:
            inp = (float4)(matrix_i1[k], matrix_i1[k + 1], matrix_i1[k + 2], 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], 0);
            break;
         default:
            inp = (float4)(matrix_i1[k], matrix_i1[k + 1], matrix_i1[k + 2],
                           matrix_i1[k + 3]);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
            break;
        }
      // skip partial sums that would poison the accumulator with NaN
      float d = dot(inp, weight);
      if(isnan(sum + d))
         continue;
      sum += d;
     }
//--- advance past the first tensor's weights
   shift += inputs1;
//--- second input tensor
   for(int k = 0; k < inputs2; k += 4)
     {
      switch(inputs2 - k)
        {
         case 1:
            inp = (float4)(matrix_i2[k], 0, 0, 0);
            weight = (float4)(matrix_w[shift + k], 0, 0, 0);
            break;
         case 2:
            inp = (float4)(matrix_i2[k], matrix_i2[k + 1], 0, 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
            break;
         case 3:
            inp = (float4)(matrix_i2[k], matrix_i2[k + 1], matrix_i2[k + 2], 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], 0);
            break;
         default:
            inp = (float4)(matrix_i2[k], matrix_i2[k + 1], matrix_i2[k + 2],
                           matrix_i2[k + 3]);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1],
                              matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
            break;
        }
      float d = dot(inp, weight);
      if(isnan(sum + d))
         continue;
      sum += d;
     }
//--- bias weight (last element of the row)
   sum += matrix_w[shift + inputs2];
//---
   if(isnan(sum))
      sum = 0;
//--- apply the requested activation (fixed: stray empty statement ';;' removed)
   matrix_o[i] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void Concat_HiddenGradient(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                    __global float *matrix_g,   ///<[in] Tensor of gradients at current layer
                                    __global float *matrix_o1,  ///<[in] Previous layer Output tensor
                                    __global float *matrix_o2,  ///<[in] Previous layer Output tensor
                                    __global float *matrix_ig1, ///<[out] Tensor of gradients at previous layer
                                    __global float *matrix_ig2, ///<[out] Tensor of gradients at previous layer
                                    int outputs, ///< Number of outputs
                                    int inputs1, int inputs2,
                                    int activation1, ///< Activation type (#ENUM_ACTIVATION)
                                    int activation2  ///< Activation type (#ENUM_ACTIVATION)
                                   )
  {
// Backpropagates the gradient through the concatenated fully connected layer
// and splits it back into the two source tensors, each with its own
// activation derivative. One work item per concatenated input element.
   int i = get_global_id(0);
   if(i >= (inputs1 + inputs2))
      return;
   int inputs = inputs1 + inputs2;
   float sum = 0;
//--- forward-pass output of the element this gradient belongs to
   float out = (i < inputs1 ? matrix_o1[i] : matrix_o2[i - inputs1]);
   float4 grad, weight;
//--- dot product of the layer gradient with this input's weight column,
//--- vectorized in chunks of 4 with explicit tail cases
   for(int k = 0; k < outputs; k += 4)
     {
      switch(outputs - k)
        {
         case 1:
            weight = (float4)(matrix_w[k * (inputs + 1) + i], 0, 0, 0);
            grad = (float4)(matrix_g[k], 0, 0, 0);
            break;
         case 2:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], 0, 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i],
                              matrix_w[(k + 1) * (inputs + 1) + i], 0, 0);
            break;
         case 3:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i],
                              matrix_w[(k + 1) * (inputs + 1) + i],
                              matrix_w[(k + 2) * (inputs + 1) + i], 0);
            break;
         default:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2],
                            matrix_g[k + 3]);
            weight = (float4)(matrix_w[k * (inputs + 1) + i],
                              matrix_w[(k + 1) * (inputs + 1) + i],
                              matrix_w[(k + 2) * (inputs + 1) + i],
                              matrix_w[(k + 3) * (inputs + 1) + i]);
            break;
        }
      sum += dot(grad, weight);
     }
   if(isnan(sum))
      sum = 0;
//--- route the result to the tensor the element came from, applying that
//--- tensor's activation derivative
   if(i < inputs1)
      matrix_ig1[i] = Deactivation(sum, out, activation1);
   else
      matrix_ig2[i - inputs1] = Deactivation(sum, out, activation2);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void Concat_UpdateWeightsMomentum(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                           __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                           __global float *matrix_i1, ///<[in] Inputs tensor
                                           __global float *matrix_i2, ///<[in] Inputs tensor
                                           __global float
                                           *matrix_dw, ///<[in,out] Matrix of delta weights in last correction
                                           int inputs1,           ///< Number of inputs
                                           int inputs2,           ///< Number of inputs
                                           float learning_rates,  ///< Learning rates
                                           float momentum         ///< Momentum multiplier
                                          )
  {
//--- dim 0 - output neuron, dim 1 - weight index inside the row
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int row = inputs1 + inputs2 + 1;
   if(j >= row)
      return;
   const int wi = i * row + j;
//--- pick the input: first tensor, second tensor, or constant 1 for the bias
   float inp = 1;
   if(j < inputs1)
      inp = matrix_i1[j];
   else
      if((j - inputs1) < inputs2)
         inp = matrix_i2[j - inputs1];
//--- SGD step with momentum; NaN deltas leave both buffers untouched
   float delta = learning_rates * matrix_g[i] * inp + momentum * matrix_dw[wi];
   if(isnan(delta))
      return;
   matrix_dw[wi] = delta;
   if(fabs(delta) > 0)
      matrix_w[wi] = matrix_w[wi] + delta;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void Concat_UpdateWeightsAdam(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
                                       ///< number of neurons in previous layer and n -
                                       ///< number of neurons in current layer
                                       __global const float
                                       *matrix_g, ///<[in] Tensor of gradients at current layer
                                       __global const float *matrix_i1, ///<[in] Inputs tensor
                                       __global const float *matrix_i2, ///<[in] Inputs tensor
                                       __global float *matrix_m, ///<[in,out] Matrix of first momentum
                                       __global float *matrix_v, ///<[in,out] Matrix of second momentum
                                       const int inputs1, ///< Number of inputs
                                       const int inputs2, ///< Number of inputs
                                       const float l,     ///< Learning rates
                                       const float b1,    ///< First momentum multiplier
                                       const float b2     ///< Second momentum multiplier
                                      )
  {
//--- dim 0 - output neuron, dim 1 - weight index inside the row
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int row = inputs1 + inputs2 + 1;
   if(j >= row)
      return;
   const int wi = i * row + j;
//--- pick the input: first tensor, second tensor, or constant 1 for the bias
   float inp = 1;
   if(j < inputs1)
      inp = matrix_i1[j];
   else
      if((j - inputs1) < inputs2)
         inp = matrix_i2[j - inputs1];
//--- exponentially decayed first and second Adam moments
   const float w = matrix_w[wi];
   const float g = matrix_g[i] * inp;
   const float mt = b1 * matrix_m[wi] + (1 - b1) * g;
   const float vt = b2 * matrix_v[wi] + (1 - b2) * (g * g);
//--- parameter step with L1/L2 regularization
   const float delta =
      l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(w) + l2 * w));
   if(fabs(delta) > 0)
      matrix_w[wi] = w + delta;
   matrix_m[wi] = mt;
   matrix_v[wi] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SoftUpdate(__global float *target,       ///<[in,out] Target matrix
                         __global const float *source, ///<[in] Source matrix
                         const float tau               ///<[in] Multiplicator Tau
                        )
  {
//--- Polyak averaging: target = tau * source + (1 - tau) * target
   const int i = get_global_id(0);
   float t = target[i];
   float s = source[i];
   target[i] = s * tau + (1.0f - tau) * t;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SoftUpdateAdam(__global float *target, __global const float *source,
                             __global float *matrix_m, ///<[in,out] Matrix of first momentum
                             __global float *matrix_v, ///<[in,out] Matrix of second momentum
                             const float tau, ///<[in] Multiplicator Tau
                             const float b1,  ///< First momentum multiplier
                             const float b2   ///< Second momentum multiplier
                            )
  {
//--- Adam-smoothed soft update of the target towards the source:
//--- the difference is treated as a gradient and run through Adam moments
   const int i = get_global_id(0);
   float w = target[i];
   float g = source[i] - w;
   float m = b1 * matrix_m[i] + (1 - b1) * g;
   float v = b2 * matrix_v[i] + (1 - b2) * (g * g);
//--- step scaled by (1 - tau); guard the denominator when v == 0
   float denom = (v != 0.0f ? sqrt(v) : 1.0f);
   float delta = (1 - tau) * m / denom;
   if(fabs(delta) > 0)
      target[i] = w + delta;
   matrix_m[i] = m;
   matrix_v[i] = v;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SAC_AlphaLogProbs(__global float *outputs,
                                __global float *quantiles,
                                __global float *probs,
                                __global float *alphas,
                                __global float *log_probs,
                                __global float *random,
                                const int count_quants,
                                const int activation)
  {
// Samples one quantile per action by inverse-CDF sampling and stores the
// activated value plus the temperature-weighted log-probability term used
// by the SAC entropy bonus. One work item per action.
   const int i = get_global_id(0);
   int shift = i * count_quants;
   float prob = 0;
   float value = 0;
   float sum = 0;
   float rnd = random[i];
//--- walk the quantile probabilities until the cumulative mass reaches the
//--- random draw; fall back to the last quantile if it never does
   for(int r = 0; r < count_quants; r++)
     {
      prob = probs[shift + r];
      sum += prob;
      if(sum >= rnd || r == (count_quants - 1))
        {
         value = quantiles[shift + r];
         break;
        }
     }
//---
   outputs[i] = fActivation(value, activation);
//--- clamp the probability as SAC_CalcLogProbs does: log(0) would produce an
//--- infinite entropy term when the sampled quantile carries zero mass
   prob = clamp(prob, 1.0e-3f, 1.0f);
   log_probs[i] = -alphas[i] * log(prob);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SAC_AlphaGradients(__global float *outputs,
                                 __global float *gradient,
                                 __global float *log_probs,
                                 __global float *alphas_grad,
                                 const int activation)
  {
//--- one work item per alpha: gradient of the entropy-temperature term
   const int i = get_global_id(0);
   const float out = outputs[i];
   const float raw_grad = -gradient[i] * log_probs[i];
//--- undo the activation applied on the forward pass
   alphas_grad[i] = Deactivation(raw_grad, out, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SAC_OutputGradient(__global float *quantiles, __global float *delta_taus,
                                 __global float *output_gr, __global float *quantiles_gr,
                                 __global float *taus_gr, __global float *output,
                                 const int count_quants, const int activation)
  {
// Routes the action-level gradient to the two activated quantiles that
// bracket the current output value (the piecewise-linear segment it falls
// on); every other quantile/tau gradient of the action is zeroed.
// One work item per action.
   size_t action = get_global_id(0);
   int shift = action * count_quants;
   float quant1 = -1e37f;   // closest activated quantile at or below 'value'
   float quant2 = 1e37f;    // closest activated quantile above 'value'
   int pos1 = -1;
   int pos2 = -1;
   float value = output[action];
//--- scan all quantiles, tracking the bracketing pair and clearing gradients
   for(int i = 0; i < count_quants; i++)
     {
      float quant = fActivation(quantiles[shift + i], activation);
      if(value >= quant && quant1 < quant)
        {
         quant1 = quant;
         pos1 = shift + i;
        }
      if(value < quant && quant2 > quant)
        {
         quant2 = quant;
         pos2 = shift + i;
        }
      quantiles_gr[shift + i] = 0.0f;
      taus_gr[shift + i] = 0.0f;
     }
//--- write the gradient only to the bracketing quantiles that were found
   float gradient = output_gr[action];
   if(quant1 > -1e37f)
     {
      quantiles_gr[pos1] = gradient * delta_taus[pos1];
      taus_gr[pos1] = gradient * quant1;
     }
   if(quant2 < 1e37f)
     {
      quantiles_gr[pos2] = gradient * delta_taus[pos2];
      taus_gr[pos2] = gradient * quant2;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SAC_CalcLogProbs(__global float *outputs,
                               __global float *quantiles, __global float *probs,
                               __global float *alphas,
                               __global float *log_probs,
                               const int count_quants, const int activation)
  {
// Estimates the log-probability of an already chosen action value by
// linearly interpolating the probabilities of the two activated quantiles
// that bracket it, then stores the temperature-weighted entropy term.
// One work item per action.
   const int i = get_global_id(0);
   int shift = i * count_quants;
   float quant1 = -1e37f;   // closest activated quantile at or below 'value'
   float quant2 = 1e37f;    // closest activated quantile above 'value'
   float prob1 = 0;
   float prob2 = 0;
   float value = outputs[i];
//--- find the bracketing quantiles and remember their probabilities
   for(int q = 0; q < count_quants; q++)
     {
      float quant = fActivation(quantiles[shift + q], activation);
      if(value >= quant && quant1 < quant)
        {
         quant1 = quant;
         prob1 = probs[shift + q];
        }
      if(value < quant && quant2 > quant)
        {
         quant2 = quant;
         prob2 = probs[shift + q];
        }
     }
//--- linear interpolation between the two probabilities, clamped to
//--- [1e-3, 1] so log() stays finite
//--- NOTE(review): if no bracketing pair is found (value outside all
//--- quantiles) or quant1 == quant2, the division yields inf/NaN before the
//--- clamp - confirm the host guarantees a valid bracket
   float prob = fabs(value - quant1) / fabs(quant2 - quant1);
   prob = clamp((1 - prob) * prob1 + prob * prob2, 1.0e-3f, 1.0f);
   log_probs[i] = -alphas[i] * log(prob);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void Embedding(__global float *inputs, __global float *outputs,
                        __global float *weights, __global int *windows,
                        __global float *std, const int stack_size)
  {
// Projects each input window into an embedding vector, normalizes it to
// zero mean and unit variance across the embedding (work-group reductions),
// and pushes it into a FIFO stack of the last 'stack_size' embeddings kept
// in 'outputs'. dim 0 - embedding element, dim 1 - embedding (window) index.
   const int window_out = get_global_size(0);
   const int pos = get_global_id(0);
   const int emb = get_global_id(1);
   const int emb_total = get_global_size(1);
   const int shift_out = emb * window_out + pos;
   const int step = emb_total * window_out;
   const uint ls = min((uint)get_local_size(0), (uint)LOCAL_ARRAY_SIZE);
//--- shift the stack: each older embedding moves one slot deeper
   for(int i = stack_size - 1; i > 0; i--)
      outputs[i * step + shift_out] = outputs[(i - 1) * step + shift_out];
//--- offset of this embedding's input window (windows may differ in size)
   int shift_in = 0;
//---
   for(int i = 0; i < emb; i++)
      shift_in += windows[i];
   const int window_in = windows[emb];
//--- each output element owns a row of (window_in + 1) weights, bias last
   const int shift_weights = (shift_in + emb) * window_out + (window_in + 1) * pos;
//--- local scratch for the mean/variance reductions
   __local float temp[LOCAL_ARRAY_SIZE];
   if(pos < LOCAL_ARRAY_SIZE)
      temp[pos] = 0;
   BarrierLoc
//--- affine projection: bias plus dot product with the input window
   float value = weights[shift_weights + window_in];
//---
   for(int i = 0; i < window_in; i++)
      value += inputs[shift_in + i] * weights[shift_weights + i];
//--- accumulate the sum of all elements into the local array
   for(int i = 0; i < window_out; i += ls)
     {
      if(pos >= i && pos < (i + ls))
         temp[pos % ls] += value;
      BarrierLoc
     }
//--- tree reduction of the partial sums
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(pos + count < ls)
        {
         if(pos < count)
            temp[pos] += temp[pos + count];
         temp[pos + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
//--- subtract the mean of the embedding
   value -= temp[0] / (float)window_out;
   BarrierLoc
//--- second reduction: variance of the centered values
   if(pos < LOCAL_ARRAY_SIZE)
      temp[pos] = 0;
   BarrierLoc
//---
   for(int i = 0; i < window_out; i += ls)
     {
      if(pos >= i && pos < (i + ls))
         temp[pos % ls] += (value * value) / (float)window_out;
      BarrierLoc
     }
//---
   count = ls;
   do
     {
      count = (count + 1) / 2;
      if(pos + count < ls)
        {
         if(pos < count)
            temp[pos] += temp[pos + count];
         temp[pos + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
//--- divide by the standard deviation (skipped when variance is zero)
   if(temp[0] > 0)
      value /= sqrt(temp[0]);
//--- slot 0 of the stack receives the fresh embedding; the per-embedding
//--- std is saved for the backward pass
   outputs[shift_out] = value;
   if(pos == 0)
      std[emb] = sqrt(temp[0]);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void EmbeddingHiddenGradient(__global float *inputs_gradient,
                                      __global float *outputs_gradient,
                                      __global float *weights,
                                      __global int *windows,
                                      __global float *std,
                                      const int window_out)
  {
// Backpropagates the embedding-layer gradient to the raw inputs:
// one work item per input element across all embedding windows.
   const int pos = get_global_id(0);
//--- locate the embedding block that owns input element 'pos'
   int emb = -1;
   int count = 0;
   do
     {
      emb++;
      count += windows[emb];
     }
   while(count <= pos); // was 'count < pos': the first element of every block
                        // after the first was attributed to the previous
                        // embedding (cf. EmbeddingUpdateWeightsAdam's loop)
   const int shift_out = emb * window_out;
   const int shift_weights = pos + (count - windows[emb] + emb) * window_out;
//--- accumulate the gradient through this input's weights
//--- NOTE(review): the stride 'window_out' between consecutive outputs does
//--- not obviously match the (window_in + 1)-per-output row layout used by
//--- the Embedding forward kernel - verify the weight indexing
   float value = 0;
//---
   for(int i = 0; i < window_out; i++)
      value += outputs_gradient[shift_out + i] * weights[shift_weights + i * window_out];
//--- undo the per-embedding normalization applied on the forward pass
   float s = std[emb];
   if(s > 0)
      value /= s;
//---
   inputs_gradient[pos] = value;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void EmbeddingUpdateWeightsAdam(__global float *weights, ///<[in,out] Weights matrix (m+1)*n, where m -
                                         ///< number of neurons in previous layer and n -
                                         ///< number of neurons in current layer
                                         __global const float *gradient, ///<[in] Tensor of gradients at current layer
                                         __global const float *inputs,   ///<[in] Inputs tensor
                                         __global float *matrix_m, ///<[in,out] Matrix of first momentum
                                         __global float *matrix_v, ///<[in,out] Matrix of second momentum
                                         __global int *windows, __global float *std, const int window_out,
                                         const float l,  ///< Learning rates
                                         const float b1, ///< First momentum multiplier
                                         const float b2  ///< Second momentum multiplier
                                        )
  {
// Adam update of the embedding weights: one work item per weight.
   const int i = get_global_id(0);
//--- locate the embedding block that owns weight 'i'
   int emb = -1;
   int count = 0;
   int shift = 0;
   int window_in = 0;
   do
     {
      emb++;
      shift = count;
      window_in = windows[emb];
      count += (window_in + 1) * window_out;
     }
   while(count <= i);
   const int shift_out = emb * window_out;
   int shift_in = shift / window_out - emb;
   shift = (i - shift) % (window_in + 1);
//--- the last weight of every row is the bias (constant input of 1)
   float inp = 1.0f;
   if(shift < window_in)
      inp = inputs[shift_in + shift];
//--- NOTE(review): gradient[shift_out] addresses only the first output of
//--- the embedding for every weight of the block - presumably the per-output
//--- index should be added; verify against the backward pass
   float weight = weights[i];
   float g = gradient[shift_out] * inp;
//--- undo the forward-pass normalization; skip when std == 0 to avoid an
//--- infinite gradient (EmbeddingHiddenGradient applies the same guard)
   float s = std[emb];
   if(s > 0)
      g /= s;
   float mt = b1 * matrix_m[i] + (1 - b1) * g;
   float vt = b2 * matrix_v[i] + (1 - b2) * (g * g);
   float delta =
      l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(fabs(delta) > 0)
      weights[i] = weights[i] + delta;
   matrix_m[i] = mt;
   matrix_v[i] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Matrix transpose: one work-item per element of the input matrix.
//| Global dimension 0 spans rows, dimension 1 spans columns.
__kernel void Transpose(__global float *matrix_in, ///<[in] Input matrix
__global float *matrix_out ///<[out] Output matrix
)
{
const int row = get_global_id(0);
const int col = get_global_id(1);
const int n_rows = get_global_size(0);
const int n_cols = get_global_size(1);
//--- read row-major, write column-major
const int src = row * n_cols + col;
const int dst = col * n_rows + row;
matrix_out[dst] = matrix_in[src];
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Grouped-query attention forward pass: `heads` query heads share
//| `heads_kv` key/value heads. One work-item per (query q_id, key k, head h);
//| keys span the local dimension so the soft-max and the value sum can be
//| reduced in local memory (LocalSoftMax / LocalSum are helpers defined
//| elsewhere in this file).
__kernel void MH2AttentionOut(__global float *q, ///<[in] Matrix of Querys
__global float *kv, ///<[in] Interleaved Keys/Values: per key position, heads_kv key vectors then heads_kv value vectors
__global float *score, ///<[out] Matrix of Scores
__global float *out, ///<[out] Matrix of attention
int dimension, ///< Dimension of Key
int heads_kv, ///< Number of shared key/value heads
int mask ///< 1 - calc only previous units, 0 - calc all
)
{
//--- init
const int q_id = get_global_id(0);
const int k = get_local_id(1);
const int h = get_global_id(2);
const int qunits = get_global_size(0);
const int kunits = get_local_size(1);
const int heads = get_global_size(2);
// query head h is served by key/value head h % heads_kv
const int h_kv = h % heads_kv;
const int shift_q = dimension * (q_id * heads + h);
// each key position stores 2*heads_kv vectors: keys first, then values
const int shift_k = dimension * (2 * heads_kv * k + h_kv);
const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
const int shift_s = kunits * (q_id * heads + h) + k;
// (ls is computed but not used below)
const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
// scaled dot-product attention: raw scores divided by sqrt(dimension)
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
__local float temp[LOCAL_ARRAY_SIZE];
//--- Score: dot(Q,K); masked (future) positions keep MIN_VALUE so the
//--- soft-max drives their weight to zero
float sum = MIN_VALUE;
if(mask == 0 || q_id >= k)
{
sum = 0;
for(int d = 0; d < dimension; d++)
sum += q[shift_q + d] * kv[shift_k + d];
}
float sc = LocalSoftMax(IsNaNOrInf(sum / koef, MIN_VALUE), 1, temp);
score[shift_s] = sc;
//--- out: value vectors weighted by the soft-maxed scores, reduced across
//--- the key work-items; only the k==0 item writes the result
for(int d = 0; d < dimension; d++)
{
BarrierLoc
sum = LocalSum(IsNaNOrInf(kv[shift_v + d ] * sc, 0), 1, temp);
//---
if(k == 0)
out[shift_q + d] = sum;
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Backward pass of grouped-query attention (see MH2AttentionOut).
//| One work-item per (query q_id, dimension d, query head h).
__kernel void MH2AttentionInsideGradients(__global float *q, __global float *q_g,
__global float *kv, __global float *kv_g,
__global float *scores, __global float *gradient,
int kunits, int heads_kv)
{
//--- init
const int q_id = get_global_id(0);
const int d = get_global_id(1);
const int h = get_global_id(2);
const int qunits = get_global_size(0);
const int dimension = get_global_size(1);
const int heads = get_global_size(2);
// key/value head serving query head h
const int h_kv = h % heads_kv;
const int shift_q = dimension * (q_id * heads + h) + d;
const int shift_s = q_id * kunits * heads + h * kunits;
// per-head offset inside one position of the output-gradient tensor
const int shift_g = h * dimension + d;
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
//--- Calculating Value's gradients
int step_score = kunits * heads;
// only the first heads_kv items on the head axis own a K/V head
if(h < heads_kv)
{
//--- value rows are distributed over the query work-items
for(int v = q_id; v < kunits; v += qunits)
{
float grad = 0;
// accumulate over all query heads sharing this K/V head
for(int hq = h; hq < heads; hq += heads_kv)
{
int shift_score = hq * kunits + v;
for(int g = 0; g < qunits; g++)
grad += gradient[shift_g + dimension * (hq - h + g * heads)] *
scores[shift_score + g * step_score];
}
int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d;
kv_g[shift_v] = grad;
}
}
//--- Calculating Query's gradients
//--- soft-max Jacobian: dS_k = sum_v S_v * (dOut . V_v) * ((k==v) - S_k)
float grad = 0;
float out_g = gradient[shift_g + q_id * dimension];
int shift_val = (heads_kv + h_kv) * dimension + d;
int shift_key = h_kv * dimension + d;
//---
for(int k = 0; k < kunits; k++)
{
float sc_g = 0;
float sc = scores[shift_s + k];
// zero score contributes nothing (true in particular for masked positions)
if(sc == 0)
continue;
for(int v = 0; v < kunits; v++)
sc_g += scores[shift_s + v] * out_g * kv[shift_val + 2 * v * heads_kv * dimension] *
((float)(k == v) - sc);
grad += sc_g * kv[shift_key + 2 * k * heads_kv * dimension];
}
q_g[shift_q] = grad / koef;
//--- Calculating Key's gradients
if(h < heads_kv)
{
// NOTE(review): the Value branch above steps hq by heads_kv; this loop
// steps by 1 and therefore also visits heads that do not share this K/V
// head, while shift_g is not adjusted per hq -- verify against the
// host-side implementation.
for(int k = q_id; k < kunits; k += qunits)
{
int shift_k = dimension * (2 * heads_kv * k + h_kv) + d;
grad = 0;
for(int hq = h; hq < heads; hq++)
{
int shift_score = hq * kunits + k;
float val = kv[shift_k + heads_kv * dimension];
for(int scr = 0; scr < qunits; scr++)
{
float sc_g = 0;
int shift_sc = scr * kunits * heads;
// NOTE(review): shift_sc carries no head offset (hq * kunits is
// computed in shift_score but not used here) -- confirm.
float sc = scores[shift_sc + k];
if(sc == 0)
continue;
for(int v = 0; v < kunits; v++)
sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] *
val * ((float)(k == v) - sc);
grad += sc_g * q[shift_q + scr * dimension];
}
}
kv_g[shift_k] = grad / koef;
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Backward pass of the gated (element-wise product) convolution branch.
//| The layer output is f*s, so by the product rule each input's raw
//| gradient is the incoming gradient times the other input; both are then
//| passed through the derivative of their own activation.
__kernel void CGConv_HiddenGradient(__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_f, ///<[in] Previous layer Output tensor
__global const float *matrix_s, ///<[in] Previous layer Output tensor
__global float *matrix_fg, ///<[out] Tensor of gradients at previous layer
__global float *matrix_sg, ///<[out] Tensor of gradients at previous layer
const int activationf, ///< Activation type (#ENUM_ACTIVATION)
const int activations ///< Activation type (#ENUM_ACTIVATION)
)
{
const int idx = get_global_id(0);
//---
const float out_grad = matrix_g[idx];
const float f_val = matrix_f[idx];
const float s_val = matrix_s[idx];
//--- cross the gradients and undo each branch's activation
matrix_fg[idx] = Deactivation(out_grad * s_val, f_val, activationf);
matrix_sg[idx] = Deactivation(out_grad * f_val, s_val, activations);
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| XCiT cross-covariance attention forward pass: attention is computed
//| between feature channels (a dimension x dimension map) instead of
//| between tokens. A work-group spans (dimension, units) for one head.
__kernel void XCiTFeedForward(__global float *qkv, __global float *score,
__global float *out)
{
const size_t d = get_local_id(0);
const size_t dimension = get_local_size(0);
const size_t u = get_local_id(1);
const size_t units = get_local_size(1);
const size_t h = get_global_id(2);
const size_t heads = get_global_size(2);
//---
const uint ls_u = min((uint)units, (uint)LOCAL_ARRAY_SIZE);
const uint ls_d = min((uint)dimension, (uint)LOCAL_ARRAY_SIZE);
__local float q[LOCAL_ARRAY_SIZE][LOCAL_ARRAY_SIZE];
__local float k[LOCAL_ARRAY_SIZE][LOCAL_ARRAY_SIZE];
//--- Normalize Query and Key: accumulate per-channel sums of squares across
//--- units, tree-reduce them in local memory, then divide in place
for(int cur_d = 0; cur_d < dimension; cur_d += ls_d)
{
float q_val = 0;
float k_val = 0;
//---
if(d < ls_d && (cur_d + d) < dimension && u < ls_u)
{
for(int count = u; count < units; count += ls_u)
{
// qkv layout per unit: Q, K, V blocks of dimension*heads each
int shift = count * dimension * heads * 3 + dimension * h + cur_d + d;
q_val += qkv[shift] * qkv[shift];
k_val += qkv[shift + dimension * heads] * qkv[shift + dimension * heads];
}
q[u][d] = q_val;
k[u][d] = k_val;
}
BarrierLoc
//--- pairwise tree reduction over the unit axis
uint count = ls_u;
do
{
count = (count + 1) / 2;
if(d < ls_d)
{
if(u < ls_u && u < count && (u + count) < units)
{
float q_val = q[u][d] + q[u + count][d];
float k_val = k[u][d] + k[u + count][d];
q[u + count][d] = 0;
k[u + count][d] = 0;
q[u][d] = q_val;
k[u][d] = k_val;
}
}
BarrierLoc
}
while(count > 1);
//--- divide each element by its channel norm
// NOTE(review): the written column is cur_d (without +d) while the norm is
// read at q[0][d]; items with different d then rescale the same element by
// different norms -- verify, `cur_d + d` looks intended
int shift = u * dimension * heads * 3 + dimension * h + cur_d;
qkv[shift] = qkv[shift] / sqrt(q[0][d]);
qkv[shift + dimension * heads] =
qkv[shift + dimension * heads] / sqrt(k[0][d]);
BarrierLoc
}
//--- Score
int step = dimension * heads * 3;
//--- build the dimension x dimension score map with a soft-max per row
for(int cur_r = 0; cur_r < dimension; cur_r += ls_u)
{
for(int cur_d = 0; cur_d < dimension; cur_d += ls_d)
{
if(u < ls_d && d < ls_d)
q[u][d] = 0;
BarrierLoc
//---
// NOTE(review): these bounds compare against the local-array size ls_d
// rather than `dimension`; confirm the behaviour when dimension > ls_d
if((cur_r + u) < ls_d && (cur_d + d) < ls_d)
{
int shift_q = dimension * h + cur_d + d;
int shift_k = dimension * (heads + h) + cur_r + u;
float scr = 0;
// cross-covariance: dot product taken over the unit (token) axis
for(int i = 0; i < units; i++)
scr += qkv[shift_q + i * step] * qkv[shift_k + i * step];
scr = exp(scr / sqrt((float)units));
score[(cur_r + u) * dimension * heads + dimension * h + cur_d + d] =
scr;
q[u][d] += scr;
}
}
BarrierLoc
//--- reduce the exp-scores along the row -> soft-max denominator
int count = ls_d;
do
{
count = (count + 1) / 2;
if(u < ls_d)
{
if(d < ls_d && d < count && (d + count) < dimension)
q[u][d] += q[u][d + count];
if(d + count < ls_d)
q[u][d + count] = 0;
}
BarrierLoc
}
while(count > 1);
//--- normalize the row
if((cur_r + u) < ls_d)
score[(cur_r + u) * dimension * heads + dimension * h + d] /= q[u][0];
BarrierLoc
}
//--- out[u,d] = sum_i V[u,i] * S[d,i] for this head
int shift_out = dimension * (u * heads + h) + d;
int shift_s = dimension * (heads * d + h);
int shift_v = dimension * (heads * (u * 3 + 2) + h);
float sum = 0;
//---
for(int i = 0; i < dimension; i++)
sum += qkv[shift_v + i] * score[shift_s + i];
out[shift_out] = sum;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| XCiT backward pass: one work-item per (unit q, channel d, head h).
__kernel void XCiTInsideGradients(__global float *qkv, __global float *qkv_g,
__global float *scores,
__global float *gradient)
{
//--- init
const int q = get_global_id(0);
const int d = get_global_id(1);
const int h = get_global_id(2);
const int units = get_global_size(0);
const int dimension = get_global_size(1);
const int heads = get_global_size(2);
// qkv layout per unit: Q, K, V blocks of dimension*heads each
const int shift_q = dimension * (heads * 3 * q + h);
const int shift_k = dimension * (heads * (3 * q + 1) + h);
const int shift_v = dimension * (heads * (3 * q + 2) + h);
const int shift_g = dimension * (heads * q + h);
int shift_score = dimension * h;
int step_score = dimension * heads;
//--- Calculating Value's gradients: dV[q,d] = sum_i dOut[q,i] * S[i,d]
float sum = 0;
//---
for(int i = 0; i < dimension; i++)
sum += gradient[shift_g + i] * scores[shift_score + d + i * step_score];
qkv_g[shift_v + d] = sum;
//--- Calculating Query's gradients (soft-max Jacobian over a score row)
float grad = 0;
float val = qkv[shift_v + d];
//---
for(int k = 0; k < dimension; k++)
{
float sc_g = 0;
// NOTE(review): the score row is fixed at row 0 of this head (shift_score
// carries no row component), and `gradient` below is strided by
// `dimension` although the per-unit stride is dimension*heads -- verify
// against the forward pass before relying on these gradients.
float sc = scores[shift_score + k];
for(int v = 0; v < dimension; v++)
sc_g += scores[shift_score + v] * val *
gradient[shift_g + v * dimension] * ((float)(k == v) - sc);
grad += sc_g * qkv[shift_k + k];
}
qkv_g[shift_q + d] = grad / sqrt((float)units);
//--- Calculating Key's gradients
grad = 0;
float out_g = gradient[shift_g];
//---
for(int scr = 0; scr < dimension; scr++)
{
float sc_g = 0;
int shift_sc = scr * dimension * heads;
float sc = scores[shift_sc + d];
for(int v = 0; v < dimension; v++)
sc_g += scores[shift_sc + v] * out_g * qkv[shift_v + v] *
((float)(d == v) - sc);
grad += sc_g * qkv[shift_q + scr];
}
qkv_g[shift_k + d] = grad / sqrt((float)units);
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Local-window attention forward pass: each unit attends only to its
//| direct neighbours (previous, self, next), with a learned relative
//| positional bias `rpb` added before the soft-max. The work-group spans
//| the dimension axis so partial dot products can be tree-reduced in
//| local memory.
__kernel void DOTFeedForward(__global float *qkv, __global float *score,
__global float *rpb, __global float *out)
{
const size_t d = get_local_id(0);
const size_t dimension = get_local_size(0);
const size_t u = get_global_id(1);
const size_t units = get_global_size(1);
const size_t h = get_global_id(2);
const size_t heads = get_global_size(2);
//---
// per-unit stride of the interleaved Q,K,V tensor
uint step = 3 * dimension * heads;
// attention window [start, stop] clipped at the sequence borders
uint start = max((int)u - 1, 0);
uint stop = min((int)u + 1, (int)units - 1);
uint shift_q = u * step + h * dimension;
uint shift_k = start * step + dimension * (heads + h);
uint shift_score = u * 3 * heads;
//---
const uint ls_d = min((uint)dimension, (uint)LOCAL_ARRAY_SIZE);
__local float temp[LOCAL_ARRAY_SIZE][3];
//--- Score: temp[d][i] accumulates the partial Q.K dot product for window
//--- slot i, reduced over the dimension axis below
// NOTE(review): the barriers below sit inside `if(d < ls_d)`; if
// dimension > LOCAL_ARRAY_SIZE some work-items skip them, which is
// undefined behaviour for work-group barriers -- confirm dimension is
// always <= LOCAL_ARRAY_SIZE.
if(d < ls_d)
{
//---
for(uint pos = start; pos <= stop; pos++)
temp[d][pos - start] = 0;
//---
for(uint dim = d; dim < dimension; dim += ls_d)
{
float q = qkv[shift_q + dim];
for(uint pos = start; pos <= stop; pos++)
{
uint i = pos - start;
temp[d][i] = temp[d][i] + q * qkv[shift_k + i * step + dim];
}
}
BarrierLoc
//--- pairwise tree reduction over the dimension axis
int count = ls_d;
//---
do
{
count = (count + 1) / 2;
if(d < count && (d + count) < dimension)
for(uint i = 0; i <= (stop - start); i++)
{
temp[d][i] += temp[d + count][i];
temp[d + count][i] = 0;
}
BarrierLoc
}
while(count > 1);
}
//--- soft-max over the (at most 3) window slots, with the positional bias
if(d == 0)
{
float sum = 0;
//---
for(uint i = 0; i <= (stop - start); i++)
{
temp[0][i] = exp(temp[0][i] + rpb[shift_score + i]);
sum += temp[0][i];
}
//---
for(uint i = 0; i <= (stop - start); i++)
{
temp[0][i] = temp[0][i] / sum;
score[shift_score + i] = temp[0][i];
}
}
BarrierLoc
//--- weighted sum of the values
int shift_out = dimension * (u * heads + h) + d;
int shift_v = dimension * (heads * (u * 3 + 2) + h);
float sum = 0;
//---
// NOTE(review): `qkv[shift_v + i]` indexes dimension component i of the
// value vector at position u; a weighted sum over the window would read
// the value of position start+i at component d instead -- verify against
// the host-side implementation.
for(uint i = 0; i <= (stop - start); i++)
sum += qkv[shift_v + i] * temp[0][i];
out[shift_out] = sum;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Backward pass of the local-window attention (see DOTFeedForward).
//| One work-item per (unit u, dimension d, head h). Also produces the
//| gradient of the relative positional bias (rpb_g).
__kernel void DOTInsideGradients(__global float *qkv, __global float *qkv_g,
__global float *scores, __global float *rpb,
__global float *rpb_g,
__global float *gradient)
{
//--- init
const uint u = get_global_id(0);
const uint d = get_global_id(1);
const uint h = get_global_id(2);
const uint units = get_global_size(0);
const uint dimension = get_global_size(1);
const uint heads = get_global_size(2);
//---
// per-unit stride of the interleaved Q,K,V tensor
uint step = 3 * dimension * heads;
// attention window [start, stop] clipped at the sequence borders
uint start = max((int)u - 1, 0);
uint stop = min((int)u + 1, (int)units - 1);
const uint shift_q = u * step + dimension * h + d;
const uint shift_k = u * step + dimension * (heads + h) + d;
const uint shift_v = u * step + dimension * (2 * heads + h) + d;
//--- Calculating Value's gradients: sum the output gradients of every
//--- neighbour i that attends to u, weighted by i's score for slot u
for(uint i = start; i <= stop; i++)
{
// pick the window slot of unit u within neighbour i's score triple
int shift_score = i * 3 * heads;
if(u == i)
{
shift_score += (uint)(u > 0);
}
else
{
if(u > i)
shift_score += (uint)(start > 0) + 1;
}
uint shift_g = dimension * (i * heads + h) + d;
sum += gradient[shift_g] * scores[shift_score];
}
qkv_g[shift_v] = sum;
//--- Calculating Query's gradients via the soft-max Jacobian over the
//--- window; the same sc_g is the positional-bias gradient
float grad = 0;
uint shift_score = u * heads * 3;
//---
for(int k = start; k <= stop; k++)
{
float sc_g = 0;
float sc = scores[shift_score + k - start];
for(int v = start; v <= stop; v++)
for(int dim = 0; dim < dimension; dim++)
sc_g += scores[shift_score + v - start] *
qkv[v * step + dimension * (2 * heads + h) + dim] *
gradient[dimension * (u * heads + h) + dim] *
((float)(k == v) - sc);
grad += sc_g * qkv[k * step + dimension * (heads + h) + d];
if(d == 0)
rpb_g[shift_score + k - start] = sc_g;
}
qkv_g[shift_q] = grad;
//--- Calculating Key's gradients
grad = 0;
//---
for(int q = start; q <= stop; q++)
{
float sc_g = 0;
shift_score = q * heads * 3;
if(u == q)
shift_score += (uint)(u > 0);
else
{
if(u > q)
shift_score += (uint)(start > 0) + 1;
}
float sc = scores[shift_score];
for(int v = start; v <= stop; v++)
{
shift_score = v * heads * 3;
if(u == v)
shift_score += (uint)(u > 0);
else
{
if(u > v)
shift_score += (uint)(start > 0) + 1;
}
// NOTE(review): `(float)(d == v)` compares a dimension index with a
// unit index -- the Query branch uses `(k == v)`, so `(u == v)` looks
// intended. Likewise `qkv[shift_v - d + dim]` reads the value vector
// of position u rather than v, and `gradient[... + d]` is not summed
// over dim. Verify against the host-side implementation.
for(int dim = 0; dim < dimension; dim++)
sc_g += scores[shift_score] * qkv[shift_v - d + dim] *
gradient[dimension * (v * heads + h) + d] *
((float)(d == v) - sc);
}
grad += sc_g * qkv[q * step + dimension * h + d];
}
qkv_g[shift_k] = grad;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Adam-style update of the relative positional bias tensor (no learning
//| rate or regularization -- the raw ratio of the momenta is applied).
//| One work-item per bias element.
__kernel void RPBUpdateAdam(__global float *target, __global const float *gradient,
__global float *matrix_m, ///<[in,out] Matrix of first momentum
__global float *matrix_v, ///<[in,out] Matrix of second momentum
const float b1, ///< First momentum multiplier
const float b2 ///< Second momentum multiplier
)
{
const int idx = get_global_id(0);
//--- exponential moving averages of the gradient and its square
const float g = gradient[idx];
const float m_new = b1 * matrix_m[idx] + (1 - b1) * g;
const float v_new = b2 * matrix_v[idx] + (1 - b2) * g * g;
//--- unit denominator when the second momentum is exactly zero
const float denom = (v_new != 0.0f ? sqrt(v_new) : 1.0f);
target[idx] = target[idx] + m_new / denom;
matrix_m[idx] = m_new;
matrix_v[idx] = v_new;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Graph-transformer attention forward pass with two score channels per
//| head: "connected" (self and direct neighbours |q-k|<=1) and
//| "not connected" (all remaining pairs). One work-item per
//| (query cur_q, key cur_k, head h); keys span the local dimension so the
//| soft-max denominators can be reduced in local memory.
__kernel void GTEFeedForward(__global float *qkv, __global float *score,
__global float *out, int dimension)
{
const size_t cur_q = get_global_id(0);
const size_t units_q = get_global_size(0);
const size_t cur_k = get_local_id(1);
const size_t units_k = get_local_size(1);
const size_t h = get_global_id(2);
const size_t heads = get_global_size(2);
//---
int shift_q = dimension * (cur_q + h * units_q);
// NOTE(review): unlike shift_q/shift_v, shift_k is not multiplied by
// `dimension` -- confirm the qkv layout on the host side.
int shift_k = (cur_k + h * units_k + heads * units_q);
int shift_v = dimension * (h * units_k + heads * (units_q + units_k));
int shift_score_con = units_k * (cur_q * 2 * heads + h) + cur_k;
int shift_score_notcon = units_k * (cur_q * 2 * heads + heads + h) + cur_k;
int shift_out_con = dimension * (cur_q + h * units_q);
int shift_out_notcon = dimension * (cur_q + units_q * (h + heads));
//---
const uint ls_score = min((uint)units_k, (uint)LOCAL_ARRAY_SIZE);
__local float local_score[LOCAL_ARRAY_SIZE][2];
//--- Score: exp of the scaled dot product, clipped to avoid overflow
float scr = 0;
//---
for(int d = 0; d < dimension; d++)
scr += qkv[shift_q + d] * qkv[shift_k + d];
scr = exp(min(scr / sqrt((float)dimension), 30.0f));
// route the score into the "connected" or "not connected" channel;
// self-attention contributes to both
if(cur_q == cur_k)
{
score[shift_score_con] = scr;
score[shift_score_notcon] = scr;
if(cur_k < ls_score)
{
local_score[cur_k][0] = scr;
local_score[cur_k][1] = scr;
}
}
else
{
// signed difference: cur_q/cur_k are size_t, so the unsigned
// subtraction would wrap for cur_k > cur_q and the neighbour test
// would only fire on one side
if(abs((int)cur_q - (int)cur_k) == 1)
{
score[shift_score_con] = scr;
score[shift_score_notcon] = 0;
if(cur_k < ls_score)
{
local_score[cur_k][0] = scr;
local_score[cur_k][1] = 0;
}
}
else
{
score[shift_score_con] = 0;
score[shift_score_notcon] = scr;
if(cur_k < ls_score)
{
local_score[cur_k][0] = 0;
local_score[cur_k][1] = scr;
}
}
}
BarrierLoc
//--- fold the tail keys into the first ls_score local slots
for(int k = ls_score; k < units_k; k += ls_score)
{
if((cur_k + k) < units_k)
{
local_score[cur_k][0] += score[shift_score_con + k];
local_score[cur_k][1] += score[shift_score_notcon + k];
}
}
BarrierLoc
//--- pairwise tree reduction -> soft-max denominators in local_score[0]
int count = ls_score;
do
{
count = (count + 1) / 2;
if(cur_k < count)
{
if((cur_k + count) < units_k)
{
local_score[cur_k][0] += local_score[cur_k + count][0];
local_score[cur_k][1] += local_score[cur_k + count][1];
local_score[cur_k + count][0] = 0;
local_score[cur_k + count][1] = 0;
}
}
BarrierLoc
}
while(count > 1);
BarrierLoc
//--- normalize both score channels
score[shift_score_con] /= local_score[0][0];
score[shift_score_notcon] /= local_score[0][1];
BarrierLoc
//--- weighted value sums; the key work-items share the dimension loop
shift_score_con -= cur_k;
shift_score_notcon -= cur_k;
//---
for(int d = 0; d < dimension; d += ls_score)
{
if((cur_k + d) < dimension)
{
float sum_con = 0;
float sum_notcon = 0;
for(int v = 0; v < units_k; v++)
{
sum_con += qkv[shift_v + v * dimension + cur_k + d] *
score[shift_score_con + v];
sum_notcon += qkv[shift_v + v * dimension + cur_k + d] *
score[shift_score_notcon + v];
}
out[shift_out_con + cur_k + d] = sum_con;
out[shift_out_notcon + cur_k + d] = sum_notcon;
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Backward pass of the graph-transformer attention (see GTEFeedForward):
//| gradients flow through both score channels ("connected" and
//| "not connected"). One work-item per (unit u, dimension d, head h).
__kernel void GTEInsideGradients(__global float *qkv, __global float *qkv_g,
__global float *scores,
__global float *gradient)
{
//--- init
const uint u = get_global_id(0);
const uint d = get_global_id(1);
const uint h = get_global_id(2);
const uint units = get_global_size(0);
const uint dimension = get_global_size(1);
const uint heads = get_global_size(2);
//--- Calculating Value's gradients
{
int shift_out_con = dimension * h * units + d;
int shift_out_notcon = dimension * units * (h + heads) + d;
int shift_score_con = units * h + u;
int shift_score_notcon = units * (heads + h) + u;
int step_score = units * 2 * heads;
int shift_v = dimension * (h * units + 2 * heads * units + u) + d;
//---
float sum = 0;
//--- accumulate over all queries i (i < units: the former `i <= units`
//--- read one unit past the gradient/score regions)
for(uint i = 0; i < units; i++)
{
sum += gradient[shift_out_con + i * dimension] *
scores[shift_score_con + i * step_score];
sum += gradient[shift_out_notcon + i * dimension] *
scores[shift_score_notcon + i * step_score];
}
qkv_g[shift_v] = sum;
}
//--- Calculating Query's gradients (soft-max Jacobian, both channels)
{
int shift_q = dimension * (u + h * units) + d;
int shift_out_con = dimension * (h * units + u) + d;
int shift_out_notcon = dimension * (u + units * (h + heads)) + d;
int shift_score_con = units * h;
int shift_score_notcon = units * (heads + h);
int shift_v = dimension * (h * units + 2 * heads * units);
float grad = 0;
//---
for(int k = 0; k < units; k++)
{
// NOTE(review): shift_k is not scaled by `dimension`, mirroring the
// forward kernel -- confirm the qkv layout on the host side.
int shift_k = (k + h * units + heads * units) + d;
float sc_g = 0;
float sc_con = scores[shift_score_con + k];
float sc_notcon = scores[shift_score_notcon + k];
for(int v = 0; v < units; v++)
for(int dim = 0; dim < dimension; dim++)
{
sc_g += scores[shift_score_con + v] *
qkv[shift_v + v * dimension + dim] *
gradient[shift_out_con + dim] * ((float)(k == v) - sc_con);
sc_g += scores[shift_score_notcon + v] *
qkv[shift_v + v * dimension + dim] *
gradient[shift_out_notcon + dim] *
((float)(k == v) - sc_notcon);
}
grad += sc_g * qkv[shift_k];
}
qkv_g[shift_q] = grad;
}
//--- Calculating Key's gradients
{
int shift_k = (u + (h + heads) * units) + d;
int shift_out_con = dimension * h * units + d;
int shift_out_notcon = dimension * units * (h + heads) + d;
int shift_score_con = units * h + u;
int shift_score_notcon = units * (heads + h) + u;
int step_score = units * 2 * heads;
int shift_v = dimension * (h * units + 2 * heads * units);
float grad = 0;
//---
for(int q = 0; q < units; q++)
{
int shift_q = dimension * (q + h * units) + d;
float sc_g = 0;
float sc_con = scores[shift_score_con + u + q * step_score];
float sc_notcon = scores[shift_score_notcon + u + q * step_score];
for(int g = 0; g < units; g++)
{
for(int dim = 0; dim < dimension; dim++)
{
sc_g += scores[shift_score_con + g] *
qkv[shift_v + u * dimension + dim] *
gradient[shift_out_con + g * dimension + dim] *
((float)(u == g) - sc_con);
sc_g += scores[shift_score_notcon + g] *
qkv[shift_v + u * dimension + dim] *
gradient[shift_out_notcon + g * dimension + dim] *
((float)(u == g) - sc_notcon);
}
}
grad += sc_g * qkv[shift_q];
}
qkv_g[shift_k] = grad;
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Neural-ODE derivative function forward pass: for every (output
//| component, variable, sequence position) compute an affine map of the
//| inputs plus an explicit time term, then apply the activation.
//| Each weight row stores: `dimension` input weights, one time
//| coefficient (multiplied by the step h) and one bias.
__kernel void FeedForwardNODEF(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input
///< window and n - output window
__global float *matrix_i, ///<[in] Inputs tensor
__global float *matrix_o, ///<[out] Output tensor
int dimension, ///< input dimension
float step, ///< h
int activation ///< Activation type (#ENUM_ACTIVATION)
)
{
const int out_d = get_global_id(0);
const int out_dim = get_global_size(0);
const int var = get_global_id(1);
const int n_vars = get_global_size(1);
const int t = get_global_id(2);
//---
const int base = n_vars * t + var;
const int in_base = base * dimension;
const int w_row = (var * out_dim + out_d) * (dimension + 2);
//--- bias + time term
float acc = matrix_w[w_row + dimension + 1] + matrix_w[w_row + dimension] * step;
//--- weighted sum of the inputs
for(int j = 0; j < dimension; j++)
acc += matrix_w[w_row + j] * matrix_i[in_base + j];
//--- sanitize before activation
if(isnan(acc))
acc = 0;
matrix_o[base * out_dim + out_d] = fActivation(acc, activation);
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Runge-Kutta style combination step for the neural ODE: adds the six
//| K tensors to the input, each scaled by its beta coefficient. Terms
//| whose coefficient or value is zero/NaN are skipped.
__kernel void FeedForwardNODEInpK(__global float *matrix_i, ///<[in] Inputs tensor
__global float *matrix_k1, ///<[in] K1 tensor
__global float *matrix_k2, ///<[in] K2 tensor
__global float *matrix_k3, ///<[in] K3 tensor
__global float *matrix_k4, ///<[in] K4 tensor
__global float *matrix_k5, ///<[in] K5 tensor
__global float *matrix_k6, ///<[in] K6 tensor
__global float *matrix_beta, ///<[in] beta tensor
__global float *matrix_o ///<[out] Output tensor
)
{
const int idx = get_global_id(0);
//--- start from the raw input value
float result = matrix_i[idx];
//---
for(int term = 0; term < 6; term++)
{
const float b = matrix_beta[term];
// a zero or invalid coefficient contributes nothing
if(b == 0.0f || isnan(b))
continue;
//--- pick the K tensor for this term
float k = 0.0f;
switch(term)
{
case 0: k = matrix_k1[idx]; break;
case 1: k = matrix_k2[idx]; break;
case 2: k = matrix_k3[idx]; break;
case 3: k = matrix_k4[idx]; break;
case 4: k = matrix_k5[idx]; break;
case 5: k = matrix_k6[idx]; break;
}
// a zero or invalid K value contributes nothing either
if(k == 0.0f || isnan(k))
continue;
result += k * b;
}
//---
matrix_o[idx] = result;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Backward pass of the Runge-Kutta combination step: the incoming
//| gradient is passed through to the inputs unchanged and fanned out to
//| each K tensor scaled by its beta coefficient (NaN/Inf sanitized).
__kernel void HiddenGradientNODEInpK(__global float *matrix_ig, ///<[in] Inputs tensor
__global float *matrix_k1g, ///<[in] K1 tensor
__global float *matrix_k2g, ///<[in] K2 tensor
__global float *matrix_k3g, ///<[in] K3 tensor
__global float *matrix_k4g, ///<[in] K4 tensor
__global float *matrix_k5g, ///<[in] K5 tensor
__global float *matrix_k6g, ///<[in] K6 tensor
__global float *matrix_beta, ///<[in] beta tensor
__global float *matrix_og ///<[out] Output tensor
)
{
const int idx = get_global_id(0);
//--- sanitized gradient goes straight to the inputs
const float out_grad = IsNaNOrInf(matrix_og[idx], 0);
matrix_ig[idx] = out_grad;
//--- each K tensor receives the gradient scaled by its coefficient
for(int term = 0; term < 6; term++)
{
const float b = IsNaNOrInf(matrix_beta[term], 0.0f);
const float g = IsNaNOrInf(b * out_grad, 0.0f);
switch(term)
{
case 0: matrix_k1g[idx] = g; break;
case 1: matrix_k2g[idx] = g; break;
case 2: matrix_k3g[idx] = g; break;
case 3: matrix_k4g[idx] = g; break;
case 4: matrix_k5g[idx] = g; break;
case 5: matrix_k6g[idx] = g; break;
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Backward pass of the neural-ODE derivative function: back-propagate
//| the output gradients through the weight matrix to one input component
//| and apply the derivative of the input activation. Weight rows are
//| (dimension + 2) wide (inputs, time coefficient, bias).
__kernel void HiddenGradientNODEF(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input
///< window and n - output window
__global float *matrix_g, ///<[in] Gradient tensor
__global float *matrix_i, ///<[in] Inputs tensor
__global float *matrix_ig, ///<[out] Inputs Gradient tensor
int dimension_out, ///< output dimension
int activation ///< Input Activation type (#ENUM_ACTIVATION)
)
{
const int in_d = get_global_id(0);
const int in_dim = get_global_size(0);
const int var = get_global_id(1);
const int n_vars = get_global_size(1);
const int t = get_global_id(2);
//---
const int base = n_vars * t + var;
const int in_pos = base * in_dim + in_d;
const int grad_base = base * dimension_out;
const int w_step = in_dim + 2;
const int w_base = var * dimension_out * w_step + in_d;
//--- dot the output gradients with this input's weight column
float acc = 0;
for(int o = 0; o < dimension_out; o++)
acc += matrix_g[grad_base + o] * matrix_w[w_base + o * w_step];
if(isnan(acc))
acc = 0;
//--- undo the input activation
matrix_ig[in_pos] = Deactivation(acc, matrix_i[in_pos], activation);
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Adam optimizer step for the neural-ODE derivative-function weights.
//| One work-item per weight element; the gradient is accumulated over the
//| whole sequence and over all six Runge-Kutta stages. The last two
//| columns of each weight row are the bias (constant input 1) and the
//| time coefficient (input alpha[stage]).
__kernel void NODEF_UpdateWeightsAdam(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
///< number of neurons in previous layer and n -
///< number of neurons in current layer
__global const float *matrix_gk1, ///<[in] Tensor of gradients at k1
__global const float *matrix_gk2, ///<[in] Tensor of gradients at k2
__global const float *matrix_gk3, ///<[in] Tensor of gradients at k3
__global const float *matrix_gk4, ///<[in] Tensor of gradients at k4
__global const float *matrix_gk5, ///<[in] Tensor of gradients at k5
__global const float *matrix_gk6, ///<[in] Tensor of gradients at k6
__global const float *matrix_ik1, ///<[in] Inputs tensor
__global const float *matrix_ik2, ///<[in] Inputs tensor
__global const float *matrix_ik3, ///<[in] Inputs tensor
__global const float *matrix_ik4, ///<[in] Inputs tensor
__global const float *matrix_ik5, ///<[in] Inputs tensor
__global const float *matrix_ik6, ///<[in] Inputs tensor
__global float *matrix_m, ///<[in,out] Matrix of first momentum
__global float *matrix_v, ///<[in,out] Matrix of second momentum
__global const float *alpha, ///< h
const int lenth, ///< Number of inputs
const float l, ///< Learning rates
const float b1, ///< First momentum multiplier
const float b2 ///< Second momentum multiplier
)
{
const int d_in = get_global_id(0);
const int dimension_in = get_global_size(0);
const int d_out = get_global_id(1);
const int dimension_out = get_global_size(1);
const int v = get_global_id(2);
// total variable count: must be the global SIZE of axis 2 (the former
// get_global_id(2) made every stride below depend on this item's own id)
const int variables = get_global_size(2);
//---
const int weight_shift = (v * dimension_out + d_out) * dimension_in;
const int input_step = variables * (dimension_in - 2);
const int input_shift = v * (dimension_in - 2) + d_in;
const int output_step = variables * dimension_out;
const int output_shift = v * dimension_out + d_out;
//---
float weight = matrix_w[weight_shift];
float g = 0;
//--- accumulate the gradient over the sequence and the six RK stages
for(int i = 0; i < lenth; i++)
{
int shift_g = i * output_step + output_shift;
int shift_i = i * input_step + input_shift;
switch(dimension_in - d_in)
{
case 1:
// bias column: input is the constant 1
g += matrix_gk1[shift_g] + matrix_gk2[shift_g] + matrix_gk3[shift_g] +
matrix_gk4[shift_g] + matrix_gk5[shift_g] + matrix_gk6[shift_g];
break;
case 2:
// time column: input is the stage coefficient alpha[k]
g += matrix_gk1[shift_g] * alpha[0] + matrix_gk2[shift_g] * alpha[1] +
matrix_gk3[shift_g] * alpha[2] + matrix_gk4[shift_g] * alpha[3] +
matrix_gk5[shift_g] * alpha[4] + matrix_gk6[shift_g] * alpha[5];
break;
default:
// ordinary input column
g += matrix_gk1[shift_g] * matrix_ik1[shift_i] +
matrix_gk2[shift_g] * matrix_ik2[shift_i] +
matrix_gk3[shift_g] * matrix_ik3[shift_i] +
matrix_gk4[shift_g] * matrix_ik4[shift_i] +
matrix_gk5[shift_g] * matrix_ik5[shift_i] +
matrix_gk6[shift_g] * matrix_ik6[shift_i];
break;
}
}
//--- Adam moment updates with L1/L2 regularization
float mt = b1 * matrix_m[weight_shift] + (1 - b1) * g;
float vt = b2 * matrix_v[weight_shift] + (1 - b2) * (g * g);
float delta =
l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
if(fabs(delta) > 0)
matrix_w[weight_shift] =
matrix_w[weight_shift] + delta;
matrix_m[weight_shift] = mt;
matrix_v[weight_shift] = vt;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Estimate the time derivatives of the Query and Key vectors with a
//| central difference over neighbouring sequence positions (one-sided at
//| the borders). One work-item per (position, variable, head); elements
//| where no neighbour exists are left untouched.
__kernel void TimeDerivative(__global float *qkv, __global float *dqkv,
int dimension)
{
const size_t pos = get_global_id(0);
const size_t variable = get_global_id(1);
const size_t head = get_global_id(2);
const size_t total = get_global_size(0);
const size_t variables = get_global_size(1);
const size_t heads = get_global_size(2);
//--- per-position stride of the interleaved Q,K,V tensor
const int stride = 3 * heads * variables * dimension;
const int shift_query =
pos * stride + (3 * variable * heads + head) * dimension;
const int shift_key = shift_query + heads * dimension;
//--- comp 0 -> dQ/dt, comp 1 -> dK/dt (same finite-difference scheme)
for(int comp = 0; comp < 2; comp++)
{
const int base = (comp == 0 ? shift_query : shift_key);
for(int i = 0; i < dimension; i++)
{
const float value = qkv[base + i];
float delta = 0;
int count = 0;
if(pos > 0)
{
delta = value - qkv[base + i - stride];
count++;
}
if(pos < (total - 1))
{
delta += qkv[base + i + stride] - value;
count++;
}
// average of the available one-sided differences
if(count > 0)
dqkv[base + i] = delta / count;
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| Back-propagate gradients through the central-difference time
//| derivative computed by TimeDerivative: each position combines the
//| derivative gradients of its neighbours and adds the result to the
//| Query/Key gradients in place.
__kernel void HiddenGradientTimeDerivative(__global float *qkv_g,
__global float *dqkv_g,
int dimension)
{
const size_t pos = get_global_id(0);
const size_t variable = get_global_id(1);
const size_t head = get_global_id(2);
const size_t total = get_global_size(0);
const size_t variables = get_global_size(1);
const size_t heads = get_global_size(2);
//---
// per-position stride of the interleaved Q,K,V gradient tensor
const int shift = 3 * heads * variables * dimension;
const int shift_query =
pos * shift + (3 * variable * heads + head) * dimension;
const int shift_key = shift_query + heads * dimension;
//---
//---
for(int i = 0; i < dimension; i++)
{
//--- dQ/dt
{
int count = 0;
float grad = 0;
float current = dqkv_g[shift_query + i];
if(pos > 0)
{
grad += current - dqkv_g[shift_query + i - shift];
count++;
}
if(pos < (total - 1))
{
grad += dqkv_g[shift_query + i + shift] - current;
count++;
}
if(count > 0)
grad /= count;
qkv_g[shift_query + i] += grad;
}
//--- dK/dt
{
int count = 0;
float grad = 0;
float current = dqkv_g[shift_key + i];
if(pos > 0)
{
grad += current - dqkv_g[shift_key + i - shift];
count++;
}
if(pos < (total - 1))
{
grad += dqkv_g[shift_key + i + shift] - current;
count++;
}
if(count > 0)
grad /= count;
// NOTE(review): the Key branch adds dqkv_g itself on top of the
// averaged difference while the Query branch adds only `grad` --
// verify whether the extra term is intended.
qkv_g[shift_key + i] += dqkv_g[shift_key + i] + grad;
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//| "Continuous" attention forward pass using the time-derivative tensor
//| dqkv: the raw score of a (query, key) pair is d/dt of their dot
//| product, Q.dK + K.dQ. One work-item per (query, key, variable); the
//| key work-items cooperate in a local-memory soft-max reduction and
//| then share the value-sum loop. All barriers are executed uniformly
//| because the head loop bounds are the same for every item.
__kernel void FeedForwardContAtt(__global float *qkv, __global float *dqkv,
__global float *score, __global float *out,
int dimension,
int heads)
{
const size_t query = get_global_id(0);
const size_t key = get_global_id(1);
const size_t variable = get_global_id(2);
const size_t queris = get_global_size(0);
const size_t keis = get_global_size(1);
const size_t variables = get_global_size(2);
//---
const uint ls_score = min((uint)keis, (uint)LOCAL_ARRAY_SIZE);
__local float local_score[LOCAL_ARRAY_SIZE];
//---
//---
for(int head = 0; head < heads; head++)
{
// per-position stride of the interleaved Q,K,V tensor
const int shift = 3 * heads * variables * dimension;
const int shift_query =
query * shift + (3 * variable * heads + head) * dimension;
const int shift_key =
key * shift + (3 * variable * heads + heads + head) * dimension;
const int shift_out =
dimension * (heads * (query * variables + variable) + head);
int shift_score = keis * (heads * (query * variables + variable) + head) + key;
//--- Score: scaled d/dt of the Q.K dot product, exp-clipped at 30
float scr = 0;
for(int d = 0; d < dimension; d++)
scr += qkv[shift_query + d] * dqkv[shift_key + d] +
qkv[shift_key + d] * dqkv[shift_query + d];
scr = exp(min(scr / sqrt((float)dimension), 30.0f));
score[shift_score] = scr;
BarrierLoc
//--- fold the tail keys into the first ls_score local slots
if(key < ls_score)
{
local_score[key] = scr;
for(int k = ls_score + key; k < keis; k += ls_score)
local_score[key] += score[shift_score + k];
}
BarrierLoc
//--- pairwise tree reduction -> soft-max denominator in local_score[0]
int count = ls_score;
do
{
count = (count + 1) / 2;
if(key < count)
{
if((key + count) < keis)
{
local_score[key] += local_score[key + count];
local_score[key + count] = 0;
}
}
BarrierLoc
}
while(count > 1);
//--- normalize this key's score
score[shift_score] /= local_score[0];
BarrierLoc
//--- weighted value sum; the key work-items share the dimension loop
shift_score -= key;
for(int d = key; d < dimension; d += keis)
{
float sum = 0;
int shift_value = (3 * variable * heads + 2 * heads + head) * dimension + d;
for(int v = 0; v < keis; v++)
sum += qkv[shift_value + v * shift] * score[shift_score + v];
out[shift_out + d] = sum;
}
BarrierLoc
}
//---
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of continuous attention: propagates output gradients to
//--- the Value, Query and Key blocks of the concatenated QKV tensor and to
//--- the derivative tensor (dqkv_g), including the softmax Jacobian.
__kernel void HiddenGradientContAtt(__global float *qkv, __global float *qkv_g,
                                    __global float *dqkv,
                                    __global float *dqkv_g,
                                    __global float *score,
                                    __global float *out_g, const int dimension)
  {
   const size_t pos = get_global_id(0);        // time step
   const size_t variable = get_global_id(1);   // variable (channel)
   const size_t head = get_global_id(2);       // attention head
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
   const size_t heads = get_global_size(2);
//--- Value gradient: dV = sum over queries of (out_grad * score)
     {
      const int shift_value =
         dimension * (heads * (3 * variables * pos + 3 * variable + 2) + head);
      const int shift_out = dimension * (head + variable * heads);
      const int shift_score = total * (variable * heads + head);
      const int step_out = variables * heads * dimension;
      const int step_score = variables * heads * total;
      //--- NOTE(review): score[shift_score + g * step_score] has no '+ pos'
      //--- term, i.e. every work-item reads key column 0 -- verify the
      //--- intended score addressing for the Value gradient
      //---
      for(int d = 0; d < dimension; d++)
        {
         float sum = 0;
         for(int g = 0; g < total; g++)
            sum += out_g[shift_out + g * step_out + d] *
                   score[shift_score + g * step_score];
         qkv_g[shift_value + d] = sum;
        }
     }
//--- Query gradient: softmax Jacobian times <V, out_grad>, then spread
//--- onto Q (into dqkv_g) and dQ (into qkv_g) via keys K / dK
     {
      const int shift_out =
         dimension * (heads * (pos * variables + variable) + head);
      const int step = 3 * variables * heads * dimension;
      const int shift_query =
         dimension * (3 * heads * variable + head) + pos * step;
      const int shift_key = dimension * (heads * (3 * variable + 1) + head);
      const int shift_value = dimension * (heads * (3 * variable + 2) + head);
      const int shift_score =
         total * (heads * (pos * variables + variable) + head);
      //--- Score gradient
      //---
      for(int k = 0; k < total; k++)
        {
         float score_grad = 0;
         float scr = score[shift_score + k];
         for(int v = 0; v < total; v++)
           {
            float grad = 0;
            for(int d = 0; d < dimension; d++)
               grad += qkv[shift_value + v * step + d] * out_g[shift_out + d];
            //--- NOTE(review): the Kronecker delta of the softmax Jacobian is
            //--- written as (pos == v); the usual form would be (k == v) --
            //--- confirm against the forward definition
            score_grad += score[shift_score + v] * grad * ((float)(pos == v) - scr);
           }
         score_grad /= sqrt((float)dimension);
         //--- Query gradient: first key initializes, the rest accumulate
         for(int d = 0; d < dimension; d++)
           {
            if(k == 0)
              {
               dqkv_g[shift_query + d] = score_grad * qkv[shift_key + k * step + d];
               qkv_g[shift_query + d] = score_grad * dqkv[shift_key + k * step + d];
              }
            else
              {
               dqkv_g[shift_query + d] += score_grad * qkv[shift_key + k * step + d];
               qkv_g[shift_query + d] += score_grad * dqkv[shift_key + k * step + d];
              }
           }
        }
     }
//--- Key gradient: symmetric to the Query case, iterating over queries
     {
      const int shift_key =
         dimension * (heads * (3 * variables * pos + 3 * variable + 1) + head);
      const int shift_out = dimension * (head + variable * heads);
      const int step_out = variables * heads * dimension;
      const int step = 3 * variables * heads * dimension;
      const int shift_query = dimension * (3 * heads * variable + head);
      const int shift_value =
         dimension * (heads * (3 * variable + 2) + head) + pos * step;
      const int shift_score = total * (heads * variable + head);
      const int step_score = variables * heads * total;
      //--- Score gradient
      //---
      for(int q = 0; q < total; q++)
        {
         float score_grad = 0;
         float scr = score[shift_score + q * step_score];
         for(int g = 0; g < total; g++)
           {
            float grad = 0;
            for(int d = 0; d < dimension; d++)
               grad += qkv[shift_value + d] * out_g[shift_out + d + g * step_out] / sqrt((float)dimension);
            //--- NOTE(review): delta term (q == pos) vs summation index g --
            //--- the standard softmax Jacobian would use (g == pos); verify
            score_grad += score[shift_score + q * step_score + g] * grad * ((float)(q == pos) - scr);
           }
         //--- Key gradient: first query initializes, the rest accumulate
         for(int d = 0; d < dimension; d++)
           {
            if(q == 0)
              {
               dqkv_g[shift_key + d] = qkv[shift_query + q * step + d] * score_grad;
               qkv_g[shift_key + d] = score_grad * dqkv[shift_query + q * step + d];
              }
            else
              {
               qkv_g[shift_key + d] += score_grad * dqkv[shift_query + q * step + d];
               dqkv_g[shift_key + d] += score_grad * qkv[shift_query + q * step + d];
              }
           }
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Reversible instance-normalization feed-forward: restores the original
//--- scale of a normalized value using the statistics stored in 'options'
//--- (layout: 7 floats per feature for SGD, 9 for Adam).
__kernel void RevInFeedForward(__global float *inputs, __global float *options,
                               __global float *output, int options_size,
                               int optimization)
  {
   const int n = get_global_id(0);
   const int opt_stride = (optimization == 0 ? 7 : 9);
   const int shift = (n * opt_stride) % options_size;
//--- stored per-feature statistics
   const float mean = options[shift];
   const float variance = options[shift + 1];
   const float k = options[shift + 3];
   const float norm_mean = options[shift + 4];
//--- de-normalize: std * (x - norm_mean) / k + mean, guarding k against zero
   float value = sqrt(variance) * (inputs[n] - norm_mean) / fmax(k, 0.001f) + mean;
   if(isnan(value))
      value = 0;
//---
   output[n] = value;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Gradient of the reversible instance-normalization layer: scales the
//--- output gradient by the stored std, undoes the 'k' scaling when it
//--- amplified the signal, and applies the activation derivative.
__kernel void RevInHiddenGraddient(__global float *inputs, __global float *inputs_gr,
                                   __global float *options, __global float *output_gr,
                                   int options_size,
                                   int optimization,
                                   int activation)
  {
   const int n = get_global_id(0);
   const int shift = (n * (optimization == 0 ? 7 : 9)) % options_size;
//--- stored per-feature statistics
   const float variance = options[shift + 1];
   const float k = options[shift + 3];
//--- gradient through the de-normalization
   float grad = output_gr[n] * sqrt(variance);
   if(fabs(k) > 1)
      grad /= k;
   if(isnan(grad))
      grad = 0;
//--- derivative of the activation at the stored input
   inputs_gr[n] = Deactivation(grad, inputs[n], activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Element-wise activation: scrubs NaN/Inf from the input and applies the
//--- activation function selected by 'activation'.
__kernel void Activation(__global const float *inputs,
                         __global float *outputs,
                         const int activation)
  {
   const int n = get_global_id(0);
   const float value = IsNaNOrInf(inputs[n], 0);
   outputs[n] = fActivation(value, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Element-wise activation backward pass: scrubs NaN/Inf from the output
//--- gradient and multiplies it by the activation derivative at the input.
__kernel void DeActivation(__global const float *inputs, __global float *inputs_gr,
                           __global const float *output_gr, const int activation)
  {
   const int n = get_global_id(0);
   const float grad = IsNaNOrInf(output_gr[n], 0);
   inputs_gr[n] = Deactivation(grad, inputs[n], activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Patch embedding forward pass: one work-item computes one output feature
//--- of one patch for one variable. Weights layout: (window_in + 1) values
//--- per output feature, the last one being the bias.
__kernel void PatchCreate(__global float *inputs,
                          __global float *weights,
                          __global float *outputs,
                          int inputs_total,
                          int window_in,
                          int step,
                          int activation
                         )
  {
   const int i = get_global_id(0);                 // patch index
   const int w = get_global_id(1);                 // output feature
   const int v = get_global_id(2);                 // variable (channel)
   const int window_out = get_global_size(1);
   const int variables = get_global_size(2);
//--- offsets: interleaved-by-variable input, per-patch output, filter row
   const int shift_in = i * step * variables + v;
   const int shift_out = (i * variables + v) * window_out + w;
   const int shift_weights = (window_in + 1) * (v * window_out + w);
//--- start from the bias, then accumulate the in-bounds part of the window
   float sum = weights[shift_weights + window_in];
   for(int p = 0; p < window_in; p++)
     {
      const int idx = shift_in + p * variables;
      if(idx < inputs_total)
         sum += inputs[idx] * weights[shift_weights + p];
     }
   if(isnan(sum))
      sum = 0;
//---
   outputs[shift_out] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of the patch-embedding layer: gathers, for input element
//--- (i, v), the gradients of every patch/output-feature pair whose window
//--- covered it, weighted by the corresponding filter coefficient.
__kernel void PatchHiddenGradient(__global float *inputs,
                                  __global float *inputs_gr,
                                  __global float *weights,
                                  __global float *outputs_gr,
                                  int window_in,
                                  int step,
                                  int window_out,
                                  int outputs_total,
                                  int activation
                                 )
  {
   const int i = get_global_id(0);              // input time position
   const int v = get_global_id(1);              // variable (channel)
   const int variables = get_global_size(1);
//--- position of element i inside the first covering window, first covering
//--- patch row, and number of patches whose window contains element i
   const int w_start = i % step;
   const int r_start = max((i - window_in + step) / step, 0);
   int total = (window_in - w_start + step - 1) / step;
   total = min((i + step) / step, total);
//---
   float grad = 0;
//--- accumulate gradient over covering patches and all output features
   for(int p = 0; p < total; p ++)
     {
      int row = r_start + p;
      if(row >= outputs_total)
         break;
      for(int wo = 0; wo < window_out; wo++)
        {
         int shift_g = (row * variables + v) * window_out + wo;
         //--- filter coefficient that multiplied element i in patch 'row'
         int shift_w = v * (window_in + 1) * window_out + w_start + (total - p - 1) * step + wo * (window_in + 1);
         grad += outputs_gr[shift_g] * weights[shift_w];
        }
     }
//--- apply the activation derivative at the stored input value
   float inp = inputs[i * variables + v];
//---
   inputs_gr[i * variables + v] = Deactivation(grad, inp, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Adam update of one patch-embedding weight. One work-item handles one
//--- (filter column c, output feature r, variable v) cell; c == window_in
//--- addresses the bias. Applies L1/L2 regularization in the weight step.
__kernel void PatchUpdateWeightsAdam(__global float *weights,
                                     __global const float *outputs_gr,
                                     __global const float *inputs,
                                     __global float *weights_m,
                                     __global float *weights_v,
                                     const int inputs_total,
                                     const float l,
                                     const float b1,
                                     const float b2,
                                     int step
                                    )
  {
   const int c = get_global_id(0);                 // filter column (window_in => bias)
   const int r = get_global_id(1);                 // output feature
   const int v = get_global_id(2);                 // variable (channel)
   const int window_in = get_global_size(0) - 1;
   const int window_out = get_global_size(1);
   const int variables = get_global_size(2);
//--- per-patch strides through the input and gradient buffers
   const int start_input = c * variables + v;
   const int step_input = step * variables;
   const int start_out = v * window_out + r;
   const int step_out = variables * window_out;
   const int total = inputs_total / (variables * step);
//--- accumulate the weight gradient over all patches.
//--- FIX: the previous version declared 'int i = start_input + i * step_input'
//--- (reading the uninitialized 'i' in its own initializer -- undefined
//--- behavior) and always read outputs_gr[0]; both indices now advance with
//--- the patch counter p, and input reads are bounds-checked.
   float grad = 0;
   for(int p = 0; p < total; p++)
     {
      int i = start_input + p * step_input;
      int o = start_out + p * step_out;
      grad += (c == window_in ? 1 : (i < inputs_total ? inputs[i] : 0)) * outputs_gr[o];
     }
   if(isnan(grad))
      grad = 0;
//--- Adam first/second moments and regularized weight step
   const int shift_weights = (window_in + 1) * (window_out * v + r) + c;
//---
   float weight = weights[shift_weights];
   float mt = b1 * weights_m[shift_weights] + (1 - b1) * grad;
   float vt = b2 * weights_v[shift_weights] + (1 - b2) * (grad * grad);
   float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(fabs(delta) > 0)
      weights[shift_weights] = weight + delta;
   weights_m[shift_weights] = mt;
   weights_v[shift_weights] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Batched matrix multiplication: result[var] = matr1[var] x matr2[var'],
//--- where var' = var when multvarsecond != 0 and 0 otherwise (shared second
//--- operand). One work-item computes one output cell.
__kernel void MatMult(__global const float *matr1,
                      __global const float *matr2,
                      __global float *result,
                      int dimension,
                      int multvarsecond)
  {
   const size_t row = get_global_id(0);
   const size_t col = get_global_id(1);
   const size_t var = get_global_id(2);
   const size_t rows = get_global_size(0);
   const size_t cols = get_global_size(1);
//--- flat offsets of matr1's row, matr2's column and the output cell
   const int shift1 = RCtoFlat(row, 0, rows, dimension, var);
   const int shift2 = RCtoFlat(0, col, dimension, cols, multvarsecond * var);
   const int shift_out = RCtoFlat(row, col, rows, cols, var);
//--- dot product, scrubbing NaN/Inf from each partial product
   float sum = 0;
   for(int i = 0; i < dimension; i++)
      sum += IsNaNOrInf(matr1[shift1 + i] * matr2[shift2 + i * cols], 0);
//---
   result[shift_out] = sum;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of the batched matrix multiplication: computes
//--- matr1_gr = result_gr x matr2^T and matr2_gr = matr1^T x result_gr.
//--- Work-items stride over 'dimension' using cols/rows as the stride.
__kernel void MatMultGrad(__global const float *matr1,
                          __global float *matr1_gr,
                          __global const float *matr2,
                          __global float *matr2_gr,
                          __global const float *result_gr,
                          int dimension,
                          int multvarsecond)
  {
   size_t row = get_global_id(0);
   size_t col = get_global_id(1);
   size_t var = get_global_id(2);
   size_t rows = get_global_size(0);
   size_t cols = get_global_size(1);
//--- flat offsets (multvarsecond selects shared vs per-var second operand)
   int shift1 = (row + var * rows) * dimension;
   int shift2 = var * dimension * cols * multvarsecond;
   int shift_out = (row + var * rows) * cols;
//--- gradient w.r.t. matr1: row of result_gr dotted with rows of matr2
//--- NOTE(review): the write index 'shift1 + c' does not include 'col',
//--- although the loop strides by 'cols' -- verify each work-item covers a
//--- distinct column of matr1_gr as intended
   for(int c = 0; c < dimension; c += cols)
     {
      if((c + col) >= dimension)
         continue;
      float grad = 0;
      for(int i = 0; i < cols; i++)
         grad += IsNaNOrInf(result_gr[shift_out + i] * matr2[shift2 + c * cols + i], 0);
      matr1_gr[shift1 + c] = IsNaNOrInf(grad, 0);
     }
//--- gradient w.r.t. matr2: column of result_gr dotted with columns of matr1
   shift_out = var * rows * cols + col;
//---
   for(int r = 0; r < dimension; r += rows)
     {
      if((r + row) >= dimension)
         continue;
      shift1 = var * rows * dimension + r;
      float grad = 0;
      for(int i = 0; i < rows; i++)
         grad += IsNaNOrInf(result_gr[shift_out + i * cols] * matr1[shift1 + i * dimension], 0);
      matr2_gr[shift2 + col + r * cols] = IsNaNOrInf(grad, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- In-place radix-2 Cooley-Tukey FFT over one variable's window.
//--- Each work-item transforms its own slice of the buffers; output_window
//--- must be a power of two. When 'reverse' is set the kernel finalizes an
//--- inverse transform (1/N scaling and spectrum reordering).
__kernel void FFT(__global float *inputs_re,
                  __global float *inputs_im,
                  __global float *outputs_re,
                  __global float *outputs_im,
                  const int input_window,
                  const int input_complex,
                  const int output_window,
                  const int reverse
                 )
  {
   size_t variable = get_global_id(0);
//---
   const ulong N = output_window;        // transform length (power of two)
   const ulong N2 = N / 2;
   const ulong inp_shift = input_window * variable;
   const ulong out_shift = output_window * variable;
//--- bit-reversal permutation copy input -> output, zero-padding past
//--- input_window; imaginary part is zero unless the input is complex
   uint target = 0;
   for(uint position = 0; position < N; position++)
     {
      if(target > position)
        {
         outputs_re[out_shift + position] = (target < input_window ? inputs_re[inp_shift + target] : 0);
         outputs_im[out_shift + position] = ((target < input_window && input_complex) ? inputs_im[inp_shift + target] : 0);
         outputs_re[out_shift + target] = inputs_re[inp_shift + position];
         outputs_im[out_shift + target] = (input_complex ? inputs_im[inp_shift + position] : 0);
        }
      else
        {
         outputs_re[out_shift + position] = inputs_re[inp_shift + position];
         outputs_im[out_shift + position] = (input_complex ? inputs_im[inp_shift + position] : 0);
        }
      //--- advance 'target' as a bit-reversed counter
      unsigned int mask = N;
      while(target & (mask >>= 1))
         target &= ~mask;
      target |= mask;
     }
   float real = 0, imag = 0;
//--- butterfly stages with incrementally rotated twiddle factors
   for(int len = 2; len <= (int)N; len <<= 1)
     {
      float w_real = (float)cos(2 * M_PI_F / len);
      float w_imag = (float)sin(2 * M_PI_F / len);
      for(int i = 0; i < (int)N; i += len)
        {
         float cur_w_real = 1;
         float cur_w_imag = 0;
         for(int j = 0; j < len / 2; j++)
           {
            real = cur_w_real * outputs_re[out_shift + i + j + len / 2] - cur_w_imag * outputs_im[out_shift + i + j + len / 2];
            imag = cur_w_imag * outputs_re[out_shift + i + j + len / 2] + cur_w_real * outputs_im[out_shift + i + j + len / 2];
            outputs_re[out_shift + i + j + len / 2] = outputs_re[out_shift + i + j] - real;
            outputs_im[out_shift + i + j + len / 2] = outputs_im[out_shift + i + j] - imag;
            outputs_re[out_shift + i + j] += real;
            outputs_im[out_shift + i + j] += imag;
            real = cur_w_real * w_real - cur_w_imag * w_imag;
            cur_w_imag = cur_w_imag * w_real + cur_w_real * w_imag;
            cur_w_real = real;
           }
        }
     }
//--- inverse-transform finalization: scale by 1/N and reverse the spectrum.
//--- FIX: this section previously indexed the buffers without 'out_shift',
//--- so every work-item with variable > 0 normalized variable 0's data
//--- (wrong results and a write race); all indices now carry out_shift.
   if(reverse)
     {
      outputs_re[out_shift] /= N;
      outputs_im[out_shift] /= N;
      outputs_re[out_shift + N2] /= N;
      outputs_im[out_shift + N2] /= N;
      //---
      for(int i = 1; i < (int)N2; i++)
        {
         real = outputs_re[out_shift + i] / N;
         imag = outputs_im[out_shift + i] / N;
         outputs_re[out_shift + i] = outputs_re[out_shift + N - i] / N;
         outputs_im[out_shift + i] = outputs_im[out_shift + N - i] / N;
         outputs_re[out_shift + N - i] = real;
         outputs_im[out_shift + N - i] = imag;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Forward pass of the complex mixing layer: treats the two buffers as the
//--- real and imaginary parts and outputs (re - im, re + im) element-wise.
__kernel void ComplexLayer(__global float *inputs_re,
                           __global float *inputs_im,
                           __global float *outputs_re,
                           __global float *outputs_im
                          )
  {
   const size_t r = get_global_id(0);
   const size_t c = get_global_id(1);
   const size_t cols = get_global_size(1);
   const uint idx = r * cols + c;
//---
   const float re = inputs_re[idx];
   const float im = inputs_im[idx];
   outputs_re[idx] = re - im;
   outputs_im[idx] = re + im;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of the complex mixing layer: transposed mapping of the
//--- forward pass, (g_re, g_im) -> (g_re + g_im, g_im - g_re) element-wise.
__kernel void ComplexLayerGradient(__global float *inputs_re,
                                   __global float *inputs_im,
                                   __global float *outputs_re,
                                   __global float *outputs_im
                                  )
  {
   const size_t r = get_global_id(0);
   const size_t c = get_global_id(1);
   const size_t cols = get_global_size(1);
   const uint idx = r * cols + c;
//---
   const float g_re = outputs_re[idx];
   const float g_im = outputs_im[idx];
   inputs_re[idx] = g_re + g_im;
   inputs_im[idx] = g_im - g_re;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- MSE-style output gradient: error = target - forecast, element-wise.
__kernel void GradientMSA(__global float *matrix_t, ///<[in] Target tensor
                          __global float *matrix_o, ///<[in] Forecast tensor
                          __global float *matrix_g  ///<[out] Tensor of gradients
                         )
  {
   const int idx = get_global_id(0);
   matrix_g[idx] = matrix_t[idx] - matrix_o[idx];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Convex blend of two gradient streams: alpha * g1 + (1 - alpha) * g2.
__kernel void CumulativeGradient(__global float *gradient1,
                                 __global float *gradient2,
                                 __global float *gradient_out,
                                 float alpha
                                )
  {
   const int idx = get_global_id(0);
   const float g1 = gradient1[idx];
   const float g2 = gradient2[idx];
   gradient_out[idx] = alpha * g1 + (1 - alpha) * g2;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex multiplication (a.x + i*a.y) * (b.x + i*b.y) with NaN/Inf
//--- scrubbing of each resulting component.
inline float2 ComplexMul(const float2 a, const float2 b)
  {
   const float re = IsNaNOrInf(a.x * b.x - a.y * b.y, 0);
   const float im = IsNaNOrInf(a.x * b.y + a.y * b.x, 0);
   return (float2)(re, im);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex division a / b via the conjugate; returns (0, 0) when the
//--- denominator magnitude is zero. Each numerator term is NaN/Inf-scrubbed.
inline float2 ComplexDiv(const float2 a, const float2 b)
  {
   float2 result = 0;
   const float denom = IsNaNOrInf(b.x * b.x + b.y * b.y, 1);
   if(denom > 0)
     {
      result.x = IsNaNOrInf(a.x * b.x + a.y * b.y, 0) / denom;
      result.y = IsNaNOrInf(a.y * b.x - a.x * b.y, 0) / denom;
     }
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Magnitude of a complex number, with NaN/Inf scrubbing of |a|^2.
inline float ComplexAbs(float2 a)
  {
   const float mag2 = IsNaNOrInf(a.x * a.x + a.y * a.y, 0);
   return sqrt(mag2);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Principal complex square root via the half-angle identities:
//--- re = sqrt((|a| + Re a)/2), im = sign(Im a) * sqrt((|a| - Re a)/2).
inline float2 ComplexSqrt(float2 a)
  {
   const float r = ComplexAbs(a);
   const float re = IsNaNOrInf(a.x, 0);
   float2 result;
   result.x = sqrt((r + re) / 2);
   result.y = sqrt((r - re) / 2);
   //--- the imaginary part carries the sign of the input's imaginary part
   if(a.y < 0)
      result.y = -result.y;
//---
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex exponential: exp(a.x) * (cos(a.y) + i*sin(a.y)), with the real
//--- exponent clamped to [-20, 20] to keep the magnitude finite.
inline float2 ComplexExp(float2 a)
  {
   const float mag = exp(clamp(IsNaNOrInf(a.x, 0), -20.0f, 20.0f));
   float2 result;
   result.x = mag * IsNaNOrInf(cos(a.y), 0);
   result.y = mag * IsNaNOrInf(sin(a.y), 0);
   return result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex hyperbolic tangent: tanh(a) = sinh(a) / cosh(a), expanded via
//--- sinh(x+iy) = sinh x cos y + i cosh x sin y and
//--- cosh(x+iy) = cosh x cos y + i sinh x sin y.
inline float2 ComplexTanh(float2 a)
  {
   const float sh = sinh(a.x);
   const float ch = cosh(a.x);
   const float si = sin(a.y);
   const float co = cos(a.y);
//---
   const float2 num = (float2)(sh * co, ch * si);   // sinh(a)
   const float2 den = (float2)(ch * co, sh * si);   // cosh(a)
//---
   return ComplexDiv(num, den);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex-valued 1D convolution forward pass. Weights layout: per output
//--- feature, window_in coefficients followed by one bias term. Activation
//--- codes: 0 = complex tanh, 1 = complex sigmoid, 2 = leaky ReLU gated by
//--- the real part, anything else = identity.
__kernel void FeedForwardComplexConv(__global const float2* __attribute__((aligned(8))) matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input
      ///< window and n - output window
      __global const float2* __attribute__((aligned(8))) matrix_i, ///<[in] Inputs tensor
      __global float2* __attribute__((aligned(8))) matrix_o, ///<[out] Output tensor
      const int inputs, ///< Number of inputs
      const int step, ///< Step size
      const int window_in, ///< Size of input window
      const int activation ///< Activation type (#ENUM_ACTIVATION)
                                    )
  {
   const size_t i = get_global_id(0);
   const size_t units = get_global_size(0);
   const size_t out = get_global_id(1);
   const size_t w_out = get_global_size(1);
   const size_t var = get_global_id(2);
   const size_t variables = get_global_size(2);
//--- offsets of the output row, the filter row and the input window
   const int shift_out = w_out * (i + units * var);
   const int shift_w = (window_in + 1) * (out + var * w_out);
   int shift_in = step * i;
   const int stop = min(window_in, inputs - shift_in);
   shift_in += inputs * var;
//--- bias term (the trailing weight of the filter)
   float2 sum = ComplexMul((float2)(1, 0), matrix_w[shift_w + window_in]);
//--- convolution over the in-bounds part of the window
   for(int k = 0; k < stop; k++)
      sum += IsNaNOrInf2(ComplexMul(matrix_i[shift_in + k], matrix_w[shift_w + k]), (float2)0);
//--- activation
   switch(activation)
     {
      case 0:
         sum = ComplexTanh(sum);
         break;
      case 1:
         sum = ComplexDiv((float2)(1, 0), (float2)(1, 0) + ComplexExp(-sum));
         break;
      case 2:
         if(sum.x < 0)
           {
            sum.x *= 0.01f;
            sum.y *= 0.01f;
           }
         break;
      default:
         break;
     }
   matrix_o[out + shift_out] = sum;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of the complex 1D convolution: one work-item gathers the
//--- gradient for one input element across all covering windows and output
//--- features, then applies the complex activation derivative and clamps.
__kernel void CalcHiddenGradientComplexConv(__global const float2* __attribute__((aligned(8))) matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input
      ///< window and n - output window
      __global const float2* __attribute__((aligned(8))) matrix_g, ///<[in] Tensor of gradients at current layer
      __global const float2* __attribute__((aligned(8))) matrix_o, ///<[in] Output tensor
      __global float2* __attribute__((aligned(8))) matrix_ig, ///<[out] Tensor of gradients at previous layer
      const int outputs, ///< Number of outputs
      const int step, ///< Step size
      const int window_in, ///< Size of input window
      const int window_out, ///< Size of output window
      const int activation, ///< Activation type (#ENUM_ACTIVATION)
      const int shift_out ///< Shift in output and gradient buffer
                                           )
  {
   const size_t i = get_global_id(0);           // input element index
   const size_t inputs = get_global_size(0);
   const size_t var = get_global_id(1);         // variable (channel)
   const size_t variables = get_global_size(1);
//---
   float2 sum = (float2)0;
   float2 out = matrix_o[i];
//--- range of convolution windows that covered input element i
//--- NOTE(review): 'start' is offset by var * inputs while 'stop' is offset
//--- by var * outputs -- the two bounds live on different scales; verify the
//--- addressing for var > 0 (matrix_ig[i] below also ignores var)
   int start = i - window_in + step;
   start = max((start - start % step) / step, 0) + var * inputs;
   int stop = (i + step - 1) / step;
   if(stop > (outputs / window_out))
      stop = outputs / window_out;
   stop += var * outputs;
//--- accumulate gradient over output features and covering windows
   for(int h = 0; h < window_out; h ++)
     {
      for(int k = start; k < stop; k++)
        {
         int shift_g = k * window_out + h;
         int shift_w = (stop - k - 1) * step + i % step + h * (window_in + 1);
         if(shift_g >= outputs || shift_w >= (window_in + 1) * window_out)
            break;
         sum += ComplexMul(matrix_g[shift_out + shift_g], matrix_w[shift_w]);
        }
     }
   sum = IsNaNOrInf2(sum, (float2)0);
//--- complex activation derivatives: 0 = tanh (1 - out^2),
//--- 1 = sigmoid (out * (1 - out)), 2 = leaky ReLU gated by the real part
   switch(activation)
     {
      case 0:
         sum = ComplexMul(sum, (float2)1.0f - ComplexMul(out, out));
         break;
      case 1:
         sum = ComplexMul(sum, ComplexMul(out, (float2)1.0f - out));
         break;
      case 2:
         if(out.x < 0.0f)
           {
            sum.x *= 0.01f;
            sum.y *= 0.01f;
           }
         break;
      default:
         break;
     }
//--- clamp to keep gradients bounded
   matrix_ig[i] = clamp(sum, (float2)(-MAX_GRAD), (float2)MAX_GRAD);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Momentum (SGD) update of one complex convolution weight. One work-item
//--- handles one weight cell of one variable; the cell with
//--- shift == window_in inside the filter row is the bias.
__kernel void UpdateWeightsComplexConvMomentum(__global float2* __attribute__((aligned(8))) matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
      ///< input window and n - output window
      __global float2* __attribute__((aligned(8))) matrix_g, ///<[in] Tensor of gradients at current layer
      __global float2* __attribute__((aligned(8))) matrix_i, ///<[in] Inputs tensor
      __global float2* __attribute__((aligned(8))) matrix_dw, ///<[in,out] Matrix of delta weights in last correction
      int inputs, ///< Number of inputs
      float learning_rates, ///< Learning rates
      float momentum, ///< Momentum multiplier
      int window_in, ///< Size of input window
      int window_out, ///< Size of output window
      int step ///< Step size
                                              )
  {
   const size_t i = get_global_id(0);           // flat weight index
   const size_t total_w = get_global_size(0);
   const size_t var = get_global_id(1);         // variable (channel)
   const size_t variables = get_global_size(1);
//--- position inside the filter row and owning output feature
   const int shift = i % (window_in + 1);
   int shift_out = (i - shift) / (window_in + 1);
//--- number of convolution windows (ceil of (inputs - window_in) / step)
   int total = (inputs - window_in) % step;
   total = (inputs - window_in - total) / step + (total > 0 ? 1 : 0);
   shift_out += total * window_out * var;
   float2 grad = 0;
//--- accumulate the weight gradient over all windows; bias (shift ==
//--- window_in) pairs gradients with (1, 0) instead of an input value
   for(int t = 0; t < total; t++)
     {
      //--- NOTE(review): the bound uses 'shift + t * window_in' while the
      //--- input read below advances by 't * step' -- confirm the intended
      //--- early-exit condition
      if(shift != window_in && (shift + t * window_in) >= inputs)
         break;
      grad += ComplexMul(matrix_g[t * window_out + shift_out],
                         (shift == window_in ? (float2)(1, 0) : matrix_i[inputs * var + shift + t * step]));
     }
//--- momentum step with gradient clamping; skipped entirely on NaN/Inf
   float2 delta = ComplexMul((float2)(learning_rates, 0), clamp(grad, (float2) - MAX_GRAD, (float2)MAX_GRAD)) + ComplexMul((float2)(momentum, 0), matrix_dw[i + total_w * var]);
   if(!(isnan(delta.x) || isnan(delta.y) || isinf(delta.x) || isinf(delta.y)))
     {
      matrix_dw[i + total_w * var] = delta;
      matrix_w[i + total_w * var] = matrix_w[i + total_w * var] + delta;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Adam update of one complex convolution weight. One work-item handles
//--- one weight cell of one variable; shift == window_in marks the bias.
__kernel void UpdateWeightsComplexConvAdam(__global float2* __attribute__((aligned(8))) matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
      ///< input window and n - output window
      __global const float2* __attribute__((aligned(8))) matrix_g, ///<[in] Tensor of gradients at current layer
      __global const float2* __attribute__((aligned(8))) matrix_i, ///<[in] Inputs tensor
      __global float2* __attribute__((aligned(8))) matrix_m, ///<[in] Matrix of first momentum
      __global float2* __attribute__((aligned(8))) matrix_v, ///<[in] Matrix of seconfd momentum
      const int inputs, ///< Number of inputs
      const float l, ///< Learning rates
      const float b1, ///< First momentum multiplier
      const float b2, ///< Second momentum multiplier
      int window_in, ///< Size of input window
      int window_out, ///< Size of output window
      int step ///< Step size
                                          )
  {
   const size_t i = get_global_id(0);           // flat weight index
   const size_t total_w = get_global_size(0);
   const size_t var = get_global_id(1);         // variable (channel)
   const size_t variables = get_global_size(1);
//--- position inside the filter row and owning output feature
   const int shift = i % (window_in + 1);
   int shift_out = (i - shift) / (window_in + 1);
//--- number of convolution windows (ceiling division)
   int total = (inputs - window_in + step - 1) / step;
   shift_out += total * window_out * var;
   const int shift_var_in = var * inputs;
   const int shift_var_out = var * total * window_out;
//---
   float2 grad = 0;
//--- accumulate the weight gradient over all windows
//--- NOTE(review): shift_out already received 'total * window_out * var'
//--- above, and shift_var_out (the same quantity) is added again in the
//--- index below -- confirm the variable offset is not applied twice
//--- (compare with UpdateWeightsComplexConvMomentum, which adds it once)
   for(int t = 0; t < total; t++)
     {
      //--- NOTE(review): bound uses 't * window_in' while the input read
      //--- advances by 't * step' -- same concern as the momentum kernel
      if(shift != window_in && (shift + t * window_in) >= inputs)
         break;
      grad += IsNaNOrInf2(ComplexMul(matrix_g[t * window_out + shift_out + shift_var_out],
                                     (shift == window_in ? (float2)(1, 0) : matrix_i[shift + t * step + shift_var_in])), (float2)0);
     }
//--- Adam moments and weight step (complex arithmetic, NaN/Inf scrubbed)
   grad = clamp(grad, (float2) - MAX_GRAD, (float2)MAX_GRAD);
   float2 mt = IsNaNOrInf2(b1 * matrix_m[i + total_w * var] + (1 - b1) * grad, (float2)0);
   float2 vt = IsNaNOrInf2(b2 * matrix_v[i + total_w * var] + (1 - b2) * ComplexMul(grad, grad), (float2)(1.0e-6f, 0));
   float2 weight = matrix_w[i + total_w * var] + IsNaNOrInf2(l * ComplexDiv(mt, ComplexSqrt(vt)), (float2)0);
   matrix_w[i + total_w * var] = weight;
   matrix_m[i + total_w * var] = mt;
   matrix_v[i + total_w * var] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex softmax over one head's slice of 'total' elements. Inputs are
//--- first scaled by the maximum magnitude (numerical stabilization), then
//--- exponentiated and normalized by the work-group-reduced complex sum.
__kernel void ComplexSoftMax_FeedForward(__global float2* __attribute__((aligned(8))) inputs,
      __global float2* __attribute__((aligned(8))) outputs, const int total)
  {
   const uint i = (uint)get_global_id(0);
   const uint l = (uint)get_local_id(0);
   const uint h = (uint)get_global_id(1);       // head index
   const uint ls = min((uint)get_local_size(0), (uint)LOCAL_ARRAY_SIZE);
   uint shift_head = h * total;
//---
   __local float2 temp[LOCAL_ARRAY_SIZE];
//--- stage 1: per-lane maximum of input magnitudes (stored in temp[l].x)
   uint count = 0;
   if(l < ls)
      do
        {
         uint shift = shift_head + count * ls + l;
         if(shift < ((h + 1) * total))
            temp[l].x = (count > 0 ? fmax(ComplexAbs(inputs[shift]), temp[l].x)
                         : ComplexAbs(inputs[shift]));
         count++;
        }
      while((count * ls + l) < total);
   BarrierLoc
//--- every lane scans the partial maxima; the loop variable shadows the
//--- global id 'i' declared above (intentional or not, it is local here)
   float max_value = temp[0].x;
//---
   for(int i = 1; i < ls; i++)
      max_value = fmax(max_value, temp[i].x);
//--- stage 2: per-lane partial sums of exp(input / max_value)
   count = 0;
   if(l < ls)
      do
        {
         uint shift = shift_head + count * ls + l;
         temp[l] = (count > 0 ? temp[l] : (float2)0) +
                   (shift < ((h + 1) * total) ? ComplexExp(ComplexDiv(inputs[shift], (float2)(max_value, 0))) : (float2)0);
         count++;
        }
      while((count * ls + l) < total);
   BarrierLoc
//--- stage 3: tree reduction of the partial sums into temp[0]
   count = min(ls, (uint)total);
   do
     {
      count = (count + 1) / 2;
      if(l < ls)
         temp[l] += (l < count && (l + count) < total ? temp[l + count] : (float2)0);
      if(l + count < ls)
         temp[l + count] = (float2)0;
      BarrierLoc
     }
   while(count > 1);
//--- stage 4: normalize each element by the total complex sum
   float2 sum = temp[0];
   if(ComplexAbs(sum) > 0)
     {
      count = 0;
      while((count * ls + l) < total)
        {
         uint shift = shift_head + count * ls + l;
         if(shift < ((h + 1) * total))
            outputs[shift] = ComplexDiv(ComplexExp(ComplexDiv(inputs[shift], (float2)(max_value, 0))), sum);
         count++;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of the complex softmax: applies the softmax Jacobian,
//--- input_gr_i = sum_j (out_j * out_gr_j) * (delta_ij - out_i), using
//--- complex arithmetic throughout.
__kernel void ComplexSoftMax_HiddenGradient(__global float2* __attribute__((aligned(8))) outputs,
      __global float2* __attribute__((aligned(8))) output_gr,
      __global float2* __attribute__((aligned(8))) input_gr)
  {
   const size_t i = get_global_id(0);
   const size_t outputs_total = get_global_size(0);
   const size_t h = get_global_id(1);
   const uint shift = h * outputs_total;
//---
   const float2 out_i = outputs[shift + i];
   float2 grad = 0;
   for(int j = 0; j < outputs_total; j++)
     {
      const float2 kron = (i == j ? (float2)(1, 0) : (float2)0);
      grad += ComplexMul(ComplexMul(outputs[shift + j], output_gr[shift + j]), kron - out_i);
     }
   input_gr[shift + i] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Output gradient of the complex softmax under a log-likelihood style
//--- loss: target / output, or zero where the output magnitude is zero.
__kernel void ComplexSoftMax_OutputGradient(__global float2* __attribute__((aligned(8))) outputs,
      __global float2* __attribute__((aligned(8))) targets,
      __global float2* __attribute__((aligned(8))) output_gr)
  {
   const size_t i = get_global_id(0);
   const float2 out = outputs[i];
   output_gr[i] = (ComplexAbs(out) == 0 ? (float2)0 : ComplexDiv(targets[i], out));
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Complex multi-head attention scores: for one (query, head) pair,
//--- computes exp(<Q, K> / sqrt(dim)) for every key and normalizes the row
//--- (complex softmax). With mask > 0, keys after the query are zeroed
//--- (causal masking).
__kernel void ComplexMHAttentionScore(__global float2* __attribute__((aligned(8))) qkv, ///<[in] Matrix of Querys, Keys, Values
      __global float2* __attribute__((aligned(8))) score, ///<[out] Matrix of Scores
      int dimension, ///< Dimension of Key
      int mask ///< 1 - calc only previous units, 0 - calc all
                                     )
  {
   int q = get_global_id(0);        // query index
   int h = get_global_id(1);        // head index
   int units = get_global_size(0);
   int heads = get_global_size(1);
//--- offsets into the concatenated Q|K|V tensor and the score row
   int shift_q = dimension * (h + 3 * q * heads);
   int shift_s = units * (h + q * heads);
//--- scaling factor sqrt(dimension), floored at 1
   float2 koef = (float2)(sqrt((float)dimension), 0);
   if(koef.x < 1)
      koef.x = 1;
   float2 sum = 0;
//--- unnormalized scores (complex exp of the scaled dot product)
   for(int k = 0; k < units; k++)
     {
      //--- causal mask: future keys contribute nothing
      if(mask > 0 && k > q)
        {
         score[shift_s + k] = (float2)0;
         continue;
        }
      float2 result = (float2)0;
      int shift_k = dimension * (h + heads * (3 * k + 1));
      for(int i = 0; i < dimension; i++)
         result += ComplexMul(qkv[shift_q + i], qkv[shift_k + i]);
      result = ComplexExp(ComplexDiv(result, koef));
      if(isnan(result.x) || isnan(result.y) || isinf(result.x) || isinf(result.y))
         result = (float2)0;
      score[shift_s + k] = result;
      sum += result;
     }
//--- normalize the row by the complex sum (skip when the sum is zero)
   if(ComplexAbs(sum) > 0)
     {
      //---
      for(int k = 0; k < units; k++)
         score[shift_s + k] = ComplexDiv(score[shift_s + k], sum);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Complex-valued multi-head attention: output aggregation.         |
//| Work item (u, h) computes out[d] = sum_v score[u][v] * V[v][d]   |
//| for every d of its head's value dimension.                       |
//+------------------------------------------------------------------+
__kernel void ComplexMHAttentionOut(__global float2* __attribute__((aligned(8))) scores,   ///<[in] Matrix of Scores
                                    __global float2* __attribute__((aligned(8))) qkv,      ///<[in] Matrix of Values
                                    __global float2* __attribute__((aligned(8))) out,      ///<[out] Output tensor
                                    int dimension                                          ///< Dimension of Value
                                   )
  {
   const int u = get_global_id(0);
   const int units = get_global_size(0);
   const int h = get_global_id(1);
   const int heads = get_global_size(1);
//--- base offsets of the score row and the output vector
   const int row_s = units * (h + heads * u);
   const int row_o = dimension * (h + heads * u);
//--- weighted complex sum over all value vectors
   for(int d = 0; d < dimension; d++)
     {
      float2 acc = (float2)0;
      for(int v = 0; v < units; v++)
        {
         //--- V block of unit v sits 2 blocks after its Q block
         const int idx_v = dimension * (h + heads * (3 * v + 2)) + d;
         acc += ComplexMul(scores[row_s + v], qkv[idx_v]);
        }
      out[row_o + d] = acc;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Complex-valued multi-head attention: backward pass.              |
//| Work item (u, h, d) writes the gradients of Q, K and V for unit  |
//| u, head h, dimension component d into qkv_g.                     |
//| NOTE(review): in the Query/Key branches the value vector is read |
//| at qkv[dimension * (...)] without the `+ d` offset, and the      |
//| score gradient uses only the d-th component of grad_out instead  |
//| of a sum over all components — verify against the softmax        |
//| attention Jacobian before relying on these gradients.            |
//+------------------------------------------------------------------+
__kernel void ComplexMHAttentionGradients(__global float2* __attribute__((aligned(8))) qkv,
                                          __global float2* __attribute__((aligned(8))) qkv_g,
                                          __global float2* __attribute__((aligned(8))) scores,
                                          __global float2* __attribute__((aligned(8))) gradient
                                         )
  {
   size_t u = get_global_id(0);
   size_t h = get_global_id(1);
   size_t d = get_global_id(2);
   size_t units = get_global_size(0);
   size_t heads = get_global_size(1);
   size_t dimension = get_global_size(2);
//--- score scale divisor sqrt(dimension), clipped from below at 1
   float2 koef = (float2)(sqrt((float)dimension), 0);
   if(koef.x < 1)
      koef.x = 1;
//--- init: bases of the Q, K, V blocks of unit u and of its output gradient
   const int shift_q = dimension * (heads * 3 * u + h);
   const int shift_k = dimension * (heads * (3 * u + 1) + h);
   const int shift_v = dimension * (heads * (3 * u + 2) + h);
   const int shift_g = dimension * (heads * u + h);
   int shift_score = h * units;
   int step_score = units * heads;
//--- Calculating Value's gradients: dV[u] = sum_q grad[q] * score[q][u]
   float2 sum = (float2)0;
//---
   for(int i = 0; i < units; i++)
      sum += ComplexMul(gradient[(h + i * heads) * dimension + d], scores[shift_score + u + i * step_score]);
   qkv_g[shift_v + d] = sum;
//--- Calculating Query's gradients (softmax Jacobian over row u)
   shift_score = h * units + u * step_score;
   float2 grad = 0;
   float2 grad_out = gradient[shift_g + d];
//---
   for(int k = 0; k < units; k++)
     {
      float2 sc_g = (float2)0;
      float2 sc = scores[shift_score + k];
      for(int v = 0; v < units; v++)
         sc_g += ComplexMul(
                    ComplexMul(scores[shift_score + v],
                               ComplexMul(qkv[dimension * (heads * (3 * v + 2) + h)],
                                          grad_out)),
                    ((float2)(k == v, 0) - sc)
                 );
      grad += ComplexMul(ComplexDiv(sc_g, koef), qkv[dimension * (heads * (3 * k + 1) + h) + d]);
     }
   qkv_g[shift_q + d] = grad;
//--- Calculating Key's gradients (contribution of K[u] to every query row)
   grad = 0;
//---
   for(int q = 0; q < units; q++)
     {
      shift_score = h * units + q * step_score;
      float2 sc_g = (float2)0;
      float2 sc = scores[shift_score + u];
      float2 grad_out = gradient[dimension * (heads * q + h) + d];
      for(int v = 0; v < units; v++)
         sc_g += ComplexMul(
                    ComplexMul(scores[shift_score + v],
                               ComplexMul(qkv[dimension * (heads * (3 * v + 2) + h)],
                                          grad_out)),
                    ((float2)(u == v, 0) - sc)
                 );
      grad += ComplexMul(ComplexDiv(sc_g, koef), qkv[dimension * (heads * 3 * q + h) + d]);
     }
   qkv_g[shift_k + d] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Layer normalization of complex-valued sequences.                 |
//| One work item per row of `dimension` complex values: computes    |
//| the complex mean, the std of |x - mean|, stores both and writes  |
//| (x - mean) / std to the output.                                  |
//| FIX: the mean loop accumulated with `=` instead of `+=`, so only |
//| the last element contributed to the mean.                        |
//+------------------------------------------------------------------+
__kernel void ComplexNormalize(__global float2* __attribute__((aligned(8))) inputs,
                               __global float2* __attribute__((aligned(8))) outputs,
                               __global float2* __attribute__((aligned(8))) means,
                               __global float *vars,
                               int dimension)
  {
   if(dimension <= 0)
      return;
//---
   size_t n = get_global_id(0);
   const int shift = n * dimension;
   const float2 dim = (float2)(dimension, 0);
//--- complex mean; NaN/Inf elements are treated as 0
   float2 mean = 0;
//---
   for(int i = 0; i < dimension; i++)
      mean += IsNaNOrInf2(inputs[shift + i], (float2)0);
   means[n] = mean = ComplexDiv(mean, dim);
//--- standard deviation of the deviation moduli
   float variance = 0;
//---
   for(int i = 0; i < dimension; i++)
     {
      float abs_delta = ComplexAbs(inputs[shift + i] - mean);
      variance += abs_delta * abs_delta;
     }
   vars[n] = variance = sqrt(IsNaNOrInf(variance / dimension, 1));
   float2 v = (float2)(variance, 0);
//--- normalized output; division artifacts are flushed to 0
   for(int i = 0; i < dimension; i++)
     {
      float2 val = IsNaNOrInf2(ComplexDiv((inputs[shift + i] - mean), v), (float2)0);
      outputs[shift + i] = val;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of ComplexNormalize: scales the output gradients   |
//| by 1/std of the corresponding row (std of 0 falls back to 1).    |
//+------------------------------------------------------------------+
__kernel void ComplexNormalizeGradient(__global float2* __attribute__((aligned(8))) inputs_gr,
                                       __global float2* __attribute__((aligned(8))) outputs_gr,
                                       __global float *vars,
                                       int dimension)
  {
   if(dimension <= 0)
      return;
//--- one work item per normalized row
   const size_t n = get_global_id(0);
   const int base = n * dimension;
//--- non-positive variance falls back to 1
   const float raw_var = vars[n];
   const float2 divisor = (float2)((raw_var > 0 ? raw_var : 1.0f), 0);
//--- scale and sanitize every gradient component
   for(int i = 0; i < dimension; i++)
     {
      const float2 g = ComplexDiv(outputs_gr[base + i], divisor);
      inputs_gr[base + i] = IsNaNOrInf2(g, (float2)0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Inverse of ComplexNormalize: restores the original scale and     |
//| offset, out = x * std + mean (std of 0 falls back to 1).         |
//+------------------------------------------------------------------+
__kernel void ComplexUnNormalize(__global float2* __attribute__((aligned(8))) inputs,
                                 __global float2* __attribute__((aligned(8))) outputs,
                                 __global float2* __attribute__((aligned(8))) means,
                                 __global float *vars,
                                 int dimension)
  {
   if(dimension <= 0)
      return;
//--- one work item per row
   const size_t n = get_global_id(0);
   const int base = n * dimension;
//--- stored statistics of the row
   const float raw_var = vars[n];
   const float2 scale = (float2)((raw_var > 0 ? raw_var : 1.0f), 0);
   const float2 offset = means[n];
//--- de-normalize and sanitize
   for(int i = 0; i < dimension; i++)
     {
      const float2 val = ComplexMul(inputs[base + i], scale) + offset;
      outputs[base + i] = IsNaNOrInf2(val, (float2)0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of ComplexUnNormalize: multiplies the output       |
//| gradients by the stored std (std of 0 falls back to 1).          |
//+------------------------------------------------------------------+
__kernel void ComplexUnNormalizeGradient(__global float2* __attribute__((aligned(8))) inputs_gr,
                                         __global float2* __attribute__((aligned(8))) outputs_gr,
                                         __global float *vars,
                                         int dimension
                                        )
  {
   if(dimension <= 0)
      return;
//--- one work item per row
   const size_t n = get_global_id(0);
   const int base = n * dimension;
//--- non-positive variance falls back to 1
   const float raw_var = vars[n];
   const float2 scale = (float2)((raw_var > 0 ? raw_var : 1.0f), 0);
//--- scale and sanitize every gradient component
   for(int i = 0; i < dimension; i++)
     {
      const float2 g = ComplexMul(outputs_gr[base + i], scale);
      inputs_gr[base + i] = IsNaNOrInf2(g, (float2)0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Computes, per spectrum row, the share of the dominant harmonic   |
//| in the total spectral energy: weight = max|f_i| / sum|f_i|.      |
//+------------------------------------------------------------------+
__kernel void MainFreqWeight(__global float2* __attribute__((aligned(8))) freq,
                             __global float *weight,
                             int dimension
                            )
  {
   if(dimension <= 0)
      return;
//--- one work item per spectrum row
   const size_t n = get_global_id(0);
   const int base = n * dimension;
//--- peak harmonic energy and total energy of the row
   float peak = 0;
   float sum_energy = 0;
   for(int i = 0; i < dimension; i++)
     {
      const float e = ComplexAbs(freq[base + i]);
      sum_energy += e;
      if(e > peak)
         peak = e;
     }
//--- an empty spectrum yields weight 0 (denominator falls back to 1)
   weight[n] = peak / (sum_energy > 0 ? sum_energy : 1);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Convex combination of two tensors with a per-row weight:         |
//| out = w * in1 + (1 - w) * in2.                                   |
//+------------------------------------------------------------------+
__kernel void WeightedSum(__global float *inputs1,
                          __global float *inputs2,
                          __global float *outputs,
                          __global float *weight,
                          int dimension
                         )
  {
   if(dimension <= 0)
      return;
//--- one work item per row
   const size_t n = get_global_id(0);
   const int base = n * dimension;
//--- blend coefficients of this row
   const float w1 = weight[n];
   const float w2 = 1 - w1;
   for(int i = 0; i < dimension; i++)
      outputs[base + i] = inputs1[base + i] * w1 + inputs2[base + i] * w2;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of WeightedSum: routes the output gradient to the  |
//| two inputs with their respective blend factors w and (1 - w).    |
//+------------------------------------------------------------------+
__kernel void WeightedSumGradient(__global float *inputs_gr1,
                                  __global float *inputs_gr2,
                                  __global float *outputs_gr,
                                  __global float *weight,
                                  int dimension
                                 )
  {
   if(dimension <= 0)
      return;
//--- one work item per row
   const size_t n = get_global_id(0);
   const int base = n * dimension;
//--- blend coefficients of this row
   const float w1 = weight[n];
   const float w2 = 1 - weight[n];
   for(int i = 0; i < dimension; i++)
     {
      const float g = outputs_gr[base + i];
      inputs_gr1[base + i] = g * w1;
      inputs_gr2[base + i] = g * w2;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| S3 feed-forward: shuffles input segments by probability rank and |
//| blends each shuffled segment with the original one.              |
//| Work item `pos` ranks its probability against all others (ties   |
//| broken by index) to pick the source segment, stores that choice  |
//| in `positions`, and writes                                       |
//|   out = w1 * inputs[source segment] + w2 * inputs[own segment].  |
//+------------------------------------------------------------------+
__kernel void FeedForwardS3(__global float *inputs,
                            __global float *probability,
                            __global float *weights,
                            __global float *outputs,
                            __global float *positions,
                            const int window,
                            const int total
                           )
  {
   int pos = get_global_id(0);
   int segments = get_global_size(0);
//--- the last (partial) segment is excluded from the shuffle
   if((segments * window) > total)
      segments--;
//--- rank of this segment's probability = index of its source segment
   int segment = 0;
   if(pos < segments)
     {
      const float prob = probability[pos];
      //--- earlier indices win ties (<=), later indices lose them (<),
      //--- so every rank is assigned exactly once
      for(int i = 0; i < pos; i++)
        {
         if(probability[i] <= prob)
            segment++;
        }
      //---
      for(int i = pos + 1; i < segments; i++)
        {
         if(probability[i] < prob)
            segment++;
        }
     }
   else
      segment = pos;   // partial tail segment maps to itself
//---
   const int shift_in = segment * window;
   const int shift_out = pos * window;
   const float w1 = weights[0];
   const float w2 = weights[1];
   positions[pos] = (float)segment;   // remembered for the backward pass
//--- blend the shuffled segment with the original one
   for(int i = 0; i < window; i++)
     {
      if((shift_in + i) >= total || (shift_out + i) >= total)
         break;
      outputs[shift_out + i] = w1 * inputs[shift_in + i] + w2 * inputs[shift_out + i];
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| S3 backward pass: propagates output gradients to the inputs and  |
//| to the segment probabilities, using the segment mapping stored   |
//| in `positions` by FeedForwardS3.                                 |
//| NOTE(review): the forward pass computed                          |
//| out[shift_out+i] = w1*in[shift_in+i] + w2*in[shift_out+i], but   |
//| here the w2 term reads outputs_gr[shift_in+i] and writes         |
//| inputs_gr[shift_in+i] — the shift_in/shift_out mixing looks      |
//| inconsistent with the forward pass; confirm against the caller.  |
//| NOTE(review): probability_gr divides by prob without a zero      |
//| guard, and different work items may write the same `segment` —   |
//| verify the dispatch guarantees a one-to-one mapping.             |
//+------------------------------------------------------------------+
__kernel void InsideGradientS3(__global float *inputs,
                               __global float *inputs_gr,
                               __global float *probability,
                               __global float *probability_gr,
                               __global float *weights,
                               __global float *outputs_gr,
                               __global float *positions,
                               const int window,
                               const int total
                              )
  {
   size_t pos = get_global_id(0);
//--- source segment selected in the forward pass
   int segment = (int)positions[pos];
   float prob = probability[pos];
   const float w1 = weights[0];
   const float w2 = weights[1];
   const int shift_in = segment * window;
   const int shift_out = pos * window;
//--- grad accumulates d(out)/d(prob) through the selected segment
   float grad = 0;
   float temp = 0;
//---
   for(int i = 0; i < window; i++)
     {
      if((shift_out + i) >= total)
         break;
      temp = outputs_gr[shift_out + i] * w1;
      grad += temp * inputs[shift_in + i];
      inputs_gr[shift_in + i] = temp + outputs_gr[shift_in + i] * w2;
     }
   probability_gr[segment] = grad / prob;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| S3 backward pass for the two blend weights.                      |
//| Work group dimension 1 selects the weight (w==0: the shuffled    |
//| branch, re-addressed through `positions`; w==1: the identity     |
//| branch). Each local thread accumulates a strided partial sum of  |
//| grad*input, then the group reduces it in local memory.           |
//+------------------------------------------------------------------+
__kernel void WeightGradientS3(__global float *inputs,
                               __global float *positions,
                               __global float *outputs_gr,
                               __global float *weights_gr,
                               const int window,
                               const int total
                              )
  {
   size_t l = get_local_id(0);
   size_t w = get_global_id(1);
   size_t ls = min((uint)get_local_size(0), (uint)LOCAL_ARRAY_SIZE);
//--- local scratch for the work-group reduction
   __local float temp[LOCAL_ARRAY_SIZE];
//--- phase 1: strided accumulation of grad * input
   if(l < ls)
     {
      float val = 0;
      //---
      for(int i = l; i < total; i += ls)
        {
         int shift_in = i;
         if(w == 0)
           {
            //--- the w1 branch reads from the shuffled source segment
            int pos = i / window;
            shift_in = positions[pos] * window + i % window;
           }
         val += outputs_gr[i] * inputs[shift_in];
        }
      temp[l] = val;
     }
   BarrierLoc
//--- phase 2: tree reduction of the partial sums
   int t = ls;
   do
     {
      t = (t + 1) / 2;
      if(l < t && (l + t) < ls)
        {
         temp[l] += temp[l + t];
         temp[l + t] = 0;
        }
      BarrierLoc
     }
   while(t > 1);
//--- thread 0 publishes the gradient of weight w
   if(l == 0)
      weights_gr[w] = temp[0];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Multi-head pyramidal attention with shared K/V heads.            |
//| Work item (q_id, k, h): scores its Q against key k (heads map to |
//| heads_kv via modulo, GQA-style), runs a work-group softmax, then |
//| reduces score-weighted values within a +/- delta_win window      |
//| around the query position.                                       |
//| FIX: the Q*K dot product accumulated with `=` instead of `+=`,   |
//| so only the last dimension component contributed to the score.   |
//+------------------------------------------------------------------+
__kernel void MH2PyrAttentionOut(__global float *q,
                                 __global float *kv,
                                 __global float *score,
                                 __global float *out,
                                 const int dimension,
                                 const int heads_kv,
                                 const int window
                                )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;                      // shared K/V head for this Q head
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_k = dimension * (2 * heads_kv * k + h_kv);
   const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k;
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   const int delta_win = (window + 1) / 2;             // attention half-window
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score: scaled dot product, then work-group softmax
   float sum = 0;
   for(int d = 0; d < dimension; d++)
      sum += q[shift_q + d] * kv[shift_k + d];
   float sc = LocalSoftMax(IsNaNOrInf(sum / koef, MIN_VALUE), 1, temp);
   score[shift_s] = sc;
   BarrierLoc
//--- out: reduce score-weighted values, one dimension component at a time
   for(int d = 0; d < dimension; d++)
     {
      uint count = 0;
      if(k < ls)
         do
           {
            if((count * ls) < (kunits - k))
              {
               sum = 0;
               //--- only keys within the local window contribute
               if(abs(count * ls + k - q_id) <= delta_win)
                 {
                  int sh_v = 2 * dimension * heads_kv * count * ls;
                  sum = kv[shift_v + d + sh_v] * (count == 0 ? sc : score[shift_s + count * ls]);
                  if(isnan(sum))
                     sum = 0;
                 }
               temp[k] = (count > 0 ? temp[k] : 0) + sum;
              }
            count++;
           }
         while((count * ls + k) < kunits);
      BarrierLoc
//--- tree reduction of the partial sums
      count = min(ls, (uint)kunits);
      do
        {
         count = (count + 1) / 2;
         if(k < ls)
            temp[k] += (k < count && (k + count) < kunits ? temp[k + count] : 0);
         if(k + count < ls)
            temp[k + count] = 0;
         BarrierLoc
        }
      while(count > 1);
//---
      if(k == 0)
         out[shift_q + d] = temp[0];
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Piecewise Linear Representation of a time series.                |
//| Work item (i, v) decides whether element i of variable v is a    |
//| trend turning point (TTP), then — for turning points — fits a    |
//| least-squares line over the preceding segment and writes         |
//| (slope, intercept, relative length) triplets to `outputs`.       |
//| `transpose` selects between [step][variable] and                 |
//| [variable][step] memory layouts.                                 |
//| NOTE(review): barrier(CLK_LOCAL_MEM_FENCE) only synchronizes a   |
//| single work-group, yet isttp[] written here is read back across  |
//| the whole series — confirm the dispatch puts one series into one |
//| work-group.                                                      |
//+------------------------------------------------------------------+
__kernel void PLR(__global const float *inputs,
                  __global float *outputs,
                  __global int *isttp,
                  const int transpose,
                  const float min_step
                 )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
   const int step_in = ((bool)transpose ? variables : 1);
//--- look for ttp: first/last elements are always turning points
   float value = inputs[shift_in];
   bool bttp = false;
   if(i == 0 || i == lenth - 1)
      bttp = true;
   else
     {
      //--- scan backwards while the move stays below min_step,
      //--- tracking the local extremes on the way
      float prev = value;
      int prev_pos = i;
      float max_v = value;
      float max_pos = i;
      float min_v = value;
      float min_pos = i;
      while(fmax(fabs(prev - max_v), fabs(prev - min_v)) < min_step && prev_pos > 0)
        {
         prev_pos--;
         prev = inputs[shift_in - (i - prev_pos) * step_in];
         if(prev >= max_v && (prev - min_v) < min_step)
           {
            max_v = prev;
            max_pos = prev_pos;
           }
         if(prev <= min_v && (max_v - prev) < min_step)
           {
            min_v = prev;
            min_pos = prev_pos;
           }
        }
      //--- symmetric forward scan
      float next = value;
      int next_pos = i;
      while(fmax(fabs(next - max_v), fabs(next - min_v)) < min_step && next_pos < (lenth - 1))
        {
         next_pos++;
         next = inputs[shift_in + (next_pos - i) * step_in];
         if(next > max_v && (next - min_v) < min_step)
           {
            max_v = next;
            max_pos = next_pos;
           }
         if(next < min_v && (max_v - next) < min_step)
           {
            min_v = next;
            min_pos = next_pos;
           }
        }
      //--- i is a TTP if it is a local extremum of the scanned range
      if(
         (value >= prev && value > next) ||
         (value > prev && value == next) ||
         (value <= prev && value < next) ||
         (value < prev && value == next)
      )
         if(max_pos == i || min_pos == i)
            bttp = true;
     }
//--- publish the TTP flag before other items count their positions
   isttp[shift_in] = (int)bttp;
   outputs[shift_in] = 0;
   BarrierLoc
//--- calc position: ordinal number of this TTP and its predecessor
   int pos = -1;
   int prev_in = 0;
   int prev_ttp = 0;
   if(bttp)
     {
      pos = 0;
      //---
      for(int p = 0; p < i; p++)
        {
         int current_in = ((bool)transpose ? (p * variables + v) : (v * lenth + p));
         if((bool)isttp[current_in])
           {
            pos++;
            prev_ttp = p;
            prev_in = current_in;
           }
        }
     }
//--- cacl tendency: least-squares line over [prev_ttp, i)
   if(pos > 0 && pos < (lenth / 3))
     {
      float sum_x = 0;
      float sum_y = 0;
      float sum_xy = 0;
      float sum_xx = 0;
      int dist = i - prev_ttp;
      //---
      for(int p = 0; p < dist; p++)
        {
         float x = (float)(p);
         float y = inputs[prev_in + p * step_in];
         sum_x += x;
         sum_y += y;
         sum_xy += x * y;
         sum_xx += x * x;
        }
      float slope = (dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1);
      float intercept = (sum_y - slope * sum_x) / dist;
      int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3));
      outputs[shift_out] = slope;
      outputs[shift_out + step_in] = intercept;
      outputs[shift_out + 2 * step_in] = ((float)dist) / lenth;
     }
   else
     {
      //--- the segment quota is full: the last TTP fits the tail to the end
      if(pos == (lenth / 3))
        {
         float sum_x = 0;
         float sum_y = 0;
         float sum_xy = 0;
         float sum_xx = 0;
         int dist = lenth - prev_ttp;
         //---
         for(int p = 0; p < dist; p++)
           {
            float x = (float)(p);
            float y = inputs[prev_in + p * step_in];
            sum_x += x;
            sum_y += y;
            sum_xy += x * y;
            sum_xx += x * x;
           }
         float slope = (dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1);
         float intercept = (sum_y - slope * sum_x) / dist;
         int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3));
         outputs[shift_out] = slope;
         outputs[shift_out + step_in] = intercept;
         outputs[shift_out + 2 * step_in] = ((float)dist) / lenth;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of PLR: converts the (slope, intercept) gradients  |
//| of the covering segment into a gradient for input element i.     |
//| The covering segment is found by walking the stored relative     |
//| lengths (third value of each PLR triplet).                       |
//+------------------------------------------------------------------+
__kernel void PLRGradient(__global float *inputs_gr,
                          __global const float *outputs,
                          __global const float *outputs_gr,
                          const int transpose
                         )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
   const int step_in = ((bool)transpose ? variables : 1);
   const int shift_out = ((bool)transpose ? v : (v * lenth));
   const int step_out = 3 * step_in;
//--- calc position: walk segment lengths until segment `pos` covers i
   int pos = -1;
   int prev_in = 0;
   int dist = 0;
   do
     {
      pos++;
      prev_in += dist;
      dist = (int)fmax(outputs[shift_out + pos * step_out + 2 * step_in] * lenth, 1);
     }
   while(!(prev_in <= i && (prev_in + dist) > i));
//--- calc constants of the least-squares fit over this segment
   float sum_x = 0;
   float sum_xx = 0;
//---
   for(int p = 0; p < dist; p++)
     {
      float x = (float)(p);
      sum_x += x;
      sum_xx += x * x;
     }
//--- get output gradient of the covering segment
   float grad_slope = outputs_gr[shift_out + pos * step_out];
   float grad_intercept = outputs_gr[shift_out + pos * step_out + step_in];
//--- chain rule through slope = (n*Sxy - Sx*Sy)/(n*Sxx - Sx^2) and
//--- intercept = (Sy - slope*Sx)/n
   grad_slope -= sum_x / dist * grad_intercept;
   grad_slope /= fmax(dist * sum_xx - sum_x * sum_x, 1);
   float grad = grad_intercept / dist;
   grad += (dist * (i - prev_in) - sum_x) * grad_slope;
   if(isnan(grad) || isinf(grad))
      grad = 0;
//--- save result
   inputs_gr[shift_in] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Adam-mini weight update for a fully connected layer: one second  |
//| moment per output neuron (computed from the mean squared input   |
//| and the neuron's gradient) instead of one per weight.            |
//| Local dimension 0 spans the inputs plus the bias term.           |
//| FIX: the update was gated by `delta > 0`, silently dropping all  |
//| negative weight updates; the gate now only skips zero deltas.    |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdamMini(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m -
                                    ///< number of neurons in previous layer and n -
                                    ///< number of neurons in current layer
                                    __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                    __global const float *matrix_i, ///<[in] Inputs tensor
                                    __global float *matrix_m,       ///<[in,out] Matrix of first momentum
                                    __global float *matrix_v,       ///<[in,out] Matrix of seconfd momentum
                                    const float l,                  ///< Learning rates
                                    const float b1,                 ///< First momentum multiplier
                                    const float b2                  ///< Second momentum multiplier
                                   )
  {
//--- inputs
   const size_t i = get_local_id(0);
   const size_t inputs = get_local_size(0) - 1;
//--- outputs
   const size_t o = get_global_id(1);
   const size_t outputs = get_global_size(1);
//--- accumulate mean(input^2) in local memory; index `inputs` is the bias (1.0)
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((uint)LOCAL_ARRAY_SIZE, (uint)inputs);
   const float inp = (i < inputs ? matrix_i[i] : 1.0f);
   int count = 0;
   do
     {
      if(count == (i / ls))
        {
         int shift = i % ls;
         temp[shift] = (count == 0 ? 0 : temp[shift]) + ((isnan(inp) || isinf(inp)) ? 0 : inp * inp) / inputs;
        }
      count++;
      BarrierLoc
     }
   while(count * ls < inputs);
//--- sum: tree reduction of the partial sums
   count = (ls + 1) / 2;
   do
     {
      if(i < count && (i + count) < ls)
        {
         temp[i] += temp[i + count];
         temp[i + count] = 0;
        }
      count = (count + 1) / 2;
      BarrierLoc
     }
   while(count > 1);
//--- calc v: thread 0 refreshes the per-neuron second moment;
//--- temp[1] is repurposed to broadcast the sanitized gradient
   if(i == 0)
     {
      temp[1] = matrix_g[o];
      if(isnan(temp[1]) || isinf(temp[1]))
         temp[1] = 0;
      if(isnan(temp[0]) || isinf(temp[0]))
         temp[0] = 1;
      float v = matrix_v[o];
      if(isnan(v) || isinf(v))
         v = 1;
      temp[0] = b2 * v + (1 - b2) * (temp[1] * temp[1]) * temp[0];
      matrix_v[o] = temp[0];
     }
   BarrierLoc
//--- per-weight update
   const int wi = o * (inputs + 1) + i;
   float weight = matrix_w[wi];
   if(isnan(weight) || isinf(weight))
      weight = 0;
//---
   float m = matrix_m[wi];
   if(isnan(m) || isinf(m))
      m = 0;
//--- calc m: first moment from the broadcast gradient and own input
   m = b1 * m + (1 - b1) * temp[1] * inp;
   if(isnan(m) || isinf(m))
      m = 0;
//--- Adam step with L1/L2 regularization
   float delta = l * (m / (sqrt(temp[0]) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(isnan(delta) || isinf(delta))
      delta = 0;
   if(fabs(delta) > 0)
      matrix_w[wi] = weight + delta;
   matrix_m[wi] = m;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Adam-mini weight update for a convolutional layer: one second    |
//| moment per head (group of window_out_h filters), computed from   |
//| the mean squared gradient across the head, plus a per-weight     |
//| first moment.                                                    |
//| Global dim 0 spans window_in + 1 (the +1 is the bias weight),    |
//| dim 1 the output filters, dim 2 the independent variables.       |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvAdamMini(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m -
                                        ///< input window and n - output window
                                        __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                        __global const float *matrix_i, ///<[in] Inputs tensor
                                        __global float *matrix_m,       ///<[in] Matrix of first momentum
                                        __global float *matrix_v,       ///<[in] Matrix of seconfd momentum
                                        const int inputs,               ///< Number of inputs
                                        const float l,                  ///< Learning rates
                                        const float b1,                 ///< First momentum multiplier
                                        const float b2,                 ///< Second momentum multiplier
                                        int step                        ///< Step size
                                       )
  {
//--- window in
   const size_t i = get_global_id(0);
   const size_t window_in = get_global_size(0) - 1;
//--- window out
   const size_t f = get_global_id(1);
   const size_t window_out = get_global_size(1);
//--- head window out
   const size_t f_h = get_local_id(1);
   const size_t window_out_h = get_local_size(1);
//--- variable
   const size_t v = get_global_id(2);
   const size_t variables = get_global_size(2);
//--- constants
   const int total = (inputs - window_in + step - 1) / step;   // number of conv positions
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * total * window_out;
   const int shift_w = (f + v * window_out) * (window_in + 1) + i;
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((uint)window_in, (uint)LOCAL_ARRAY_SIZE);
//--- calc gradient: accumulate over all conv positions;
//--- i == window_in is the bias weight (input treated as 1)
//--- NOTE(review): the bound uses `i + t * window_in` while the input is
//--- read at `i + t * step` — verify this early-exit is intended when
//--- step != window_in.
   float grad = 0;
//---
   for(int t = 0; t < total; t++)
     {
      if(i != window_in && (i + t * window_in) >= inputs)
         break;
      float gt = matrix_g[t * window_out + f + shift_var_out] *
                 (i == window_in ? 1 : matrix_i[i + t * step + shift_var_in]);
      if(!(isnan(gt) || isinf(gt)))
         grad += gt;
     }
//--- calc sum grad: accumulate mean(grad^2) over the head in local memory,
//--- one filter of the head at a time
   int count;
//---
   for(int h = 0; h < window_out_h; h++)
     {
      count = 0;
      do
        {
         if(h == f_h)
           {
            if(count == (i / ls))
              {
               int shift = i % ls;
               temp[shift] = ((count == 0 && h == 0) ? 0 : temp[shift]) + ((isnan(grad) || isinf(grad)) ? 0 : grad * grad) / (window_in * window_out_h);
              }
           }
         count++;
         BarrierLoc
        }
      while((count * ls) < window_in);
     }
//--- tree reduction of the partial sums
   count = (ls + 1) / 2;
   do
     {
      if(i < count && (i + count) < ls && f_h == 0)
        {
         temp[i] += temp[i + count];
         temp[i + count] = 0;
        }
      count = (count + 1) / 2;
      BarrierLoc
     }
   while(count > 1);
//--- calc v: one second moment per head, clamped for stability
   if(i == 0 && f_h == 0)
     {
      if(isnan(temp[0]) || isinf(temp[0]))
         temp[0] = 1;
      int head = f / window_out_h;
      float v = matrix_v[head];
      if(isnan(v) || isinf(v))
         v = 1;
      temp[0] = clamp(b2 * v + (1 - b2) * temp[0], 1.0e-6f, 1.0e6f);
      matrix_v[head] = temp[0];
     }
   BarrierLoc
//--- calc m and apply the per-weight Adam-mini step
   float mt = clamp(b1 * matrix_m[shift_w] + (1 - b1) * grad, -1.0e5f, 1.0e5f);
   if(isnan(mt) || isinf(mt))
      mt = 0;
   float weight = matrix_w[shift_w] + l * mt / sqrt(temp[0]);
   if(!(isnan(weight) || isinf(weight)))
      matrix_w[shift_w] = weight;
   matrix_m[shift_w] = mt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Splits a series into its PLR trend component and the residual.   |
//| The PLR segment covering element i is located by walking the     |
//| stored relative lengths; the trend value is rebuilt from that    |
//| segment's slope/intercept, the residual is input minus trend.    |
//+------------------------------------------------------------------+
__kernel void CutTrendAndOther(__global const float *inputs,
                               __global const float *plr,
                               __global float *trend,
                               __global float *other
                              )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- addressing constants ([step][variable] layout)
   const int shift_in = i * variables + v;
   const int step_in = variables;
   const int shift_plr = v;
   const int step_plr = 3 * step_in;
//--- locate the PLR segment covering element i
   int seg = -1;
   int seg_start = 0;
   int seg_len = 0;
   do
     {
      seg++;
      seg_start += seg_len;
      seg_len = (int)fmax(plr[shift_plr + seg * step_plr + 2 * step_in] * lenth, 1);
     }
   while(!(seg_start <= i && (seg_start + seg_len) > i));
//--- rebuild the linear trend at position i and split the signal
   const float slope = plr[shift_plr + seg * step_plr];
   const float intercept = plr[shift_plr + seg * step_plr + step_in];
   const int offset = i - seg_start;
   const float trend_i = slope * offset + intercept;
   trend[shift_in] = trend_i;
   other[shift_in] = inputs[shift_in] - trend_i;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of CutTrendAndOther: routes the trend/residual     |
//| gradients back to the inputs and to the PLR parameters.          |
//| FIX: the segment index `pos` was overwritten with the in-segment |
//| offset before being used to address plr_gr, so slope/intercept   |
//| gradients landed in the wrong PLR cells; the offset now lives in |
//| its own variable.                                                |
//| NOTE(review): several work items of one segment accumulate into  |
//| the same plr_gr cells without atomics — confirm the dispatch     |
//| makes this safe.                                                 |
//+------------------------------------------------------------------+
__kernel void CutTrendAndOtherGradient(__global float *inputs_gr,
                                       __global const float *plr,
                                       __global float *plr_gr,
                                       __global const float *trend_gr,
                                       __global const float *other_gr
                                      )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = i * variables + v;
   const int step_in = variables;
   const int shift_plr = v;
   const int step_plr = 3 * step_in;
//--- calc position: locate the PLR segment covering element i
   int pos = -1;
   int prev_in = 0;
   int dist = 0;
   do
     {
      pos++;
      prev_in += dist;
      dist = (int)fmax(plr[shift_plr + pos * step_plr + 2 * step_in] * lenth, 1);
     }
   while(!(prev_in <= i && (prev_in + dist) > i));
//--- get gradient: residual = input - trend, so the trend path
//--- receives trend_gr minus the residual gradient
   float other_i_gr = other_gr[shift_in];
   float trend_i_gr = trend_gr[shift_in] - other_i_gr;
//--- calc plr gradient: trend = slope * offset + intercept
   const int offset = i - prev_in;
   float sloat_gr = trend_i_gr * offset;
   float intercept_gr = trend_i_gr;
//--- save result, addressing plr_gr by the segment index
   plr_gr[shift_plr + pos * step_plr] += sloat_gr;
   plr_gr[shift_plr + pos * step_plr + step_in] += intercept_gr;
   inputs_gr[shift_in] = other_i_gr;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Element-wise residual: other = inputs - cut.                     |
//+------------------------------------------------------------------+
__kernel void CutOneFromAnother(__global const float *inputs,
                                __global const float *cut,
                                __global float *other
                               )
  {
   const size_t idx = get_global_id(0);
//--- subtract the removed component from the source signal
   other[idx] = inputs[idx] - cut[idx];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of CutOneFromAnother: the residual gradient flows  |
//| unchanged to inputs and negated to the subtracted component.     |
//+------------------------------------------------------------------+
__kernel void CutOneFromAnotherGradient(__global float *inputs_gr,
                                        __global float *cut_gr,
                                        __global const float *other_gr
                                       )
  {
   const size_t idx = get_global_id(0);
   const float g = other_gr[idx];
//--- d(in - cut)/d(in) = 1, d(in - cut)/d(cut) = -1
   inputs_gr[idx] = g;
   cut_gr[idx] = -g;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| UniTraj input preparation: packs every (step, variable) cell of  |
//| the history and future series into a [value*mask, velocity,      |
//| mask] triplet; the future block is appended after the history.   |
//| Velocity is the masked forward difference; any NaN/Inf zeroes    |
//| the whole triplet.                                               |
//+------------------------------------------------------------------+
__kernel void UniTrajPrepare(__global const float *history,
                             __global const float *h_mask,
                             __global const float *future,
                             __global const float *f_mask,
                             __global float *output,
                             const int h_total,
                             const int f_total
                            )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- addressing: 3 output values per input cell
   const int shift_in = i * variables + v;
   const int shift_out = 3 * shift_in;
   const int shift_f_out = 3 * (h_total * variables + v);
//--- history triplet
   if(i < h_total)
     {
      float mask = h_mask[shift_in];
      float value = history[shift_in];
      float vel = (i < (h_total - 1) && mask != 0 ? (history[shift_in + variables] - value) * mask : 0);
      if(isnan(vel) || isinf(vel))
         vel = value = mask = 0;
      output[shift_out] = value * mask;
      output[shift_out + 1] = vel;
      output[shift_out + 2] = mask;
     }
//--- future triplet, stored after the history block
   if(i < f_total)
     {
      float mask = f_mask[shift_in];
      float value = future[shift_in];
      float vel = (i < (f_total - 1) && mask != 0 ? (future[shift_in + variables] - value) * mask : 0);
      if(isnan(vel) || isinf(vel))
         vel = value = mask = 0;
      output[shift_f_out + shift_out] = value * mask;
      output[shift_f_out + shift_out + 1] = vel;
      output[shift_f_out + shift_out + 2] = mask;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Backward pass of UniTrajPrepare: propagates the gradients of the |
//| packed [value*mask, velocity, mask] triplets back to the raw     |
//| history and future series.                                       |
//| FIX: the future branch bounded the velocity term by h_total      |
//| instead of f_total, mismatching the forward pass.                |
//| NOTE(review): the cross-step term multiplies the stored velocity |
//| by the previous mask rather than the velocity *gradient* —       |
//| verify against the intended chain rule.                          |
//+------------------------------------------------------------------+
__kernel void UniTrajPrepareGrad(__global float *history_gr,
                                 __global float *future_gr,
                                 __global const float *output,
                                 __global const float *output_gr,
                                 const int h_total,
                                 const int f_total
                                )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//---
   const int shift_in = i * variables + v;
   const int shift_out = 3 * shift_in;
   const int shift_f_out = 3 * (h_total * variables + v);
//--- history
   if(i < h_total)
     {
      float mask = output[shift_out + 2];
      float grad = 0;
      if(mask > 0)
        {
         //--- d(value*mask)/d(value) = mask
         grad = output_gr[shift_out] * mask;
         //--- own step enters its velocity with factor -mask
         grad -= (i < (h_total - 1) && mask != 0 ? (output_gr[shift_out + 1]) * mask : 0);
         //--- contribution through the previous step's velocity
         grad += (i > 0 ? output[shift_out + 1 - 3 * variables] * output[shift_out + 2 - 3 * variables] : 0);
         if(isnan(grad) || isinf(grad))
            grad = 0;
         //---
        }
      history_gr[shift_in] = grad;
     }
//--- future
   if(i < f_total)
     {
      float mask = output[shift_f_out + shift_out + 2];
      float grad = 0;
      if(mask > 0)
        {
         grad = output_gr[shift_f_out + shift_out] * mask;
         grad -= (i < (f_total - 1) && mask != 0 ? (output_gr[shift_f_out + shift_out + 1]) * mask : 0);
         grad += (i > 0 ? output[shift_f_out + shift_out + 1 - 3 * variables] * output[shift_f_out + shift_out + 2 - 3 * variables] : 0);
         if(isnan(grad) || isinf(grad))
            grad = 0;
         //---
        }
      future_gr[shift_in] = grad;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Builds forward and backward observation-distance counters from   |
//| the mask channel of the prepared trajectory tensor.              |
//| concat_inp layout: [time][variable][value, velocity, mask].      |
//| Counter recurrence: last = 1 + (1 - mask) * last, i.e. it resets |
//| to 1 on an observed step and keeps growing across gaps.          |
//| Work item (0, v) fills d_forw, any other id fills d_bakw.        |
//| FIX: the backward branch's start offset missed the x3 per-cell   |
//| stride and the d_bakw init index missed the x`variables` stride. |
//+------------------------------------------------------------------+
__kernel void UniTrajBTS(__global const float *concat_inp,
                         __global float *d_forw,
                         __global float *d_bakw,
                         const int total
                        )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//---
   if(i == 0)
     {
      //--- forward walk from the first step
      const int step = (int)variables * 3;
      const int start = (int)v * 3 + 2;
      float last = 0;
      d_forw[v] = 0;
      //---
      for(int p = 1; p < total; p++)
        {
         float m = concat_inp[start + p * step];
         d_forw[p * variables + v] = last = 1 + (1 - m) * last;
        }
     }
   else
     {
      //--- backward walk from the last step; mask of cell (t, v) sits at
      //--- (t * variables + v) * 3 + 2
      const int step = -((int)variables * 3);
      const int start = ((total - 1) * (int)variables + (int)v) * 3 + 2;
      float last = 0;
      d_bakw[(total - 1) * (int)variables + (int)v] = 0;
      //---
      for(int p = 1; p < total; p++)
        {
         float m = concat_inp[start + p * step];
         d_bakw[(total - 1 - p) * variables + v] = last = 1 + (1 - m) * last;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
float2 Rotate(const float x, const float cos_theta, const float sin_theta)
{
float2 result = 0;
result.s0 = cos_theta + x * sin_theta;
result.s1 = x * cos_theta - sin_theta;
return result;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| HiVT input preparation: builds rotation-invariant pairwise       |
//| features. Work item (t, v) rotates the step of series v between  |
//| t and t+1 by its own heading angle and subtracts the equally     |
//| rotated steps of every other series.                             |
//| FIX: shift_out ignored v, so all work items of one timestep      |
//| raced on the same output range; each (t, v) now owns its own     |
//| total_v-wide row of `output`.                                    |
//+------------------------------------------------------------------+
__kernel void HiVTPrepare(__global const float *data,
                          __global float2* __attribute__((aligned(8))) output
                         )
  {
   const size_t t = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t total_v = get_global_size(1);
//--- row of timestep t in `data`; own row of (t, v) in `output`
   const int shift_data = t * total_v;
   const int shift_out = (shift_data + v) * total_v;
//--- step of series v, rotated into its own heading frame
   float value = data[shift_data + v + total_v] - data[shift_data + v];
   const float theta = atan(value);
   const float cos_theta = cos(theta);
   const float sin_theta = sin(theta);
   const float2 main = Rotate(value, cos_theta, sin_theta);
//--- relative feature against every other series
   for(int a = 0; a < total_v; a++)
     {
      float2 o = main;
      if(a != v)
         o -= Rotate(data[shift_data + a + total_v] - data[shift_data + a], cos_theta, sin_theta);
      output[shift_out + a] = o;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//| Gated element-wise blend: out = g * in1 + (1 - g) * in2, with    |
//| NaN/Inf inputs sanitized (gate defaults to 0.5, data to 0).      |
//+------------------------------------------------------------------+
__kernel void GateElementMul(__global const float *inputs1,
                             __global const float *inputs2,
                             __global const float *gate,
                             __global float *out
                            )
  {
   const int idx = get_global_id(0);
//--- sanitized operands
   const float g = IsNaNOrInf(gate[idx], 0.5f);
   const float a = IsNaNOrInf(inputs1[idx], 0.0f);
   const float b = IsNaNOrInf(inputs2[idx], 0.0f);
//--- accumulate only non-trivial terms
   float acc = 0;
   if(a != 0.0f && g != 0.0f)
      acc += g * a;
   if(b != 0.0f && (1 - g) != 0.0f)
      acc += (1 - g) * b;
//---
   out[idx] = IsNaNOrInf(acc, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void GateElementMulGrad(__global const float *inputs1,
                                 __global float *inputs1_gr,
                                 __global const float *inputs2,
                                 __global float *inputs2_gr,
                                 __global const float *gate,
                                 __global float *gate_gr,
                                 __global const float *out_gr,
                                 const int activ1,
                                 const int activ2,
                                 const int activ_gate
                                )
  {
//--- Backward pass of GateElementMul (out = g*in1 + (1-g)*in2):
//--- d(out)/d(in1) = g, d(out)/d(in2) = 1-g, d(out)/d(g) = in1 - in2.
   const int idx = get_global_id(0);
//--- sanitized forward values
   const float g = IsNaNOrInf(gate[idx], 0.5f);
   const float a = IsNaNOrInf(inputs1[idx], 0);
   const float b = IsNaNOrInf(inputs2[idx], 0);
   const float dy = IsNaNOrInf(out_gr[idx], 0);
//--- chain rule, then push each branch through its activation derivative
   inputs1_gr[idx] = Deactivation(IsNaNOrInf(dy * g, 0), a, activ1);
   inputs2_gr[idx] = Deactivation(IsNaNOrInf(dy * (1 - g), 0), b, activ2);
   gate_gr[idx] = Deactivation(IsNaNOrInf(dy * (a - b), 0), g, activ_gate);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void TransposeRCD(__global const float *matrix_in, ///<[in] Input matrix
                           __global float *matrix_out ///<[out] Output matrix
                          )
  {
//--- Transposes the first two axes of a [rows x cols x dimension] tensor,
//--- keeping the innermost 'dimension' axis intact.
   const int row = get_global_id(0);
   const int col = get_global_id(1);
   const int dep = get_global_id(2);
   const int rows = get_global_size(0);
   const int cols = get_global_size(1);
   const int depth = get_global_size(2);
//--- element (row, col, dep) moves to (col, row, dep)
   const int src = (row * cols + col) * depth + dep;
   const int dst = (col * rows + row) * depth + dep;
   matrix_out[dst] = matrix_in[src];
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void OrthoganalLoss(__global const float *data,
                             __global float *grad,
                             const int add
                            )
  {
//--- Gradient of an orthogonality penalty over a square [cols x cols] matrix.
//--- Work layout: global dim 0 = row r, local dim 1 = column c; one work-group
//--- processes one row and reduces over all its columns in local memory.
   const size_t r = get_global_id(0);
   const size_t c = get_local_id(1);
   const size_t cols = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   uint ls = min((uint)cols, (uint)LOCAL_ARRAY_SIZE);
//---
   const int shift1 = r * cols + c;
   const int shift2 = c * cols + r;     // mirrored element data[c][r]
   float value1 = IsNaNOrInf(data[shift1], 0);
   float value2 = (shift1 == shift2 ? value1 : IsNaNOrInf(data[shift2], 0));
   float v2 = IsNaNOrInf(value1 * value2, 0);   // data[r][c] * data[c][r]
//--- accumulate v2 over all columns into the local array (ls-wide stripes)
   for(int i = 0; i < cols; i += ls)
     {
      if(i <= c && (i + ls) > c)
         Temp[c - i] = (i == 0 ? 0 : Temp[c - i]) + v2;
      BarrierLoc
     }
//--- tree reduction of the local array down to Temp[0]
   uint count = min(ls, (uint)cols);
   do
     {
      count = (count + 1) / 2;
      if(c < ls)
         Temp[c] += (c < count && (c + count) < cols ? Temp[c + count] : 0);
      if(c + count < ls)
         Temp[c + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//--- sum = SUM_c data[r][c]*data[c][r]; 'diff' measures the deviation of this
//--- product from the identity matrix at position (r, c)
   const float sum = Temp[0];
   float diff = (float)(r == c) - sum;
   float loss = -(diff * diff);                    // negative squared deviation
//--- NOTE(review): the gradient 2*(sum - target) is additionally scaled by
//--- 'loss' (loss-weighted gradient) - confirm this weighting is intentional
   float g = (2 * (sum - (float)(r == c))) * loss;
   g = 2 * value2 * g;                             // chain: d(sum)/d(data[r][c]) ~ data[c][r]
   if(isinf(g) || isnan(g))
      g = 0;
   if(add == 1)
      grad[shift1] += g;    // accumulate into the existing gradient
   else
      grad[shift1] = g;     // overwrite
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void CalcDistance(__global const float *data,
                           __global float *distance,
                           const int dimension
                          )
  {
//--- Pairwise squared Euclidean distances, normalized by the per-row maximum.
//--- Work layout: global dim 0 = reference point, local dim 1 = partner point;
//--- one work-group fills one row of the [total x total] distance matrix.
   const size_t main = get_global_id(0);
   const size_t slave = get_local_id(1);
   const int total = (int)get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   const int shift_main = main * dimension;
   const int shift_slave = slave * dimension;
   const int shift_dist = main * total + slave;
//--- squared distance between the two feature vectors (0 on the diagonal)
   float dist = 0;
   if(main != slave)
     {
      //---
      for(int d = 0; d < dimension; d++)
        {
         float delta = data[shift_main + d] - data[shift_slave + d];
         dist += delta * delta;
        }
     }
//--- group-wide maximum (LocalMax is a helper defined elsewhere in this file;
//--- it leaves the reduced maximum in Temp[0])
   float max_dist = LocalMax(dist, 1, Temp);
//--- scale the row into [0, 1]; NaN/Inf collapse to 1 (max distance)
   if(max_dist > 0)
      dist /= Temp[0];
   dist = IsNaNOrInf(dist, 1);
//--- result
   distance[shift_dist] = dist;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FeedForwardLocalMax(__global const float *matrix_i,
                                  __global const float *distance,
                                  __global float *matrix_o,
                                  const float radius
                                 )
  {
//--- Neighbourhood max-pooling: for every unit, takes the per-component
//--- maximum over all units whose normalized distance is within 'radius'.
   const size_t unit = get_global_id(0);
   const size_t units = get_global_size(0);
   const size_t comp = get_global_id(1);
   const size_t comps = get_global_size(1);
//---
   const int dist_row = unit * units;
   float best = -3.402823466e+38f;   // -FLT_MAX sentinel for empty neighbourhood
   for(int n = 0; n < units; n++)
     {
      if(distance[dist_row + n] > radius)
         continue;   // outside the neighbourhood
      best = max(best, matrix_i[n * comps + comp]);
     }
   matrix_o[unit * comps + comp] = best;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void CalcInputGradientLocalMax(__global const float *matrix_i,
                                        __global float *matrix_ig,
                                        __global const float *distance,
                                        __global const float *matrix_o,
                                        __global const float *matrix_g,
                                        const float radius
                                       )
  {
//--- Backward pass of FeedForwardLocalMax: an input receives the gradient of
//--- every neighbourhood output it "won" (its value equals the pooled max).
   const size_t unit = get_global_id(0);
   const size_t units = get_global_size(0);
   const size_t comp = get_global_id(1);
   const size_t comps = get_global_size(1);
//---
   const float inp = matrix_i[unit * comps + comp];
   float grad = 0;
   for(int n = 0; n < units; n++)
     {
      if(distance[n * units + unit] > radius)
         continue;   // this unit is outside n's neighbourhood
      const int pos = n * comps + comp;
      //--- FLT_EPSILON tolerance for "equals the pooled maximum"
      if(fabs(matrix_o[pos] - inp) <= 1.192092896e-07f)
         grad += matrix_g[pos];
     }
   matrix_ig[unit * comps + comp] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHMaskAttentionOut(__global const float *q, ///<[in] Matrix of Querys
                                 __global const float *kv, ///<[in] Matrix of Keys
                                 __global float *score, ///<[out] Matrix of Scores
                                 __global const float *mask, ///<[in] Mask Matrix
                                 __global float *out, ///<[out] Matrix of attention
                                 const int dimension, ///< Dimension of Key
                                 const int heads_kv,
                                 const float mask_level
                                )
  {
//--- Masked multi-head attention forward pass with grouped KV heads.
//--- Work layout: dim 0 = query, local dim 1 = key, dim 2 = Q head.
//--- 'kv' interleaves Keys and Values: per key unit there are 2*heads_kv head
//--- blocks - the first heads_kv hold Keys, the next heads_kv hold Values.
   const int q_id = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_local_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;     // KV head shared by several Q heads
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_k = dimension * (2 * heads_kv * k + h_kv);
   const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k;
   const bool b_mask = (mask[shift_s] < mask_level);   // below level => masked out
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   float koef = sqrt((float)dimension);   // 1/sqrt(d) attention scaling
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score: dot(Q, K)/koef; masked positions are pushed to MIN_VALUE so the
//--- group-wide softmax (LocalSoftMax, defined elsewhere) assigns them ~0
   float sum = 0;
   if(b_mask)
      sum = MIN_VALUE;
   else
      for(int d = 0; d < dimension; d++)
         sum += q[shift_q + d] * kv[shift_k + d];
   float sc = LocalSoftMax(IsNaNOrInf(sum / koef, MIN_VALUE), 1, temp);
   score[shift_s] = sc;
   BarrierLoc
//--- Output: per component, group-wide sum of score-weighted Values
//--- (LocalSum leaves the reduced sum in temp[0]; work-item 0 writes it)
   for(int d = 0; d < dimension; d++)
     {
      BarrierLoc
      sum = LocalSum(IsNaNOrInf(kv[shift_v + d] * sc, 0), 1, temp);
      if(k == 0)
         out[shift_q + d] = temp[0];
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHMaskAttentionInsideGradients(__global const float *q, __global float *q_g,
                                             __global const float *kv, __global float *kv_g,
                                             __global const float *mask, __global float *mask_g,
                                             __global const float *scores, __global const float *gradient,
                                             const int kunits, const int heads_kv, const float mask_level)
  {
//--- Backward pass of MHMaskAttentionOut: distributes dL/dOut ('gradient') to
//--- the Queries, the interleaved Key/Value buffer and the mask.
//--- Work layout: dim 0 = query, dim 1 = vector component, dim 2 = Q head.
   const int q_id = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (q_id * heads + h) * kunits;
   const int shift_g = h * dimension + d;
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- Value gradients: only work-items with h < heads_kv own a KV head; each
//--- sums score-weighted output gradients over all queries and all Q heads
//--- mapped onto its KV head.
   int step_score = kunits * heads;
   if(h < heads_kv)
     {
      //---
      for(int v = q_id; v < kunits; v += qunits)
        {
         float grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + v;
            for(int g = 0; g < qunits; g++)
               grad += gradient[shift_g + dimension * (hq - h + g * heads)] *
                       scores[shift_score + g * step_score];
           }
         int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d;
         kv_g[shift_v] = grad;
        }
     }
//--- Query gradients: back-propagate through the softmax Jacobian
//--- (sc_v * (delta_kv - sc_k)) and the scaled Q*K dot product.
   float grad = 0;
   float out_g = gradient[shift_g + q_id * dimension];
   int shift_val = (heads_kv + h_kv) * dimension + d;
   int shift_key = h_kv * dimension + d;
//---
   for(int k = 0; k < kunits; k++)
     {
      float sc_g = 0;
      float sc = scores[shift_s + k];
      if(sc == 0)
         continue;   // masked / zero-probability key contributes nothing
      for(int v = 0; v < kunits; v++)
         sc_g += scores[shift_s + v] * out_g * kv[shift_val + 2 * v * heads_kv * dimension] *
                 ((float)(k == v) - sc);
      grad += sc_g * kv[shift_key + 2 * k * heads_kv * dimension];
     }
   q_g[shift_q] = grad / koef;
//--- Key gradients (again only for owners of a KV head).
//--- NOTE(review): several inconsistencies with the Value branch above:
//--- 'hq++' instead of 'hq += heads_kv'; 'shift_score' is computed but never
//--- used while the scores are read at 'shift_sc + k' without a head offset;
//--- 'q[shift_q + scr * dimension]' does not step over whole query rows
//--- (heads * dimension). Verify against the forward kernel before relying
//--- on this gradient path.
   if(h < heads_kv)
     {
      //---
      for(int k = q_id; k < kunits; k += qunits)
        {
         int shift_k = dimension * (2 * heads_kv * k + h_kv) + d;
         grad = 0;
         for(int hq = h; hq < heads; hq++)
           {
            int shift_score = hq * kunits + k;
            float val = kv[shift_k + heads_kv * dimension];
            for(int scr = 0; scr < qunits; scr++)
              {
               float sc_g = 0;
               int shift_sc = scr * kunits * heads;
               float sc = scores[shift_sc + k];
               if(sc == 0)
                  continue;
               for(int v = 0; v < kunits; v++)
                  sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] *
                          val * ((float)(k == v) - sc);
               grad += sc_g * q[shift_q + scr * dimension];
              }
           }
         kv_g[shift_k] = grad / koef;
        }
     }
//--- Mask gradient: zero for masked positions, (1 - mask) otherwise
   for(int k = q_id; k < kunits; k += qunits)
     {
      float m = mask[shift_s + k];
      if(m < mask_level)
         mask_g[shift_s + k] = 0;
      else
         mask_g[shift_s + k] = 1 - m;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void CalcPositionBias(__global const float *data1,
                               __global const float *data2,
                               __global float *result,
                               const int dimension
                              )
  {
//--- Positional bias between two embedding sets:
//--- result[idx1][idx2] = exp(-||data1[idx1] - data2[idx2]||)
   const size_t idx1 = get_global_id(0);
   const size_t idx2 = get_global_id(1);
   const size_t total1 = get_global_size(0);
   const size_t total2 = get_global_size(1);
//---
   const int shift1 = idx1 * dimension;
   const int shift2 = idx2 * dimension;
   const int shift_out = idx1 * total2 + idx2;
//--- squared Euclidean distance over all components
   float res = 0;
//---
   for(int i = 0; i < dimension; i++)
     {
      float delta = data1[shift1 + i] - data2[shift2 + i];
      //--- fix: accumulate; the original '=' kept only the last component
      res += delta * delta;
     }
   res = sqrt(res);
   res = exp(-res);   // distance -> similarity in (0, 1]
   if(isnan(res) || isinf(res))
      res = 0;
//---
   result[shift_out] = res;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHPosBiasAttentionOut(__global const float *q, ///<[in] Matrix of Querys
                                    __global const float *k, ///<[in] Matrix of Keys
                                    __global const float *v, ///<[in] Matrix of Values
                                    __global float *score, ///<[out] Matrix of Scores
                                    __global const float *pos_bias, ///<[in] Position Bias
                                    __global float *out, ///<[out] Matrix of attention
                                    const int dimension, ///< Dimension of Key
                                    const int heads_kv,
                                    const int use_pos_bias
                                   )
  {
//--- Multi-head attention with optional additive positional bias and grouped
//--- KV heads. Work layout: dim 0 = query, dim 1 = key (local), dim 2 = head.
   const int q_id = get_global_id(0);
   const int k_id = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_kv = dimension * (heads_kv * k_id + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k_id;
   const int shift_pb = q_id * kunits + k_id;
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- softmax denominator: sum of exp(Q*K/koef) (plus bias) over allowed keys
   uint count = 0;
   if(k_id < ls)
     {
      temp[k_id] = 0;
      //---
      do
        {
         if(q_id >= (count * ls + k_id))   // causal restriction to keys <= query
            if((count * ls) < (kunits - k_id))
              {
               float sum = 0;
               int sh_k = dimension * heads_kv * count * ls;
               for(int d = 0; d < dimension; d++)
                  //--- fix: accumulate the dot product; '=' kept only the last term
                  sum += q[shift_q + d] * k[shift_kv + d + sh_k];
               sum = exp(sum / koef);
               if(isnan(sum))
                  sum = 0;
               temp[k_id] = temp[k_id] + sum + (use_pos_bias > 0 ? pos_bias[shift_pb + count * ls] : 0);
              }
         count++;
        }
      while((count * ls + k_id) < kunits);
     }
   BarrierLoc
   count = min(ls, (uint)kunits);
//--- tree reduction of the partial sums to temp[0]
   do
     {
      count = (count + 1) / 2;
      if(k_id < ls)
         temp[k_id] += (k_id < count && (k_id + count) < kunits ? temp[k_id + count] : 0);
      if(k_id + count < ls)
         temp[k_id + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//--- normalized score of this (query, key) pair
//--- NOTE(review): 'count' is 1 after the reduction, so the causal check below
//--- effectively reads 'q_id >= ls + k_id'; confirm it should not simply be
//--- 'q_id >= k_id' as in the accumulation loop above.
   float sum = temp[0];
   float sc = 0;
   if(q_id >= (count * ls + k_id))
      if(sum != 0)
        {
         //--- fix: accumulate the dot product; '=' kept only the last term
         for(int d = 0; d < dimension; d++)
            sc += q[shift_q + d] * k[shift_kv + d];
         sc = (exp(sc / koef) + (use_pos_bias > 0 ? pos_bias[shift_pb] : 0)) / sum;
         if(isnan(sc))
            sc = 0;
        }
   score[shift_s] = sc;
   BarrierLoc
//--- output: per component, score-weighted sum of Values over all keys
//--- NOTE(review): the Value stride uses a factor 2 (2*dimension*heads_kv)
//--- while the Key stride above does not - verify the buffer layout.
   for(int d = 0; d < dimension; d++)
     {
      uint count = 0;
      if(k_id < ls)
         do
           {
            if((count * ls) < (kunits - k_id))
              {
               int sh_v = 2 * dimension * heads_kv * count * ls;
               float sum =
                  v[shift_kv + d + sh_v] * (count == 0 ? sc : score[shift_s + count * ls]);
               if(isnan(sum))
                  sum = 0;
               temp[k_id] = (count > 0 ? temp[k_id] : 0) + sum;
              }
            count++;
           }
         while((count * ls + k_id) < kunits);
      BarrierLoc
      //--- reduce the partial sums and emit the component
      count = min(ls, (uint)kunits);
      do
        {
         count = (count + 1) / 2;
         if(k_id < ls)
            temp[k_id] += (k_id < count && (k_id + count) < kunits ? temp[k_id + count] : 0);
         if(k_id + count < ls)
            temp[k_id + count] = 0;
         BarrierLoc
        }
      while(count > 1);
      //---
      out[shift_q + d] = temp[0];
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHPosBiasAttentionInsideGradients(__global const float *q, __global float *q_g,
                                                __global const float *k, __global float *k_g,
                                                __global const float *v, __global float *v_g,
                                                __global const float *scores, __global const float *gradient,
                                                const int kunits, const int heads_kv)
  {
//--- Backward pass of MHPosBiasAttentionOut: propagates dL/dOut to Q, K and V.
//--- Work layout: dim 0 = query, dim 1 = component, dim 2 = Q head; grouped
//--- KV heads collect gradients from all Q heads mapped onto them.
   const int q_id = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (q_id * heads + h) * kunits;
   const int shift_g = h * dimension + d;
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- Value gradients (owners of a KV head only): sum of score-weighted output
//--- gradients over all queries and all Q heads of this KV group.
   int step_score = kunits * heads;
   if(h < heads_kv)
     {
      //---
      for(int v_id = q_id; v_id < kunits; v_id += qunits)
        {
         float grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + v_id;
            for(int g = 0; g < qunits; g++)
               grad += gradient[shift_g + dimension * (hq - h + g * heads)] *
                       scores[shift_score + g * step_score];
           }
         int shift_v = dimension * (heads_kv * v_id + h) + d;
         v_g[shift_v] = grad;
        }
     }
//--- Query gradients through the softmax Jacobian (sc_v*(delta - sc_k)) and
//--- the scaled Q*K dot product
   float grad = 0;
   float out_g = gradient[shift_g + q_id * dimension];
   int shift_val = h_kv * dimension + d;
   int shift_key = h_kv * dimension + d;
//---
   for(int k_id = 0; k_id < kunits; k_id++)
     {
      float sc_g = 0;
      float sc = scores[shift_s + k_id];
      if(sc == 0)
         continue;   // zero probability -> no contribution
      for(int v_id = 0; v_id < kunits; v_id++)
         sc_g += scores[shift_s + v_id] * out_g * v[shift_val + v_id * heads_kv * dimension] *
                 ((float)(k_id == v_id) - sc);
      grad += sc_g * k[shift_key + k_id * heads_kv * dimension];
     }
   q_g[shift_q] = grad / koef;
//--- Key gradients (owners of a KV head only).
//--- NOTE(review): 'shift_score' is computed but unused; the scores below are
//--- read at 'shift_sc + k_id' without the 'hq * kunits' head offset, so every
//--- hq iteration reads head 0's scores. Verify against the forward kernel.
   if(h < heads_kv)
     {
      //---
      for(int k_id = q_id; k_id < kunits; k_id += qunits)
        {
         int shift_k = dimension * (heads_kv * k_id + h_kv) + d;
         grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + k_id;
            float val = v[shift_k];   // Value at the Key's offset (same layout)
            for(int scr = 0; scr < qunits; scr++)
              {
               float sc_g = 0;
               int shift_sc = scr * kunits * heads;
               float sc = scores[shift_sc + k_id];
               if(sc == 0)
                  continue;
               for(int v_id = 0; v_id < kunits; v_id++)
                  sc_g += scores[shift_sc + v_id] * gradient[shift_g + scr * dimension] *
                          val * ((float)(k_id == v_id) - sc);
               grad += sc_g * q[shift_g + scr * heads * dimension];
              }
           }
         k_g[shift_k] = grad / koef;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void DiversityLoss(__global const float *data,
                            __global float *grad,
                            const int dimension,
                            const int activation,
                            const int add
                           )
  {
//--- Diversity regularizer: pushes the 'main' vector away from all others via
//--- the gradient of exp(mean squared distance). Work layout: dim 0 = vector,
//--- local dim 1 = cooperating lanes splitting the "other vectors" loop.
   const size_t main = get_global_id(0);
   const size_t loc = get_local_id(1);
   const size_t total = get_local_size(0);
   const size_t total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- mean squared distance of 'main' to every vector (self contributes 0)
   float delts = 0;
//---
   for(int d = 0; d < dimension; d++)
     {
      const float value_main = IsNaNOrInf(data[main * dimension + d], 0);
      for(int i = loc; i < total; i += total_loc)
        {
         float value_slave = IsNaNOrInf(data[i * dimension + d], 0);
         float delta = value_main - value_slave;
         delts += IsNaNOrInf(delta * delta / total, 0);
        }
     }
//--- LocalSum (helper defined elsewhere) reduces lane partials over the group
   float loss = exp(LocalSum(delts, 1, Temp));
   float gr = 0;
//--- per-component gradient: 2 * loss * mean(main - slave)
//--- NOTE(review): 'gr' is not reset between components, so each dimension
//--- accumulates on top of the already-reduced value of the previous one -
//--- confirm this running accumulation is intentional.
   for(int d = 0; d < dimension; d++)
     {
      const float value_main = IsNaNOrInf(data[main * dimension + d], 0);
      for(int i = loc; i < total; i += total_loc)
        {
         float value_slave = IsNaNOrInf(data[i * dimension + d], 0);
         gr += IsNaNOrInf(2 * loss * (value_main - value_slave) / total, 0);
        }
      //--- reduce lane partials; lane 0 writes through the activation derivative
      gr = LocalSum(gr, 1, Temp);
      if(loc == 0)
        {
         if(add > 0)
            grad[main * dimension + d] += Deactivation(gr, value_main, activation);
         else
            grad[main * dimension + d] = Deactivation(gr, value_main, activation);
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHRelativeAttentionOut(__global const float *q, ///<[in] Matrix of Querys
                                     __global const float *k, ///<[in] Matrix of Keys
                                     __global const float *v, ///<[in] Matrix of Values
                                     __global const float *bk, ///<[in] Matrix of Positional Bias Keys
                                     __global const float *bv, ///<[in] Matrix of Positional Bias Values
                                     __global const float *gc, ///<[in] Global content bias vector
                                     __global const float *gp, ///<[in] Global positional bias vector
                                     __global float *score, ///<[out] Matrix of Scores
                                     __global float *out, ///<[out] Matrix of attention
                                     const int dimension ///< Dimension of Key
                                    )
  {
//--- Relative multi-head attention: the raw score mixes content-content,
//--- content-position and global-bias terms; a numerically stable softmax
//--- over keys then weights (Value + positional Value bias).
//--- Work layout: dim 0 = query, local dim 1 = key, dim 2 = head.
   const int q_id = get_global_id(0);
   const int k_id = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_local_size(1);
   const int heads = get_global_size(2);
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_kv = dimension * (heads * k_id + h);
   const int shift_gc = dimension * h;
   const int shift_s = kunits * (q_id * heads + h) + k_id;
   const int shift_pb = q_id * kunits + k_id;
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   float koef = sqrt((float)dimension);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- raw score: q*k + q*bk + k*bk + gc*k + gp*bk, scaled by 1/sqrt(d).
//--- NOTE(review): gc/gp are indexed with shift_q (per query) while shift_gc
//--- (per head) is computed but unused - confirm the intended layout.
   float sc = 0;
//---
   for(int d = 0; d < dimension; d++)
     {
      float val_q = q[shift_q + d];
      float val_k = k[shift_kv + d];
      float val_bk = bk[shift_kv + d];
      sc += val_q * val_k + val_q * val_bk + val_k * val_bk + gc[shift_q + d] * val_k + gp[shift_q + d] * val_bk;
     }
   sc = sc / koef;
//--- group-wide maximum of the raw scores (softmax stability)
//---
   for(int cur_k = 0; cur_k < kunits; cur_k += ls)
     {
      if(k_id >= cur_k && k_id < (cur_k + ls))
        {
         int shift_local = k_id % ls;
         temp[shift_local] = (cur_k == 0 ? sc : fmax(temp[shift_local], sc));
        }
      BarrierLoc
     }
   uint count = min(ls, (uint)kunits);
//--- tree reduction of the stripe maxima to temp[0]
   do
     {
      count = (count + 1) / 2;
      if(k_id < ls)
         temp[k_id] = (k_id < count && (k_id + count) < kunits ? fmax(temp[k_id + count], temp[k_id]) : temp[k_id]);
      BarrierLoc
     }
   while(count > 1);
//--- exp(score - max), clipped at -120 to avoid exp underflow
   sc = IsNaNOrInf(exp(fmax(sc - temp[0], -120)), 0);
   BarrierLoc
//--- sum of exponents over all keys (same stripe + reduction pattern)
//---
   for(int cur_k = 0; cur_k < kunits; cur_k += ls)
     {
      if(k_id >= cur_k && k_id < (cur_k + ls))
        {
         int shift_local = k_id % ls;
         temp[shift_local] = (cur_k == 0 ? 0 : temp[shift_local]) + sc;
        }
      BarrierLoc
     }
//---
   count = min(ls, (uint)kunits);
   do
     {
      count = (count + 1) / 2;
      if(k_id < ls)
         temp[k_id] += (k_id < count && (k_id + count) < kunits ? temp[k_id + count] : 0);
      if(k_id + count < ls)
         temp[k_id + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//--- normalized attention weight
   float sum = IsNaNOrInf(temp[0], 1);
   if(fabs(sum) <= 1.2e-7f)
      sum = 1;   // guard against division by ~0
   sc /= sum;
   score[shift_s] = sc;
   BarrierLoc
//--- output: per component, group-wide weighted sum of (v + bv)
//---
   for(int d = 0; d < dimension; d++)
     {
      float val_v = v[shift_kv + d];
      float val_bv = bv[shift_kv + d];
      float val = IsNaNOrInf(sc * (val_v + val_bv), 0);
      //--- accumulate stripe partials of the weighted Values
      for(int cur_v = 0; cur_v < kunits; cur_v += ls)
        {
         if(k_id >= cur_v && k_id < (cur_v + ls))
           {
            int shift_local = k_id % ls;
            temp[shift_local] = (cur_v == 0 ? 0 : temp[shift_local]) + val;
           }
         BarrierLoc
        }
      //--- reduce and let work-item 0 emit the component
      count = min(ls, (uint)kunits);
      do
        {
         count = (count + 1) / 2;
         if(k_id < count && (k_id + count) < kunits)
            temp[k_id] += temp[k_id + count];
         if(k_id + count < ls)
            temp[k_id + count] = 0;
         BarrierLoc
        }
      while(count > 1);
      //---
      if(k_id == 0)
         out[shift_q + d] = IsNaNOrInf(temp[0], 0);
      BarrierLoc
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void MHRelativeAttentionInsideGradients(__global const float *q, __global float *q_g,
                                                 __global const float *k, __global float *k_g,
                                                 __global const float *v, __global float *v_g,
                                                 __global const float *bk, __global float *bk_g,
                                                 __global const float *bv, __global float *bv_g,
                                                 __global const float *gc, __global float *gc_g,
                                                 __global const float *gp, __global float *gp_g,
                                                 __global const float *scores,
                                                 __global const float *gradient,
                                                 const int kunits
                                                )
  {
//--- Backward pass of MHRelativeAttentionOut: distributes dL/dOut to Queries,
//--- Keys, Values, both positional-bias matrices and both global-bias vectors.
//--- Work layout: dim 0 = query, dim 1 = component, dim 2 = head.
   const int q_id = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (q_id * heads + h) * kunits;
   const int shift_g = h * dimension + d;
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- Value / Value-bias gradients: the forward pass used (v + bv), so both
//--- receive the same score-weighted sum of output gradients.
   int step_score = kunits * heads;
//---
   for(int v_id = q_id; v_id < kunits; v_id += qunits)
     {
      float grad = 0;
      int shift_score = h * kunits + v_id;
      for(int g = 0; g < qunits; g++)
         grad += gradient[shift_g + dimension * (g * heads)] *
                 scores[shift_score + g * step_score];
      int shift_v = dimension * (heads * v_id + h) + d;
      grad = IsNaNOrInf(grad, 0);
      v_g[shift_v] = grad;
      bv_g[shift_v] = grad;
     }
//--- Query and global-bias gradients through the softmax Jacobian; the raw
//--- score is linear in q, in gc (content path via k) and in gp (positional
//--- path via bk), so the two partial sums split naturally.
   float grad_gc = 0;
   float grad_gp = 0;
   float out_g = gradient[shift_g + q_id * dimension];
   int shift_val = h * dimension + d;
   int shift_key = h * dimension + d;
//---
   for(int k_id = 0; k_id < kunits; k_id++)
     {
      float sc_g = 0;
      float sc = scores[shift_s + k_id];
      if(sc == 0)
         continue;   // zero probability -> no contribution
      for(int v_id = 0; v_id < kunits; v_id++)
         sc_g += scores[shift_s + v_id] * out_g *
                 (v[shift_val + v_id * heads * dimension] + bv[shift_val + v_id * heads * dimension]) *
                 ((float)(k_id == v_id) - sc);
      grad_gc += IsNaNOrInf(sc_g * k[shift_key + k_id * heads * dimension], 0);
      grad_gp += IsNaNOrInf(sc_g * bk[shift_key + k_id * heads * dimension], 0);
     }
//--- q appears in both the content and positional terms of the score
   q_g[shift_q] = (grad_gc + grad_gp) / koef;
   gc_g[shift_q] = grad_gc / koef;
   gp_g[shift_q] = grad_gp / koef;
//--- Key / Key-bias gradients.
//--- NOTE(review): the scores below are read at 'shift_sc + k_id' without the
//--- 'h * kunits' head offset (unlike shift_s above), so all heads read head
//--- 0's scores; 'shift_score' is computed but never used. Verify against the
//--- forward kernel before relying on this path.
   for(int k_id = q_id; k_id < kunits; k_id += qunits)
     {
      int shift_k = dimension * (heads * k_id + h) + d;
      float grad = 0;
      float grad_bk = 0;
      int shift_score = h * kunits + k_id;
      float val = (v[shift_k] + bv[shift_k]);
      for(int scr = 0; scr < qunits; scr++)
        {
         float sc_g = 0;
         int shift_sc = scr * kunits * heads;
         float sc = scores[shift_sc + k_id];
         if(sc == 0)
            continue;
         for(int v_id = 0; v_id < kunits; v_id++)
            sc_g += scores[shift_sc + v_id] * gradient[shift_g + scr * dimension] *
                    val * ((float)(k_id == v_id) - sc);
         float _q = q[shift_g + scr * heads * dimension];
         //--- the score is symmetric in (q + bk + gc) vs (q + k + gp) terms
         grad += sc_g * (_q + bk[shift_k] + gc[shift_g + scr * heads * dimension]);
         grad_bk += sc_g * (_q + k[shift_k] + gp[shift_g + scr * heads * dimension]);
        }
      k_g[shift_k] = grad / koef;
      bk_g[shift_k] = grad_bk / koef;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void CalcAlignmentGradient(__global const float *matrix_o1,
                                    __global const float *matrix_o2,
                                    __global float *matrix_g1,
                                    __global float *matrix_g2,
                                    const int activation,
                                    const int add)
  {
//--- Alignment gradient: drives the two output streams toward each other.
//--- Each stream receives the (deactivated) difference pointing at the other.
   const int idx = get_global_id(0);
   const float o1 = IsNaNOrInf(matrix_o1[idx], 0);
   const float o2 = IsNaNOrInf(matrix_o2[idx], 0);
   const float delta = o2 - o1;
   const float g1 = Deactivation(delta, o1, activation);
   const float g2 = Deactivation(-delta, o2, activation);
//--- either accumulate into or overwrite the gradient buffers
   if(add > 0)
     {
      matrix_g1[idx] += g1;
      matrix_g2[idx] += g2;
     }
   else
     {
      matrix_g1[idx] = g1;
      matrix_g2[idx] = g2;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FeatureSmoothing(__global const float *feature,
                               __global float *outputs,
                               const int smoothing
                              )
  {
//--- Multi-scale smoothing: for every element writes the raw value followed by
//--- 'smoothing' progressively wider window averages.
//--- @param feature   [total x dimension] input features
//--- @param outputs   smoothed output, 'smoothing'-slot stride per position
//--- @param smoothing number of additional averaging radii (s = 1..smoothing)
   const size_t pos = get_global_id(0);
   const size_t d = get_global_id(1);
   const size_t total = get_global_size(0);
   const size_t dimension = get_global_size(1);
//---
   const int shift_input = pos * dimension + d;
   const int shift_output = dimension * pos * smoothing + d;
//--- NOTE(review): slots s = 0..smoothing give smoothing+1 writes per position
//--- while the stride between positions is only dimension*smoothing, so the
//--- last slot of 'pos' overlaps the first slot of 'pos+1' - verify layout.
//---
   float value = IsNaNOrInf(feature[shift_input], 0);
   outputs[shift_output] = value;
//--- widen the window one step on each side per iteration
   for(int s = 1; s <= smoothing; s++)
     {
      //--- fix: 'pos' is size_t, so the original '(pos - s) >= 0' was always
      //--- true due to unsigned wrap-around and read out of bounds for pos < s
      if(((int)pos - s) >= 0)
         value += IsNaNOrInf(feature[shift_input - s * dimension], 0);
      if((pos + s) < total)
         value += IsNaNOrInf(feature[shift_input + s * dimension], 0);
      //--- 1 / (number of samples in the clipped window); signed math avoids
      //--- the implementation-defined cast of a wrapped size_t
      float factor = IsNaNOrInf(1.0f / (min((int)total, (int)(pos + s)) - max((int)pos - s, 0) + 1), 0);
      float out = IsNaNOrInf(value * factor, 0);
      outputs[shift_output + s * dimension] = out;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void FeatureSmoothingGradient(__global float *feature_gr,
                                       __global const float *outputs_gr,
                                       const int smoothing
                                      )
  {
//--- Backward pass of FeatureSmoothing: gathers each input element's share of
//--- the gradients of the smoothed slots it contributed to (raw slot plus the
//--- s-wide averaged slots of the positions s steps away).
   const size_t pos = get_global_id(0);
   const size_t d = get_global_id(1);
   const size_t total = get_global_size(0);
   const size_t dimension = get_global_size(1);
//---
   const int shift_input = pos * dimension + d;
   const int shift_output = dimension * pos * smoothing + d;
   const int step_output = dimension * smoothing;
//--- gradient of the raw (s = 0) slot
   float grad = IsNaNOrInf(outputs_gr[shift_output], 0);
//---
   for(int s = 1; s <= smoothing; s++)
     {
      int shift = shift_output + s * dimension;
      //--- NOTE(review): the factor is built from this position's window, not
      //--- the neighbour's whose averaged slot is being read - verify it
      //--- mirrors the forward normalization. Signed math avoids the
      //--- implementation-defined cast of a wrapped size_t.
      float factor = 1.0f / (min((int)total, (int)(pos + s)) - max((int)pos - s, 0) + 1);
      //--- fix: 'pos' is size_t, so the original '(pos - s) >= 0' was always
      //--- true due to unsigned wrap-around and read out of bounds for pos < s
      if(((int)pos - s) >= 0)
         grad += IsNaNOrInf(outputs_gr[shift - s * step_output] * factor, 0);
      //---
      if((pos + s) < total)
         grad += IsNaNOrInf(outputs_gr[shift + s * step_output] * factor, 0);
     }
//--- (removed the unused local 'value' the original computed per iteration)
   feature_gr[shift_input] = IsNaNOrInf(grad, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void BatchFeedForwardAddNoise(__global const float *inputs, __global float *options,
                                       __global const float *noise, __global float *output,
                                       const int batch, const int optimization,
                                       const int activation, const float alpha)
  {
//--- Batch-normalization step that blends the normalized value with external
//--- noise (weighted by alpha) before applying gamma/beta and the activation.
   if(batch <= 1)
      return;   // running statistics are undefined for batch <= 1
   const int n = get_global_id(0);
//--- options layout: 7 floats per neuron for SGD, 9 for Adam
   const int shift = n * (optimization == 0 ? 7 : 9);
//--- update the running mean / variance (batch > 1 is guaranteed here)
   const float inp = inputs[n];
   const float mean = (IsNaNOrInf(options[shift], 0) * ((float)batch - 1.0f) + inp) / ((float)batch);
   const float dev = inp - mean;
   float variance = (IsNaNOrInf(options[shift + 1], 0) * ((float)batch - 1.0f) + dev * dev) / (float)batch;
   const float norm_x = (variance > 0 ? dev / sqrt(variance) : 0);
//--- blend the normalized value with |noise|, keeping the signal's sign
   const float noisy = sqrt(alpha) * norm_x + sqrt(1 - alpha) * fabs(noise[n]) * sign(norm_x);
//--- trainable scale (lazily initialized to 1) and shift
   float gamma = IsNaNOrInf(options[shift + 3], 0);
   if(gamma == 0)
     {
      options[shift + 3] = 1;
      gamma = 1;
     }
   const float betta = IsNaNOrInf(options[shift + 4], 0);
//--- persist the statistics and emit the activated output
   options[shift] = mean;
   options[shift + 1] = variance;
   options[shift + 2] = norm_x;
   output[n] = fActivation(gamma * noisy + betta, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void HyperProjection(__global const float *inputs,
                              __global float *outputs
                             )
  {
//--- Projects each 'dimension'-sized vector into a (dimension+1)-sized one:
//--- the original components are copied to slots 1..dimension, and slot 0
//--- receives sqrt(||x||^2 - t^2) with t = pos/total (position-derived term).
//--- One work-group per vector; local dim 1 cooperates on the norm reduction.
   const size_t pos = get_global_id(0);
   const size_t d = get_local_id(1);
   const size_t total = get_global_size(0);
   const size_t dimension = get_local_size(1);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((int)dimension, (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_in = pos * dimension + d;
   const int shift_out = pos * (dimension + 1) + d + 1;
//---
   float v = IsNaNOrInf(inputs[shift_in], 0);
//--- accumulate squared components in local memory (ls-wide stripes)
   float v2 = IsNaNOrInf(v * v, 0);
//---
   if(d < ls)
      temp[d] = v2;
   BarrierLoc
//---
   for(int i = ls; i < (int)dimension; i += ls)
     {
      if(d >= i && d < (i + ls))
         temp[d % ls] += v2;
      BarrierLoc
     }
//--- tree reduction of the local array to temp[0] = ||x||^2
   int count = min(ls, (int)dimension);
//---
   do
     {
      count = (count + 1) / 2;
      if(d < count)
         temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
      if(d + count < dimension)
         temp[d + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//--- copy the vector; work-item 0 writes the extra leading coordinate
   outputs[shift_out] = v;
   if(d == 0)
     {
      v = IsNaNOrInf(((float)pos) / ((float)total), 0);
      //--- clamp below at ~FLT_EPSILON so sqrt stays real and non-zero
      outputs[shift_out - 1] = sqrt(fmax(temp[0] - v * v, 1.2e-07f));
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void HyperProjectionGrad(__global const float *inputs,
                                  __global float *inputs_gr,
                                  __global const float *outputs_gr
                                 )
  {
//--- Backward pass of HyperProjection: each input component receives the
//--- gradient of its copied slot plus its share of the gradient of the extra
//--- leading (norm-derived) coordinate, scaled by the component value.
   const size_t pos = get_global_id(0);
   const size_t d = get_global_id(1);
   const size_t total = get_global_size(0);
   const size_t dimension = get_global_size(1);
//---
   const int shift_in = pos * dimension + d;
   const int shift_start_out = pos * (dimension + 1);
   const int shift_out = shift_start_out + d + 1;
//---
   const float inp = IsNaNOrInf(inputs[shift_in], 0);
   const float g_copy = IsNaNOrInf(outputs_gr[shift_out], 0);
   const float g_norm = IsNaNOrInf(inp * outputs_gr[shift_start_out], 0);
//---
   inputs_gr[shift_in] = g_norm + g_copy;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LogMap(__global const float *features,
                     __global const float *centroids,
                     __global const float *curvatures,
                     __global float *outputs,
                     __global float *product,
                     __global float *distance,
                     __global float *norma
                    )
  {
//--- Logarithmic map of hyperbolic embeddings onto centroid tangent spaces.
//--- For every (feature f, centroid cent): computes an inner product whose
//--- d == 0 term is negated (Lorentzian sign convention - confirm model),
//--- projects the feature into the tangent space at the centroid and scales
//--- it to the hyperbolic distance. Intermediates (product, distance, tangent
//--- norm) are cached for the backward kernel.
//--- Work layout: dim 0 = feature, dim 1 = centroid, local dim 2 = component.
   //--- identify
   const size_t f = get_global_id(0);
   const size_t cent = get_global_id(1);
   const size_t d = get_local_id(2);
   const size_t total_f = get_global_size(0);
   const size_t total_cent = get_global_size(1);
   const size_t dimension = get_local_size(2);
   //--- create local array (scratch for the three reductions below)
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((int)dimension, (int)LOCAL_ARRAY_SIZE);
   //--- calc shifts
   const int shift_f = f * dimension + d;
   const int shift_out = (f * total_cent + cent) * dimension + d;
   const int shift_cent = cent * dimension + d;
   const int shift_temporal = f * total_cent + cent;
   //--- load inputs (curvature defaults to a small positive value)
   float feature = IsNaNOrInf(features[shift_f], 0);
   float centroid = IsNaNOrInf(centroids[shift_cent], 0);
   float curv = IsNaNOrInf(curvatures[cent], 1.2e-7f);
   //--- dot(features, centroids): the d == 0 component enters negatively
   float fc = IsNaNOrInf(feature * centroid, 0);
   //---
   if(d < ls)
      temp[d] = (d > 0 ? fc : -fc);
   BarrierLoc
   //--- fold components beyond the local-array width into the stripes
   for(int i = ls; i < (int)dimension; i += ls)
     {
      if(d >= i && d < (i + ls))
         temp[d % ls] += fc;
      BarrierLoc
     }
   //--- tree reduction to temp[0]
   int count = min(ls, (int)dimension);
   //---
   do
     {
      count = (count + 1) / 2;
      if(d < count)
         temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
      if(d + count < dimension)
         temp[d + count] = 0;
      BarrierLoc
     }
   while(count > 1);
   float prod = IsNaNOrInf(temp[0], 0);
   product[shift_temporal] = prod;
   //--- project into the tangent space: u = x + <x,c> * c * curvature
   float u = IsNaNOrInf(feature + prod * centroid * curv, 0);
   //--- norm(u): squared norm with the same negated d == 0 term
   float u2 = IsNaNOrInf(u * u, 0);
   //---
   if(d < ls)
      temp[d] = (d > 0 ? u2 : -u2);
   BarrierLoc
   //---
   for(int i = ls; i < (int)dimension; i += ls)
     {
      if(d >= i && d < (i + ls))
         temp[d % ls] += u2;
      BarrierLoc
     }
   //---
   count = min(ls, (int)dimension);
   //---
   do
     {
      count = (count + 1) / 2;
      if(d < count)
         temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
      if(d + count < dimension)
         temp[d + count] = 0;
      BarrierLoc
     }
   while(count > 1);
   float normu = IsNaNOrInf(temp[0], 0);
   if(normu <= 0)
      normu = 1.0e-7f;   // guard: this norm can be non-positive numerically
   normu = sqrt(normu);
   norma[shift_temporal] = normu;
   //--- distance features to centroid: acosh-based, clamped for stability
   float theta = IsNaNOrInf(-prod * curv, 0);
   theta = fmax(theta, 1.0f + 1.2e-07f);   // keep the acosh argument in domain
   float acosh_theta = acosh(theta);
   float dist = IsNaNOrInf(sqrt(clamp((acosh_theta * acosh_theta) / curv, 0.0f, 50.0f)), 0);
   distance[shift_temporal] = dist;
   //--- scale the tangent vector to the hyperbolic distance
   float proj_u = IsNaNOrInf(dist * u / normu, 0);
   //--- reduction of proj_u * centroid (note: the d == 0 term is dropped here,
   //--- unlike the negated sign used in the two reductions above)
   if(d < ls)
      temp[d] = (d > 0 ? proj_u * centroid : 0);
   BarrierLoc
   //---
   for(int i = ls; i < (int)dimension; i += ls)
     {
      if(d >= i && d < (i + ls))
         temp[d % ls] += proj_u * centroid;
      BarrierLoc
     }
   //---
   count = min(ls, (int)dimension);
   //---
   do
     {
      count = (count + 1) / 2;
      if(d < count)
         temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
      if(d + count < dimension)
         temp[d + count] = 0;
      BarrierLoc
     }
   while(count > 1);
   //--- NOTE(review): for d == 0 the first assignment (temp[0] / centroid) is
   //--- immediately overwritten by fmax(u, eps), discarding the reduction
   //--- result - confirm whether fmax(proj_u, eps) was intended.
   if(d == 0)
     {
      proj_u = IsNaNOrInf(temp[0] / centroid, 0);
      proj_u = fmax(u, 1.2e-7f);
     }
   //---
   outputs[shift_out] = proj_u;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass for LogMap. Reconstructs the forward intermediates
//--- (u from the cached product/norm) and back-propagates the output
//--- gradient through the projection, normalization, distance and
//--- Lorentzian dot-product stages, accumulating gradients for features,
//--- centroids and the per-centroid curvature.
//--- NOTE(review): the trailing `+=` stores into features_gr / centroids_gr
//--- are not atomic; work-groups that share a feature (different cent) or a
//--- centroid (different f) target the same elements — confirm the launcher
//--- serializes or pre-zeroes these buffers appropriately.
__kernel void LogMapGrad(__global const float *features, __global float *features_gr,
__global const float *centroids, __global float *centroids_gr,
__global const float *curvatures, __global float *curvatures_gr,
__global const float *outputs, __global const float *outputs_gr,
__global const float *product,
__global const float *distance,
__global const float *norma
)
{
//--- identify
const size_t f = get_global_id(0); // feature (sample) index
const size_t cent = get_global_id(1); // centroid index
const size_t d = get_local_id(2); // vector component
const size_t total_f = get_global_size(0);
const size_t total_cent = get_global_size(1);
const size_t dimension = get_local_size(2);
//--- local scratch for the reductions
__local float temp[LOCAL_ARRAY_SIZE];
const int ls = min((int)dimension, (int)LOCAL_ARRAY_SIZE);
//--- calc shifts
const int shift_f = f * dimension + d;
const int shift_out = (f * total_cent + cent) * dimension + d;
const int shift_cent = cent * dimension + d;
const int shift_temporal = f * total_cent + cent;
//--- load inputs (sanitized), plus the cached forward intermediates
float feature = features[shift_f];
if(isinf(feature) || isnan(feature))
feature = 0;
float centroid = centroids[shift_cent];
if(isinf(centroid) || isnan(centroid))
centroid = 0;
float centroid0 = (d > 0 ? centroids[shift_cent - d] : centroid); // component 0 of the centroid
if(isinf(centroid0) || isnan(centroid0) || centroid0 == 0)
centroid0 = 1.2e-7f;
float curv = curvatures[cent];
if(isinf(curv) || isnan(curv))
curv = 1.2e-7f;
float prod = product[shift_temporal];
float dist = distance[shift_temporal];
float normu = norma[shift_temporal];
float u = feature + prod * centroid * curv; // recomputed tangent vector
if(isinf(u) || isnan(u))
u = 0;
//--- incoming gradient (and the component-0 gradient of the same pair)
float grad = outputs_gr[shift_out];
if(isinf(grad) || isnan(grad))
grad = 0;
float grad0 = (d > 0 ? outputs_gr[shift_out - d] : grad);
if(isinf(grad0) || isnan(grad0))
grad0 = 0;
//--- accumulators
float feature_gr = 0;
float centroid_gr = 0;
float curv_gr = 0;
float prod_gr = 0;
float normu_gr = 0;
float dist_gr = 0;
//--- gradient through the final d == 0 rescale of LogMap
float proj_u_gr = (d > 0 ? grad + grad0 / centroid0 * centroid : 0);
if(d == 0)
centroid_gr += outputs[shift_out] / centroid * grad;
else
centroid_gr += grad0 / centroid0 * outputs[shift_out];
if(isnan(centroid_gr) || isinf(centroid_gr))
centroid_gr = 0;
//--- gradient through proj_u = dist * u / normu
dist_gr = u / normu * proj_u_gr;
float u_gr = dist / normu * proj_u_gr;
normu_gr = dist * u / (normu * normu) * proj_u_gr;
//--- reduce dist_gr over the components
if(d < ls)
temp[d] = dist_gr;
BarrierLoc
//---
for(int id = ls; id < (int)dimension; id += ls)
{
if(d >= id && d < (id + ls))
temp[d % ls] += dist_gr;
BarrierLoc
}
//---
int count = min(ls, (int)dimension);
//---
do
{
count = (count + 1) / 2;
if(d < count)
temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
if(d + count < dimension)
temp[d + count] = 0;
BarrierLoc
}
while(count > 1);
if(isinf(temp[0]) || isnan(temp[0]))
temp[0] = 0;
dist_gr = temp[0];
//--- gradient through dist = acosh(-prod*curv)/sqrt(curv), done once (d == 0)
if(d == 0)
{
float theta = -prod * curv;
float theta_gr = 1.0f / sqrt(curv * (theta * theta - 1)) * dist_gr;
if(isinf(theta_gr) || isnan(theta_gr))
theta_gr = 0;
float acosh_theta = acosh(theta);
curv_gr += -(acosh_theta * acosh_theta) / (2 * sqrt(curv * curv * curv)) * dist_gr;
if(isinf(curv_gr) || isnan(curv_gr))
curv_gr = 0;
temp[0] = -curv * theta_gr; // broadcast d(theta)/d(prod) contribution via local memory
if(isinf(temp[0]) || isnan(temp[0]))
temp[0] = 0;
curv_gr += -prod * theta_gr;
if(isinf(curv_gr) || isnan(curv_gr))
curv_gr = 0;
}
BarrierLoc
//---
prod_gr += temp[0];
BarrierLoc
//--- reduce normu_gr over the components
if(d < ls)
temp[d] = normu_gr;
BarrierLoc
//---
for(int id = ls; id < (int)dimension; id += ls)
{
if(d >= id && d < (id + ls))
temp[d % ls] += normu_gr;
BarrierLoc
}
//---
count = min(ls, (int)dimension);
//---
do
{
count = (count + 1) / 2;
if(d < count)
temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
if(d + count < dimension)
temp[d + count] = 0;
BarrierLoc
}
while(count > 1);
normu_gr = temp[0];
if(isinf(normu_gr) || isnan(normu_gr))
normu_gr = 1.2e-7f;
u_gr += u / normu * normu_gr;
if(isnan(u_gr) || isinf(u_gr))
u_gr = 0;
//--- gradient through u = feature + prod * centroid * curv
feature_gr += u_gr;
centroid_gr += prod * curv * u_gr;
BarrierLoc
//--- dot (u_gr * centroid): shared term for prod_gr and curv_gr
if(d < ls)
temp[d] = u_gr * centroid;
BarrierLoc
//---
for(int id = ls; id < (int)dimension; id += ls)
{
if(d >= id && d < (id + ls))
temp[d % ls] += u_gr * centroid;
BarrierLoc
}
//---
count = min(ls, (int)dimension);
//---
do
{
count = (count + 1) / 2;
if(d < count)
temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
if(d + count < dimension)
temp[d + count] = 0;
BarrierLoc
}
while(count > 1);
if(d == 0)
{
if(isinf(temp[0]) || isnan(temp[0]))
temp[0] = 0;
prod_gr += temp[0] * curv;
if(isinf(prod_gr) || isnan(prod_gr))
prod_gr = 0;
curv_gr += temp[0] * prod;
if(isinf(curv_gr) || isnan(curv_gr))
curv_gr = 0;
temp[0] = prod_gr; // broadcast the full prod gradient to the group
}
BarrierLoc
//--- gradient through the Lorentzian dot product (component 0 negated)
prod_gr = temp[0];
feature_gr += prod_gr * centroid * (d > 0 ? 1 : -1);
centroid_gr += prod_gr * feature * (d > 0 ? 1 : -1);
//--- result
features_gr[shift_f] += feature_gr;
centroids_gr[shift_cent] += centroid_gr;
//--- bug fix: the curvature buffer used to accumulate the curvature value
//--- itself (`curv`); it must accumulate the computed gradient `curv_gr`.
if(f == 0 && d == 0)
curvatures_gr[cent] += curv_gr;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Per-weight perturbation for a fully connected layer (appears to implement
//--- a SAM/ASAM-style epsilon step — confirm against the host-side optimizer).
//--- One work-group handles one output neuron; the local id runs over the
//--- inputs plus one bias slot (inp == inputs).
__kernel void CalcEpsilonWeights(__global const float *matrix_w, // weights, (inputs+1) per output neuron
__global const float *matrix_g, // gradient per output neuron
__global const float *matrix_i, // layer inputs
__global float *matrix_epsw, // result: perturbed weights
const float rho // perturbation radius
)
{
const size_t inp = get_local_id(0); // input index; inp == inputs is the bias slot
const size_t inputs = get_local_size(0) - 1; // number of real inputs
const size_t out = get_global_id(1); // output neuron index
//--- local scratch for the row-wise gradient-norm reduction
__local float temp[LOCAL_ARRAY_SIZE];
const int ls = min((int)inputs, (int)LOCAL_ARRAY_SIZE);
//--- per-weight gradient: |w| * dL/dout * input (bias uses input == 1)
const int shift_w = out * (inputs + 1) + inp;
const float w = IsNaNOrInf(matrix_w[shift_w], 0);
float grad = fabs(w) * IsNaNOrInf(matrix_g[out], 0) * (inputs == inp ? 1.0f : IsNaNOrInf(matrix_i[inp], 0));
//--- fold grad^2 of every weight of the row into ls local slots
const int local_shift = inp % ls;
//---
for(int i = 0; i <= inputs; i += ls)
{
if(i <= inp && inp < (i + ls))
temp[local_shift] = (i == 0 ? 0 : temp[local_shift]) + IsNaNOrInf(grad * grad, 0);
BarrierLoc
}
//--- tree reduction of the partial sums
//--- NOTE(review): the bounds below compare against `inputs` although temp[]
//--- holds at most `ls` live entries; when ls is odd a stale slot may be
//--- folded in — confirm `ls` is the intended bound.
int count = ls;
do
{
count = (count + 1) / 2;
if(inp < count)
temp[inp] += ((inp + count) < inputs ? IsNaNOrInf(temp[inp + count], 0) : 0);
if(inp + count < inputs)
temp[inp + count] = 0;
BarrierLoc
}
while(count > 1);
//--- epsilon = rho * w^2 * grad / ||row gradient||, guarded against /0
float norm = sqrt(IsNaNOrInf(temp[0], 0));
float epsw = IsNaNOrInf(w * w * grad * rho / (norm + 1.2e-7f), w);
//---
matrix_epsw[shift_w] = epsw;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Convolutional counterpart of CalcEpsilonWeights: the per-weight gradient
//--- is first accumulated over all convolution positions, then normalized by
//--- the row-wise gradient norm and scaled into an epsilon perturbation.
//--- Local id 0 runs over the kernel window plus one bias slot.
__kernel void CalcEpsilonWeightsConv(__global const float *matrix_w, // weights, (window_in+1) per filter row
__global const float *matrix_g, // output gradients
__global const float *matrix_i, // layer inputs
__global float *matrix_epsw, // result: perturbed weights
const int inputs, // input elements per variable
const float rho, // perturbation radius
const int step // convolution stride
)
{
//---
const size_t inp = get_local_id(0); // position in the window; inp == window_in is the bias
const size_t window_in = get_local_size(0) - 1; // kernel window size
const size_t out = get_global_id(1); // output filter
const size_t window_out = get_global_size(1);
const size_t v = get_global_id(2); // independent variable (series)
const size_t variables = get_global_size(2);
//--- local scratch for the gradient-norm reduction
__local float temp[LOCAL_ARRAY_SIZE];
const int ls = min((int)(window_in + 1), (int)LOCAL_ARRAY_SIZE);
//--- offsets into weights / gradients / inputs for this (v, out, inp)
const int shift_w = (out + v * window_out) * (window_in + 1) + inp;
const int total = (inputs - window_in + step - 1) / step; // number of conv positions
const int shift_out = v * total * window_out + out;
const int shift_in = v * inputs + inp;
const float w = IsNaNOrInf(matrix_w[shift_w], 0);
//--- accumulate dL/dw over every convolution position
float grad = 0;
//---
for(int t = 0; t < total; t++)
{
if(inp != window_in && (inp + t * step) >= inputs)
break; // window ran past the input; bias (inp == window_in) never breaks
float g = IsNaNOrInf(matrix_g[t * window_out + shift_out], 0);
float i = IsNaNOrInf(inp == window_in ? 1.0f : matrix_i[t * step + shift_in], 0);
grad += IsNaNOrInf(g * i, 0);
}
grad *= fabs(w);
//--- fold grad^2 into ls local slots
//--- NOTE(review): the loop bound is `inputs` though the local range is only
//--- window_in + 1; the extra iterations are no-ops but uniform across the
//--- group, so the barrier stays valid — confirm this is intentional.
const int local_shift = inp % ls;
//---
for(int i = 0; i <= inputs; i += ls)
{
if(i <= inp && inp < (i + ls))
temp[local_shift] = (i == 0 ? 0 : temp[local_shift]) + IsNaNOrInf(grad * grad, 0);
BarrierLoc
}
//--- tree reduction of the partial sums
int count = ls;
do
{
count = (count + 1) / 2;
if(inp < count && (inp + count) < inputs)
{
temp[inp] += IsNaNOrInf(temp[inp + count], 0);
temp[inp + count] = 0;
}
BarrierLoc
}
while(count > 1);
//--- epsilon = rho * w^2 * grad / ||row gradient||, guarded against /0
float norm = sqrt(IsNaNOrInf(temp[0], 0));
float epsw = IsNaNOrInf(w * w * grad * rho / (norm + 1.2e-7f), w);
//---
matrix_epsw[shift_w] = epsw;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Piecewise-linear representation (PLR) of each input series, computed
//--- independently per agent with an agent-specific minimum step threshold.
//--- Phase 1: every element decides whether it is a turning point (TTP) by
//--- scanning left and right until the series moves by at least min_step[a].
//--- Phase 2: TTP elements fit a least-squares segment back to the previous
//--- TTP and emit (slope, intercept, relative length) triples.
//--- NOTE(review): the BarrierLoc between the phases only synchronizes a
//--- work-group, but phase 2 reads isttp[] entries written by other items of
//--- the same series — confirm the launcher maps one series to one work-group.
__kernel void PLRMultiAgents(__global const float *inputs,
__global float *outputs,
__global int *isttp, // scratch: 1 where a turning point was found
const int transpose, // non-zero: data stored variable-major
__global const float *min_step // per-agent sensitivity threshold
)
{
const size_t i = get_global_id(0); // position in the series
const size_t lenth = get_global_size(0); // series length
const size_t v = get_global_id(1); // variable index
const size_t variables = get_global_size(1);
const size_t a = get_global_id(2); // agent index
const size_t agents = get_global_size(2);
//--- constants
const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
const int step_in = ((bool)transpose ? variables : 1);
const int shift_ag = a * lenth * variables; // offset of this agent's plane
//--- look for ttp: series ends are always turning points
float value = IsNaNOrInf(inputs[shift_in], 0);
bool bttp = false;
if(i == 0 || i == lenth - 1)
bttp = true;
else
{
//--- scan left until the move exceeds the agent threshold,
//--- tracking the running extremes and their positions
float prev = value;
int prev_pos = i;
float max_v = value;
float max_pos = i;
float min_v = value;
float min_pos = i;
while(fmax(fabs(prev - max_v), fabs(prev - min_v)) < min_step[a] && prev_pos > 0)
{
prev_pos--;
prev = IsNaNOrInf(inputs[shift_in - (i - prev_pos) * step_in], 0);
if(prev >= max_v && (prev - min_v) < min_step[a])
{
max_v = prev;
max_pos = prev_pos;
}
if(prev <= min_v && (max_v - prev) < min_step[a])
{
min_v = prev;
min_pos = prev_pos;
}
}
//--- scan right symmetrically
float next = value;
int next_pos = i;
while(fmax(fabs(next - max_v), fabs(next - min_v)) < min_step[a] && next_pos < (lenth - 1))
{
next_pos++;
next = IsNaNOrInf(inputs[shift_in + (next_pos - i) * step_in], 0);
if(next > max_v && (next - min_v) < min_step[a])
{
max_v = next;
max_pos = next_pos;
}
if(next < min_v && (max_v - next) < min_step[a])
{
min_v = next;
min_pos = next_pos;
}
}
//--- i is a TTP when it is a local extreme of the scanned span
if(
(value >= prev && value > next) ||
(value > prev && value == next) ||
(value <= prev && value < next) ||
(value < prev && value == next)
)
if(max_pos == i || min_pos == i)
bttp = true;
}
//--- publish the TTP flag and clear the output slot
isttp[shift_in + shift_ag] = (int)bttp;
outputs[shift_in + shift_ag] = 0;
BarrierLoc
//--- calc position: ordinal number of this TTP and location of the previous one
int pos = -1;
int prev_in = 0;
int prev_ttp = 0;
if(bttp)
{
pos = 0;
//---
for(int p = 0; p < i; p++)
{
int current_in = ((bool)transpose ? (p * variables + v) : (v * lenth + p));
if((bool)isttp[current_in + shift_ag])
{
pos++;
prev_ttp = p;
prev_in = current_in;
}
}
}
//--- cacl tendency: least-squares line over the segment ending at this TTP;
//--- at most lenth/3 segments fit in the output (3 values per segment)
if(pos > 0 && pos < (lenth / 3))
{
float sum_x = 0;
float sum_y = 0;
float sum_xy = 0;
float sum_xx = 0;
int dist = i - prev_ttp;
//---
for(int p = 0; p < dist; p++)
{
float x = (float)(p);
float y = IsNaNOrInf(inputs[prev_in + p * step_in], 0);
sum_x += x;
sum_y += y;
sum_xy += x * y;
sum_xx += x * x;
}
float slope = IsNaNOrInf((dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1), 0);
float intercept = IsNaNOrInf((sum_y - slope * sum_x) / dist, 0);
int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3)) + shift_ag;
outputs[shift_out] = slope;
outputs[shift_out + step_in] = intercept;
outputs[shift_out + 2 * step_in] = ((float)dist) / lenth; // segment length as a share of the series
}
else
{
//--- the last admissible segment runs to the end of the series
if(pos == (lenth / 3))
{
float sum_x = 0;
float sum_y = 0;
float sum_xy = 0;
float sum_xx = 0;
int dist = lenth - prev_ttp;
//---
for(int p = 0; p < dist; p++)
{
float x = (float)(p);
float y = IsNaNOrInf(inputs[prev_in + p * step_in], 0);
sum_x += x;
sum_y += y;
sum_xy += x * y;
sum_xx += x * x;
}
float slope = IsNaNOrInf((dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1), 0);
float intercept = IsNaNOrInf((sum_y - slope * sum_x) / dist, 0);
int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3)) + shift_ag;
outputs[shift_out] = slope;
outputs[shift_out + step_in] = intercept;
outputs[shift_out + 2 * step_in] = IsNaNOrInf((float)dist / lenth, 0);
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass for PLRMultiAgents: distributes the gradient of every
//--- (slope, intercept) pair back onto the raw series elements of its segment
//--- and averages the contributions over all agents.
__kernel void PLRMultiAgentsGradient(__global float *inputs_gr,
__global const float *outputs,
__global const float *outputs_gr,
const int transpose, // non-zero: data stored variable-major
const int agents
)
{
const size_t i = get_global_id(0); // position in the series
const size_t lenth = get_global_size(0);
const size_t v = get_global_id(1); // variable index
const size_t variables = get_global_size(1);
//--- constants
const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
const int step_in = ((bool)transpose ? variables : 1);
const int shift_out = ((bool)transpose ? v : (v * lenth));
const int step_out = 3 * step_in; // 3 output values per segment
const int shift_ag = lenth * variables; // size of one agent's plane
//--- Sum gradient
float grad = 0;
//---
for(int a = 0; a < agents; a++)
{
//--- walk the segment table until the segment containing i is found;
//--- segment length is decoded from the stored relative-length value.
//--- NOTE(review): if the stored lengths do not tile the series this
//--- do/while never terminates — confirm forward output guarantees coverage.
int pos = -1;
int prev_in = 0;
int dist = 0;
do
{
pos++;
prev_in += dist;
dist = (int)fmax(outputs[shift_out + pos * step_out + 2 * step_in + a * shift_ag] * lenth, 1);
}
while(!(prev_in <= i && (prev_in + dist) > i));
//--- calc constants of the least-squares fit over the segment
float sum_x = 0;
float sum_xx = 0;
for(int p = 0; p < dist; p++)
{
float x = (float)(p);
sum_x += x;
sum_xx += x * x;
}
//--- get output gradient of this segment's slope and intercept
float grad_slope = IsNaNOrInf(outputs_gr[shift_out + pos * step_out + a * shift_ag], 0);
float grad_intercept = IsNaNOrInf(outputs_gr[shift_out + pos * step_out + step_in + a * shift_ag], 0);
//--- chain rule through the closed-form regression coefficients
grad_slope -= IsNaNOrInf(sum_x / dist * grad_intercept, 0);
grad_slope /= fmax(IsNaNOrInf(dist * sum_xx - sum_x * sum_x, 0), 1);
grad += IsNaNOrInf(grad_intercept / dist, 0);
grad += IsNaNOrInf((dist * (i - prev_in) - sum_x) * grad_slope, 0);
}
//--- average over agents and clip for stability
grad = clamp(IsNaNOrInf(grad / agents, 0), -MAX_GRAD, MAX_GRAD);
//--- save result
inputs_gr[shift_in] = grad;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Forward pass of a multi-head convolution layer. Each head owns a slice
//--- of the input window (window_in_h) and of the output channels
//--- (window_out_h); the dot products are vectorized 4 components at a time,
//--- with the bias folded into the tail iteration.
__kernel void FeedForwardMHConv(__global float *matrix_w, // weights: (window_in_h+1) per output row
__global float *matrix_i, // inputs
__global float *matrix_o, // outputs
const int inputs, // input elements per variable
const int step, // convolution stride
const int window_in, // full input window
const int window_out, // full number of output channels
const int activation // activation function id
)
{
const size_t i = get_global_id(0); // convolution position
const size_t h = get_global_id(1); // head index
const size_t v = get_global_id(2); // variable index
const size_t total = get_global_size(0);
const size_t heads = get_global_size(1);
//--- per-head slice sizes (ceil division)
const int window_in_h = (window_in + heads - 1) / heads;
const int window_out_h = (window_out + heads - 1) / heads;
const int shift_out = window_out * i + window_out_h * h;
const int shift_in = step * i + window_in_h * h;
//--- per-variable base offsets
const int shift_var_in = v * inputs;
const int shift_var_out = v * window_out * total;
const int shift_var_w = v * window_out * (window_in_h + 1);
const int shift_w_h = h * window_out_h * (window_in_h + 1); // head's first weight row
//---
float sum = 0;
float4 inp, weight;
//--- clip the window at the input boundary and at the head's slice end
int stop = (window_in_h <= (inputs - shift_in) ? window_in_h : (inputs - shift_in));
stop = min(stop, (int)(window_in - h * window_in_h));
//--- one output channel of the head per iteration
for(int out = 0; (out < window_out_h && (window_out_h * h + out) < window_out); out++)
{
int shift = (window_in_h + 1) * out + shift_w_h;
for(int k = 0; k <= stop; k += 4)
{
//--- tail handling: pad with the bias (input == 1, weight == row bias)
switch(stop - k)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[shift_var_w + shift + window_in_h], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[shift_var_in + shift_in + k], 1, 0, 0);
weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + window_in_h], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[shift_var_in + shift_in + k],
matrix_i[shift_var_in + shift_in + k + 1], 1, 0);
weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + k + 1],
matrix_w[shift_var_w + shift + window_in_h], 0);
break;
case 3:
//--- bug fix: the bias weight was read at `shift + shift_w_h`
//--- (head offset applied twice); like cases 0-2 it lives at
//--- `shift + window_in_h`, the last slot of the row.
inp = (float4)(matrix_i[shift_var_in + shift_in + k], matrix_i[shift_var_in + shift_in + k + 1],
matrix_i[shift_var_in + shift_in + k + 2], 1);
weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + k + 1],
matrix_w[shift_var_w + shift + k + 2], matrix_w[shift_var_w + shift + window_in_h]);
break;
default:
inp = (float4)(matrix_i[shift_var_in + shift_in + k], matrix_i[shift_var_in + shift_in + k + 1],
matrix_i[shift_var_in + shift_in + k + 2], matrix_i[shift_var_in + shift_in + k + 3]);
weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + k + 1],
matrix_w[shift_var_w + shift + k + 2], matrix_w[shift_var_w + shift + k + 3]);
break;
}
sum += IsNaNOrInf(dot(inp, weight), 0);
}
sum = IsNaNOrInf(sum, 0);
//---
matrix_o[shift_var_out + out + shift_out] = fActivation(sum, activation);
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_gr
/// Kernel of the Convolution neuron to transfer gradient
//+------------------------------------------------------------------+
//--- Propagates the output gradient of the multi-head convolution back to the
//--- layer inputs: for each input element, sums gradient * weight over the
//--- convolution positions and head output channels that touched it, then
//--- applies the derivative of the activation.
__kernel void CalcHiddenGradientMHConv(__global float *matrix_w, // weights
__global float *matrix_g, // output gradients
__global float *matrix_o, // layer outputs (for activation derivative)
__global float *matrix_ig, // result: input gradients
const int outputs, // output elements per variable
const int step, // convolution stride
const int window_in, // full input window
const int window_out, // full number of output channels
const int activation, // activation function id
const int shift_out, // extra offset into the gradient buffer
const int heads
)
{
const size_t i = get_global_id(0); // input element
const size_t inputs = get_global_size(0);
const size_t v = get_global_id(1); // variable index
//--- per-variable offsets and per-head slice sizes
const int shift_var_in = v * inputs;
const int shift_var_out = v * outputs;
const int shift_var_w = v * window_out * (window_in + 1);
const int window_in_h = (window_in + heads - 1) / heads;
const int window_out_h = (window_out + heads - 1) / heads;
//--- range of convolution positions whose window covers input i
float sum = 0;
float out = matrix_o[shift_var_in + i];
const int w_start = i % step;
const int start = max((int)((i - window_in + step) / step), 0);
int stop = (w_start + step - 1) / step;
stop = min((int)((i + step - 1) / step + 1), stop) + start;
if(stop > (outputs / window_out))
stop = outputs / window_out;
//--- accumulate gradient * weight over covering positions and head channels
//--- NOTE(review): the `shift_w` expression (`(i % step) / window_in_h` and the
//--- double `(window_in_h + 1)` head/channel terms) does not visibly invert the
//--- forward indexing of FeedForwardMHConv — verify against the forward kernel.
for(int k = start; k < stop; k++)
{
int head = (k % window_out) / window_out_h;
for(int h = 0; h < window_out_h; h ++)
{
if((head * window_out_h + h) >= window_out)
break;
int shift_g = k * window_out + head * window_out_h + h;
int shift_w = (stop - k - 1) * step + (i % step) / window_in_h +
head * (window_in_h + 1) + h * (window_in_h + 1);
if(shift_g >= outputs || shift_w >= (window_in_h + 1) * window_out)
break;
sum += IsNaNOrInf(matrix_g[shift_out + shift_g + shift_var_out] * matrix_w[shift_w + shift_var_w], 0);
}
}
//--- chain through the activation derivative at this input's forward value
matrix_ig[shift_var_in + i] = Deactivation(sum, out, activation);
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating
/// Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Convolution
/// Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsMHConvAdam(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m -
///< input window and n - output window
__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_i, ///<[in] Inputs tensor
__global float *matrix_m, ///<[in] Matrix of first momentum
__global float *matrix_v, ///<[in] Matrix of second momentum
const int inputs, ///< Number of inputs
const float l, ///< Learning rates
const float b1, ///< First momentum multiplier
const float b2, ///< Second momentum multiplier
const int window_in, ///< Size of input window
const int window_out, ///< Size of output window
const int step, ///< Step size
const int heads
)
{
const size_t i = get_global_id(0); // flat weight index
//--- weight layout (mirrors FeedForwardMHConv): per variable there are
//--- window_out rows of (window_in_h + 1) weights; the last column of a row
//--- is the bias and row r belongs to head r / window_out_h.
//--- Bug fixes vs the previous revision: the column was taken modulo
//--- window_out instead of the row size, the row index dropped the
//--- `* window_out` term, the boundary test advanced by window_in instead of
//--- step, and the head offset into the inputs was missing.
const int window_in_h = (window_in + heads - 1) / heads;
const int window_out_h = (window_out + heads - 1) / heads;
const int row_size = window_in_h + 1;
const int v = i / (row_size * window_out); // variable index
const int shift = i % row_size; // column in the row (== window_in_h for bias)
const int shift_out = (i / row_size) % window_out; // output channel within the variable
const int head = shift_out / window_out_h; // head owning this row
const int total = (inputs - window_in + step - 1) / step; // convolution positions
//---
const int shift_var_in = v * inputs;
const int shift_var_out = v * total * window_out;
//--- accumulate the weight gradient over all convolution positions
float grad = 0;
for(int t = 0; t < total; t++)
{
int inp = shift + head * window_in_h + t * step; // input element hit by this weight
if(shift != window_in_h && inp >= inputs)
break; // window ran past the input; the bias column never breaks
grad += IsNaNOrInf(matrix_g[t * window_out + shift_out + shift_var_out] *
(shift == window_in_h ? 1 : matrix_i[inp + shift_var_in]), 0);
}
//--- Adam update with clamped moments to keep the step finite
float mt = IsNaNOrInf(clamp(b1 * matrix_m[i] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0);
float vt = IsNaNOrInf(clamp(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f);
float weight = matrix_w[i] + l * mt / sqrt(vt);
matrix_w[i] = weight;
matrix_m[i] = mt;
matrix_v[i] = vt;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Element-wise three-way sign: +1 / -1 for values outside a ±1.2e-7 dead
//--- zone, 0 inside it. NaN/Inf inputs are treated as 0.
__kernel void MoreLessEqual(__global const float *input,
__global float *output)
{
const size_t i = get_global_id(0);
const float value = IsNaNOrInf(input[i], 0);
output[i] = (value > 1.2e-7f ? 1.0f : (value < -1.2e-7f ? -1.0f : 0.0f));
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Relative multi-head attention with per-head receptive windows: head h
//--- attends to roughly kunits/(h+1) keys (larger h => narrower window).
//--- Scores combine content, positional-bias and global-bias terms
//--- (Transformer-XL style decomposition — confirm against the host model),
//--- followed by an in-group numerically-stable softmax and the weighted
//--- aggregation of values plus positional value biases.
__kernel void MultiScaleRelativeAttentionOut(__global const float *q, ///<[in] Matrix of Querys
__global const float *k, ///<[in] Matrix of Keys
__global const float *v, ///<[in] Matrix of Values
__global const float *bk, ///<[in] Matrix of Positional Bias Keys
__global const float *bv, ///<[in] Matrix of Positional Bias Values
__global const float *gc, ///<[in] Global content bias vector
__global const float *gp, ///<[in] Global positional bias vector
__global float *score, ///<[out] Matrix of Scores
__global float *out, ///<[out] Matrix of attention
const int dimension ///< Dimension of Key
)
{
//--- init: one work-group handles all keys of a single (query, head) pair
const uint q_id = get_global_id(0);
const uint k_id = get_local_id(1);
const uint h = get_global_id(2);
const uint qunits = get_global_size(0);
const uint kunits = get_local_size(1);
const uint heads = get_global_size(2);
const int shift_q = dimension * (q_id * heads + h);
const int shift_kv = dimension * (heads * k_id + h);
const int shift_gc = dimension * h;
const int shift_s = kunits * (q_id * heads + h) + k_id;
const int shift_pb = q_id * kunits + k_id;
const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
//--- head-dependent attention window; keys beyond it contribute score 0
const uint window = max((uint)((kunits + h) / (h + 1)), min((uint)3, kunits));
float koef = sqrt((float)dimension); // 1/sqrt(d) attention scaling
//---
__local float temp[LOCAL_ARRAY_SIZE];
//--- raw score: qk + q*bk + k*bk + gc*k + gp*bk, scaled by 1/sqrt(d)
float sc = 0;
if(k_id < window)
{
for(int d = 0; d < dimension; d++)
{
float val_q = q[shift_q + d];
float val_k = k[shift_kv + d];
float val_bk = bk[shift_kv + d];
sc += val_q * val_k + val_q * val_bk + val_k * val_bk + gc[shift_q + d] * val_k + gp[shift_q + d] * val_bk;
}
sc = sc / koef;
}
//--- max value over the window (softmax stabilization)
for(int cur_k = 0; cur_k < kunits; cur_k += ls)
{
if(k_id < window)
if(k_id >= cur_k && k_id < (cur_k + ls))
{
int shift_local = k_id % ls;
temp[shift_local] = (cur_k == 0 ? sc : fmax(temp[shift_local], sc));
}
BarrierLoc
}
uint count = min(ls, kunits);
//--- tree reduction of the running maxima
//--- NOTE(review): only items with k_id < (window+1)/2 participate although
//--- temp[] may hold up to ls live slots — confirm stale slots cannot win.
do
{
count = (count + 1) / 2;
if(k_id < (window + 1) / 2)
if(k_id < ls)
temp[k_id] = (k_id < count && (k_id + count) < kunits ? fmax(temp[k_id + count], temp[k_id]) : temp[k_id]);
BarrierLoc
}
while(count > 1);
if(k_id < window)
sc = IsNaNOrInf(exp(fmax(sc - temp[0], -120)), 0); // exp clipped to avoid underflow traps
BarrierLoc
//--- sum of exp over the window
for(int cur_k = 0; cur_k < kunits; cur_k += ls)
{
if(k_id >= cur_k && k_id < (cur_k + ls))
{
int shift_local = k_id % ls;
temp[shift_local] = (cur_k == 0 ? 0 : temp[shift_local]) + sc;
}
BarrierLoc
}
//---
count = min(ls, (uint)kunits);
do
{
count = (count + 1) / 2;
if(k_id < count && k_id < (window + 1) / 2)
temp[k_id] += ((k_id + count) < kunits ? temp[k_id + count] : 0);
if(k_id + count < ls)
temp[k_id + count] = 0;
BarrierLoc
}
while(count > 1);
//--- normalized score
float sum = IsNaNOrInf(temp[0], 1);
if(sum <= 1.2e-7f)
sum = 1; // degenerate softmax: leave scores as-is instead of dividing by ~0
sc /= sum;
score[shift_s] = sc;
BarrierLoc
//--- out: weighted sum of (value + positional value bias), one dim at a time
int shift_local = k_id % ls;
for(int d = 0; d < dimension; d++)
{
float val_v = v[shift_kv + d];
float val_bv = bv[shift_kv + d];
float val = IsNaNOrInf(sc * (val_v + val_bv), 0);
//--- sum of value contributions over all keys
for(int cur_v = 0; cur_v < kunits; cur_v += ls)
{
if(k_id >= cur_v && k_id < (cur_v + ls))
temp[shift_local] = (cur_v == 0 ? 0 : temp[shift_local]) + val;
BarrierLoc
}
//---
count = min(ls, (uint)kunits);
do
{
count = (count + 1) / 2;
if(k_id < count && (k_id + count) < kunits)
temp[k_id] += temp[k_id + count];
if(k_id + count < ls)
temp[k_id + count] = 0;
BarrierLoc
}
while(count > 1);
//--- a single item publishes the reduced component
if(k_id == 0)
out[shift_q + d] = IsNaNOrInf(temp[0], 0);
BarrierLoc
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Forward pass of a 2-D state-space model with separate time and variable
//--- branches. Phase 1 builds the hidden state h = ah-terms + B * px per
//--- branch; phase 2 contracts it with C * delta over all positions to form
//--- the output. The work-group spans all positions n for a fixed feature d,
//--- so the BarrierLoc makes the full hidden state visible before phase 2.
__kernel void SSM2D_FeedForward(__global const float *ah, // A*h terms, two blocks of 2 per position
__global const float *b_time, // B vector, time branch
__global const float *b_var, // B vector, variable branch
__global const float *px_time, // projected input, time branch
__global const float *px_var, // projected input, variable branch
__global const float *c_time, // C matrix, time branch
__global const float *c_var, // C matrix, variable branch
__global const float *delta_time, // delta gates, time branch
__global const float *delta_var, // delta gates, variable branch
__global float *hidden, // out: hidden state, 2 branch planes
__global float *y // out: model output
)
{
const size_t n = get_local_id(0); // position (sequence step)
const size_t d = get_global_id(1); // feature dimension
const size_t n_total = get_local_size(0);
const size_t d_total = get_global_size(1);
//--- Hidden state: h == 0 time branch, h == 1 variable branch
for(int h = 0; h < 2; h++)
{
float new_h = ah[(2 * n + h) * d_total + d] + ah[(2 * n_total + 2 * n + h) * d_total + d];
if(h == 0)
new_h += b_time[n] * px_time[n * d_total + d];
else
new_h += b_var[n] * px_var[n * d_total + d];
hidden[(h * n_total + n)*d_total + d] = IsNaNOrInf(new_h, 0);
}
BarrierLoc
//--- Output: y[n,d] = sum_i C*delta*hidden over both branches
uint shift_c = n;
uint shift_h1 = d; // time-branch hidden plane
uint shift_h2 = shift_h1 + n_total * d_total; // variable-branch hidden plane
float value = 0;
//---
for(int i = 0; i < n_total; i++)
{
value += IsNaNOrInf(c_time[shift_c] * delta_time[shift_c] * hidden[shift_h1], 0);
value += IsNaNOrInf(c_var[shift_c] * delta_var[shift_c] * hidden[shift_h2], 0);
shift_c += n_total;
shift_h1 += d_total;
shift_h2 += d_total;
}
//---
y[n * d_total + d] = IsNaNOrInf(value, 0);
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of SSM2D_FeedForward: accumulates gradients for the C,
//--- delta, B inputs and the ah / px projections from the output gradient.
//--- NOTE(review): several `+=` stores below are data races — grad_c_time,
//--- grad_c_var, grad_delta_time, grad_delta_var are indexed by shift_c, which
//--- does not depend on the local id d, so all d_total work-items of a group
//--- read-modify-write the same element concurrently; grad_b_time/grad_b_var
//--- likewise accumulate across d without atomics or a local reduction.
//--- Confirm whether the host relies on these sums; a correct version needs
//--- work-group reductions (as done in LogMap) or atomic adds.
__kernel void SSM2D_CalcHiddenGradient(__global const float *ah,
__global float *grad_ah, // Gradient with respect to ah
__global const float *b_time,
__global float *grad_b_time, // Gradient with respect to b_time
__global const float *b_var,
__global float *grad_b_var, // Gradient with respect to b_var
__global const float *px_time,
__global float *grad_px_time, // Gradient with respect to px_time
__global const float *px_var,
__global float *grad_px_var, // Gradient with respect to px_var
__global const float *c_time,
__global float *grad_c_time, // Gradient with respect to c_time
__global const float *c_var,
__global float *grad_c_var, // Gradient with respect to c_var
__global const float *delta_time,
__global float *grad_delta_time, // Gradient with respect to delta_time
__global const float *delta_var,
__global float *grad_delta_var, // Gradient with respect to delta_var
__global const float *hidden,
__global const float *grad_y // Gradient of loss with respect to y
)
{
//---
const size_t n = get_global_id(0); // position (sequence step)
const size_t d = get_local_id(1); // feature dimension
const size_t n_total = get_global_size(0);
const size_t d_total = get_local_size(1);
//--- Initialize indices for data access
uint shift_c = n;
uint shift_h1 = d; // time-branch hidden plane
uint shift_h2 = shift_h1 + n_total * d_total; // variable-branch hidden plane
float grad_hidden1 = 0;
float grad_hidden2 = 0;
//--- Backpropagation: compute hidden gradients from y
for(int i = 0; i < n_total; i++)
{
float grad = grad_y[i * d_total + d];
float c_t = c_time[shift_c];
float c_v = c_var[shift_c];
float delta_t = delta_time[shift_c];
float delta_v = delta_var[shift_c];
float h1 = hidden[shift_h1];
float h2 = hidden[shift_h2];
//-- Accumulate gradients for hidden states
grad_hidden1 += IsNaNOrInf(grad * c_t * delta_t, 0);
grad_hidden2 += IsNaNOrInf(grad * c_v * delta_v, 0);
//--- Compute gradients for c_time, c_var, delta_time, delta_var
//--- (racy += — see header note)
grad_c_time[shift_c] += grad * delta_t * h1;
grad_c_var[shift_c] += grad * delta_v * h2;
grad_delta_time[shift_c] += grad * c_t * h1;
grad_delta_var[shift_c] += grad * c_v * h2;
//--- Update indices for the next element
shift_c += n_total;
shift_h1 += d_total;
shift_h2 += d_total;
}
//--- Backpropagate through hidden -> ah, b_time, px_time
for(int h = 0; h < 2; h++)
{
float grad_h = (h == 0) ? grad_hidden1 : grad_hidden2;
//--- Store gradients in ah (considering its influence on two elements)
grad_ah[(2 * n + h) * d_total + d] = grad_h;
grad_ah[(2 * (n_total + n) + h) * d_total + d] = grad_h;
}
//--- Backpropagate through px_time and px_var (influenced by b_time and b_var)
grad_px_time[n * d_total + d] = grad_hidden1 * b_time[n];
grad_px_var[n * d_total + d] = grad_hidden2 * b_var[n];
if(d == 0)
{
grad_b_time[n] = 0;
grad_b_var[n] = 0;
}
BarrierLoc
//--- Sum gradients over all d for b_time and b_var (racy += — see header note)
grad_b_time[n] += grad_hidden1 * px_time[n * d_total + d];
grad_b_var[n] += grad_hidden2 * px_var[n * d_total + d];
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Parallel scan over the sequence axis: combines X with decay A and gate H
//--- in log2(L) sweeps over local memory. One work-group holds the whole
//--- sequence for a single feature dimension (L must be <= 1024 and,
//--- given the log2 step count, presumably a power of two — confirm).
//--- NOTE(review): `base = idx * 2` pairs the same fixed elements at every
//--- step; a classic Blelloch up-sweep would stride by 2^(step+1). Confirm
//--- the intended recurrence before relying on this as a prefix scan.
__kernel void PScan(__global const float* A, // decay coefficients
__global const float* X, // input sequence
__global const float* H, // gate values
__global float* X_out) // scanned result
{
const size_t idx = get_local_id(0); // position within the sequence
const size_t dim = get_global_id(1); // feature dimension
const size_t L = get_local_size(0); // sequence length
const size_t D = get_global_size(1);
const int num_steps = (int)log2((float)L);
//--- local copies of the three streams
__local float local_A[1024];
__local float local_X[1024];
__local float local_H[1024];
//--- Load data to local memory
int offset = dim + idx * D;
local_A[idx] = A[offset];
local_X[idx] = X[offset];
local_H[idx] = H[offset];
BarrierLoc
//--- Scan: each step combines element pairs
//--- X[b+1] = (X[b+1] + A[b+1]*X[b]) * H[b+1]; A[b+1] *= A[b]
for(int step = 0; step < num_steps; step++)
{
int halfT = L >> (step + 1); // active pairs this step
if(idx < halfT)
{
int base = idx * 2;
local_X[base + 1] += local_A[base + 1] * local_X[base];
local_X[base + 1] *= local_H[base + 1];
local_A[base + 1] *= local_A[base];
}
BarrierLoc
}
//--- Save result
X_out[offset] = local_X[idx];
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass for PScan: replays the combine steps in reverse order and
//--- accumulates gradients for A, X and H in local memory.
//--- NOTE(review): grad_H and grad_A receive the identical expression
//--- (grad_X * local_X[base]); for the forward rule
//--- X1 = (X1 + A1*X0) * H1 one would expect dH1 to involve the pre-gate
//--- value and dA1 to carry a factor H1 — confirm against the forward kernel.
//--- Also note local_X here holds the ORIGINAL inputs, not the forward
//--- intermediates, so the chain is only exact for the first reversed step.
__kernel void PScan_CalcHiddenGradient(__global const float* A,
__global float* grad_A,
__global const float* X,
__global float* grad_X,
__global const float* H,
__global float* grad_H,
__global const float* grad_X_out)
{
const size_t idx = get_local_id(0); // position within the sequence
const size_t dim = get_global_id(1); // feature dimension
const size_t L = get_local_size(0); // sequence length
const size_t D = get_global_size(1);
const int num_steps = (int)log2((float)L);
//--- local copies of the forward inputs and gradient accumulators
__local float local_A[1024];
__local float local_X[1024];
__local float local_H[1024];
__local float local_grad_X[1024];
__local float local_grad_A[1024];
__local float local_grad_H[1024];
//--- Load data to local memory
int offset = idx * D + dim;
local_A[idx] = A[offset];
local_X[idx] = X[offset];
local_H[idx] = H[offset];
local_grad_X[idx] = grad_X_out[offset];
local_grad_A[idx] = 0.0f;
local_grad_H[idx] = 0.0f;
BarrierLoc
//--- Reverse Scan (Backward): undo the forward combine steps last-to-first
for(int step = num_steps - 1; step >= 0; step--)
{
int halfT = L >> (step + 1);
if(idx < halfT)
{
int base = idx * 2;
// Compute gradients
float grad_next = local_grad_X[base + 1] * local_H[base + 1]; // d/d(pre-gate X)
local_grad_H[base + 1] = local_grad_X[base + 1] * local_X[base];
local_grad_A[base + 1] = local_grad_X[base + 1] * local_X[base];
local_grad_X[base] += local_A[base + 1] * grad_next;
}
BarrierLoc
}
//--- Save gradients
grad_A[offset] = local_grad_A[idx];
grad_X[offset] = local_grad_X[idx];
grad_H[offset] = local_grad_H[idx];
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Multiply every row of `matr` by the matching element of the
/// diagonal vector `diag` and apply the requested activation function.
/// \param diag       [in]  diagonal values, one per (row, var) pair
/// \param matr       [in]  dense matrix, rows*cols elements per variable
/// \param result     [out] activated product, same layout as `matr`
/// \param activation activation function id (see ActFunc_* defines)
__kernel void DiagMatMult(__global const float *diag,
                          __global const float *matr,
                          __global float *result,
                          int activation)
  {
   size_t row = get_global_id(0);
   size_t col = get_local_id(1);
   size_t var = get_global_id(2);
   size_t rows = get_global_size(0);
   size_t cols = get_local_size(1);
//--- the whole work-group shares one diagonal element: let the first
//--- work-item load it.
//--- Bug fix: the condition used to be `cols == 0`, which is never true
//--- (get_local_size() >= 1), so local_diag[0] was read uninitialized.
   __local float local_diag[1];
   if(col == 0)
      local_diag[0] = diag[row + var * rows];
   BarrierLoc
//--- element-wise product with the shared diagonal value
   int shift = (row + var * rows) * cols + col;
   float res = local_diag[0] * matr[shift];
//--- activate and store
   result[shift] = fActivation(res, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Backward pass for DiagMatMult: writes the gradient w.r.t. the
/// matrix (`grad_matr`) and reduces grad*input over the row into the
/// gradient of the diagonal element (`grad_diag`).
/// \param diag        [in]  diagonal values used on the forward pass
/// \param grad_diag   [out] gradient of the diagonal, one per (row, var)
/// \param matr        [in]  forward-pass matrix input
/// \param grad_matr   [out] gradient w.r.t. the matrix
/// \param grad_result [in]  gradient coming from the next layer
__kernel void DiagMatMultGrad(__global const float *diag,
                              __global float *grad_diag,
                              __global const float *matr,
                              __global float *grad_matr,
                              __global const float *grad_result)
  {
   size_t row = get_global_id(0);
   size_t col = get_local_id(1);
   size_t var = get_global_id(2);
   size_t rows = get_global_size(0);
   size_t cols = get_local_size(1);
   size_t vars = get_global_size(2);
//--- share the diagonal element across the work-group.
//--- Bug fix: the condition used to be `cols == 0`, which never holds
//--- (get_local_size() >= 1), so local_diag[0] was read uninitialized.
   __local float local_diag[LOCAL_ARRAY_SIZE];
   if(col == 0)
      local_diag[0] = diag[row + var * rows];
   BarrierLoc
//---
   int shift = (row + var * rows) * cols + col;
   float grad = grad_result[shift];
   float inp = matr[shift];
//--- gradient w.r.t. the matrix element (must be read before the scratch
//--- array is reused below; the barrier separates the two phases)
   grad_matr[shift] = IsNaNOrInf(local_diag[0] * grad, 0);
   BarrierLoc
//--- accumulate grad*input into the scratch array, LOCAL_ARRAY_SIZE
//--- columns at a time
   int loc = col % LOCAL_ARRAY_SIZE;
   for(int c = 0; c < cols; c += LOCAL_ARRAY_SIZE)
     {
      if(c <= col && (c + LOCAL_ARRAY_SIZE) > col)
        {
         if(c == 0)
            local_diag[loc] = IsNaNOrInf(grad * inp, 0);
         else
            local_diag[loc] += IsNaNOrInf(grad * inp, 0);
        }
      BarrierLoc
     }
//--- tree reduction of the scratch array.
//--- Bug fix: added the `col < count` guard (present in every other
//--- reduction in this file); without it several work-items could read and
//--- write the same slot in the same step -- a data race.
   int count = min(LOCAL_ARRAY_SIZE, (int)cols);
   int ls = count;
   do
     {
      count = (count + 1) / 2;
      if(col < count && (col + count) < ls)
        {
         local_diag[col] += local_diag[col + count];
         local_diag[col + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
//--- first work-item publishes the reduced diagonal gradient
   if(col == 0)
      grad_diag[row + var * rows] = IsNaNOrInf(local_diag[0], 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Noisy top-k gating (mixture-of-experts style). Each work-item
/// holds one gate logit; optional noise scaled by a SoftPlus-activated std
/// is added, then every logit counts how many logits in the group exceed
/// it. Gates ranked within the top k keep (logit - max) -- ready for a
/// numerically stable softmax downstream -- the rest are set to MIN_VALUE.
/// \param inputs [in]  per variable: `window` logits followed by `window` std pre-activations
/// \param noises [in]  pre-sampled noise, zero disables the noise path
/// \param gates  [out] masked, max-shifted logits
/// \param k      number of gates to keep open
__kernel void TopKgates(__global const float *inputs,
                        __global const float *noises,
                        __global float *gates,
                        const uint k)
  {
   size_t idx = get_local_id(0);            // gate index inside the window
   size_t var = get_global_id(1);           // independent variable (series)
   size_t window = get_local_size(0);       // number of gates
   size_t vars = get_global_size(1);
//--- inputs hold logits then stds back-to-back per variable
   const int shift_logit = var * 2 * window + idx;
   const int shift_std = shift_logit + window;
   const int shift_gate = var * window + idx;
//--- noisy logit: logit + noise * SoftPlus(std); noise == 0 skips the path
   float logit = IsNaNOrInf(inputs[shift_logit], MIN_VALUE);
   float noise = IsNaNOrInf(noises[shift_gate], 0);
   if(noise != 0)
     {
      noise *= fActivation(inputs[shift_std], 3);   // 3 = ActFunc_SoftPlus
      logit += IsNaNOrInf(noise, 0);
     }
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//---
   const uint ls = min((uint)window, (uint)LOCAL_ARRAY_SIZE);
   uint bigger = 0;                          // how many logits exceed mine
   float max_logit = logit;                  // group maximum (for stability)
//--- Top K: publish logits in chunks of `ls`, every work-item scans each chunk
   for(int i = 0; i < window; i += ls)
     {
      if(idx >= i && idx < (i + ls))
         temp[idx % ls] = logit;
      BarrierLoc
      // early-out once more than k logits are known to be bigger
      for(int i1 = 0; (i1 < min((int)ls, (int)(window - i)) && bigger <= k); i1++)
        {
         if(temp[i1] > logit)
            bigger++;
         if(temp[i1] > max_logit)
            max_logit = temp[i1];
        }
      BarrierLoc
     }
//--- keep the gate only if it ranks within the top k
   if(bigger <= k)
      gates[shift_gate] = logit - max_logit;
   else
      gates[shift_gate] = MIN_VALUE;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Backward pass for TopKgates: the gate gradient flows unchanged to
/// the logit input and, when noise was injected on the forward pass, also
/// to the SoftPlus-activated std input (scaled by the noise sample).
/// \param inputs      [in]  forward inputs: logits then std pre-activations
/// \param grad_inputs [out] gradients for logits and stds
/// \param noises      [in]  noise samples used on the forward pass
/// \param gates       [in]  forward gate outputs (unused here)
/// \param grad_gates  [in]  gradient coming from the next layer
__kernel void TopKgatesGrad(__global const float *inputs,
                            __global float *grad_inputs,
                            __global const float *noises,
                            __global const float *gates,
                            __global float *grad_gates)
  {
   const size_t pos = get_global_id(0);
   const size_t series = get_global_id(1);
   const size_t wnd = get_global_size(0);
   const size_t series_total = get_global_size(1);
//--- buffer offsets: per series the logits precede the std pre-activations
   const int off_logit = series * 2 * wnd + pos;
   const int off_std = off_logit + wnd;
   const int off_gate = series * wnd + pos;
//--- the logit receives the gate gradient as-is
   float g = IsNaNOrInf(grad_gates[off_gate], 0);
   grad_inputs[off_logit] = g;
//--- no noise on the forward pass -> the std gets no gradient
   float n = IsNaNOrInf(noises[off_gate], 0);
   if(n == 0)
     {
      grad_inputs[off_std] = 0;
      return;
     }
//--- otherwise back-propagate through the SoftPlus-activated std
   g *= n;
   grad_inputs[off_std] = Deactivation(g, fActivation(inputs[off_std], 3), 3);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Builds a similarity mask from pairwise distances between complex
/// spectra. For each (main, slave) pair the Euclidean distance between the
/// magnitude vectors is computed, normalized by the row maximum (found via
/// a work-group reduction) and inverted: mask = 1 - dist/max.
/// Identical elements (main == slave) get distance 0, hence mask 1.
/// \param buf_real  [in]  real parts, `dimension` values per element
/// \param buf_imag  [in]  imaginary parts, same layout
/// \param mask      [out] total x total mask matrix
/// \param dimension number of spectral components per element
__kernel void MaskByDistance(__global const float *buf_real,
                             __global const float *buf_imag,
                             __global float *mask,
                             const int dimension
                            )
  {
   const size_t main = get_global_id(0);
   const size_t slave = get_local_id(1);
   const int total = (int)get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   int ls = min((int)total, (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_main = main * dimension;
   const int shift_slave = slave * dimension;
   const int shift_mask = main * total + slave;
//--- calc distance: L2 distance between magnitude spectra
   float dist = 0;
   if(main != slave)
     {
      //---
      for(int d = 0; d < dimension; d++)
        {
         float delta = ComplexAbs((float2)(buf_real[shift_main + d], buf_imag[shift_main + d])) -
                       ComplexAbs((float2)(buf_real[shift_slave + d], buf_imag[shift_slave + d]));
         dist += delta * delta;
        }
      dist = sqrt(dist);
     }
//--- Look Max: fold the row's distances into `ls` scratch slots
//---
   for(int i = 0; i < total; i += ls)
     {
      if(i <= slave && (i + ls) > slave)
         Temp[slave % ls] = fmax((i == 0 ? 0 : Temp[slave % ls]), IsNaNOrInf(dist, 0));
      BarrierLoc
     }
//--- tree reduction to find the row maximum in Temp[0]
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(slave < count && (slave + count) < ls)
        {
         if(Temp[slave] < Temp[slave + count])
            Temp[slave] = Temp[slave + count];
         Temp[slave + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
//--- Normalize by the row maximum (skip if the whole row is zero)
   if(Temp[0] > 0)
      dist /= Temp[0];
//--- result: closest elements get mask near 1, the farthest gets 0
   mask[shift_mask] = 1 - IsNaNOrInf(dist, 1);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Masked grouped-query attention. One work-group covers all keys of
/// one (query, head) pair. The mask both scales the score exponent and
/// zeroes masked positions entirely; scores are normalized to sum to 1 and
/// the output is the score-weighted sum of values.
/// KV layout: per key unit, 2*heads_kv head-vectors of size `dimension`
/// (keys first, then values); query heads share KV heads via h % heads_kv.
/// NOTE(review): the mask is indexed as masks[q_id * kunits + k] -- i.e.
/// shared across heads (unlike MaskAttentionComplex, which indexes per
/// head); confirm the intended mask buffer shape.
__kernel void MaskAttention(__global const float *q, ///<[in] Matrix of Querys
                            __global const float *kv, ///<[in] Matrix of Keys
                            __global float *scores, ///<[out] Matrix of Scores
                            __global const float *masks, ///<[in] Mask Matrix
                            __global float *out, ///<[out] Matrix of attention
                            const int dimension, ///< Dimension of Key
                            const int heads_kv
                           )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_local_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;                 // KV head serving query head h
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_k = dimension * (2 * heads_kv * k + h_kv);
   const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k;
   const float mask = IsNaNOrInf(masks[q_id * kunits + k], 0);
   const uint ls = min((uint)kunits, (uint)LOCAL_ARRAY_SIZE);
   float koef = sqrt((float)dimension);           // scaled dot-product factor
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score: exp(q.k / koef * mask); mask == 0 kills the position outright
   float score = 0;
   if(mask != 0)
     {
      for(int d = 0; d < dimension; d++)
         score += IsNaNOrInf(q[shift_q + d] * kv[shift_k + d], 0);
      score = IsNaNOrInf(exp(score / koef * mask), 0);
     }
//--- sum of exp (work-group reduction over all keys)
   float sum = LocalSum(score, 1, temp);
//--- score normalized to a probability
   if(sum > 0)
      score /= sum;
   scores[shift_s] = score;
//--- out: score-weighted sum of values, one dimension at a time
   for(int d = 0; d < dimension; d++)
     {
      float val = LocalSum(kv[shift_v + d] * score, 1, temp);
      if(k == 0)
         out[shift_q + d] = val;
      BarrierLoc
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Backward pass for MaskAttention (grouped-query attention):
/// propagates the output gradient to queries, keys and values through the
/// softmax Jacobian sc*(delta_kv - sc).
/// Work distribution: one work-item per (query unit, dimension, head);
/// KV gradients are computed only by the first heads_kv head-slots and
/// strided over key units by qunits.
/// NOTE(review): in the Key-gradient section the score is read as
/// scores[shift_sc + k] with shift_sc = scr*kunits*heads; the head offset
/// hq*kunits (cf. shift_score above) appears to be missing -- the result is
/// only correct for hq == 0. Confirm the intended scores layout.
/// NOTE(review): the Key-gradient loop advances hq by 1 while the
/// Value-gradient loop advances by heads_kv -- verify which stride is
/// intended for grouped heads.
__kernel void MaskAttentionGradients(__global const float *q, __global float *q_g,
                                     __global const float *kv, __global float *kv_g,
                                     __global const float *scores,
                                     __global const float *gradient,
                                     const int kunits, const int heads_kv
                                    )
  {
//--- init
   const int q_id = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (q_id * heads + h) * kunits;
   const int shift_g = h * dimension + d;
   float koef = sqrt((float)dimension);     // same scaling as the forward pass
   if(koef < 1)
      koef = 1;
//--- Calculating Value's gradients: dV = sum over queries of score * dOut
   int step_score = kunits * heads;
   if(h < heads_kv)
     {
      //---
      for(int v = q_id; v < kunits; v += qunits)
        {
         float grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + v;
            for(int g = 0; g < qunits; g++)
               grad += gradient[shift_g + dimension * (hq - h + g * heads)] *
                       scores[shift_score + g * step_score];
           }
         int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d;
         kv_g[shift_v] = grad;
        }
     }
//--- Calculating Query's gradients: dQ = sum_k (softmax Jacobian * dOut * V) * K
   float grad = 0;
   float out_g = IsNaNOrInf(gradient[shift_g + q_id * dimension], 0);
   int shift_val = (heads_kv + h_kv) * dimension + d;
   int shift_key = h_kv * dimension + d;
//---
   for(int k = 0; (k < kunits && out_g != 0); k++)
     {
      float sc_g = 0;
      float sc = scores[shift_s + k];
      if(sc == 0)
         continue;
      for(int v = 0; v < kunits; v++)
         sc_g += scores[shift_s + v] * out_g * kv[shift_val + 2 * v * heads_kv * dimension] *
                 ((float)(k == v) - sc);
      grad += sc_g * kv[shift_key + 2 * k * heads_kv * dimension];
     }
   q_g[shift_q] = grad / koef;
//--- Calculating Key's gradients (first heads_kv head-slots only)
   if(h < heads_kv)
     {
      //---
      for(int k = q_id; k < kunits; k += qunits)
        {
         int shift_k = dimension * (2 * heads_kv * k + h_kv) + d;
         grad = 0;
         for(int hq = h; hq < heads; hq++)
           {
            int shift_score = hq * kunits + k;
            float val = kv[shift_k + heads_kv * dimension];
            for(int scr = 0; scr < qunits; scr++)
              {
               float sc_g = 0;
               int shift_sc = scr * kunits * heads;
               float sc = scores[shift_sc + k];
               if(sc == 0)
                  continue;
               for(int v = 0; v < kunits; v++)
                  sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] *
                          val * ((float)(k == v) - sc);
               grad += sc_g * q[dimension * (h + scr * heads) + d];
              }
           }
         kv_g[shift_k] = grad / koef;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Forward pass of a convolution layer with several parallel input
/// windows of different sizes. Each "step" consumes sum(windows_in) inputs
/// and produces window_out outputs per window (window_out*windows_total in
/// total); each window has its own (window_in + 1) weights per filter (the
/// last one is the bias).
/// Bug fixes vs. the previous revision:
///  - window selection was off by one (`< id` instead of `<= ...`) and
///    window_in was taken from the previous window; the owning window is
///    now derived directly as id / window_out;
///  - the input bound check compared an index that already included the
///    per-variable offset v*inputs against `inputs`, so every variable
///    after the first produced bias-only outputs.
/// \param matrix_w      [in]  weights, per window: window_out*(win+1) values
/// \param matrix_i      [in]  inputs, `inputs` values per variable
/// \param matrix_o      [out] outputs, `outputs` values per variable
/// \param windows_in    [in]  the size of each input window
/// \param inputs        number of inputs per variable
/// \param windows_total number of parallel windows
/// \param window_out    number of filters per window
/// \param activation    activation function id (see ActFunc_* defines)
__kernel void FeedForwardMultWinConv(__global const float *matrix_w,
                                     __global const float *matrix_i,
                                     __global float *matrix_o,
                                     __global const int *windows_in,
                                     const int inputs,
                                     const int windows_total,
                                     const int window_out,
                                     const int activation
                                    )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t outputs = get_global_size(0);
//--- position of this output inside one step and the window owning it
   const int id = i % (window_out * windows_total);
   const int w_id = id / window_out;
//--- total input stride of one step plus offsets of the owning window
   int step = 0;
   int shift_in = 0;
   int shift_weight = 0;
   int window_in = 0;
   for(int w = 0; w < windows_total; w++)
     {
      int win = windows_in[w];
      if(w < w_id)
        {
         shift_in += win;                        // inputs of preceding windows
         shift_weight += (win + 1) * window_out; // their weights incl. bias
        }
      if(w == w_id)
         window_in = win;                        // size of the owning window
      step += win;
     }
//--- input offset inside one variable's block and first weight of the filter
   int steps = (int)(i / (window_out * windows_total));
   shift_in += steps * step;
   shift_weight += (id % window_out) * (window_in + 1);
//--- start from the bias, then accumulate the in-bounds window inputs
   float sum = matrix_w[shift_weight + window_in];
   for(int w = 0; w < window_in; w++)
      if((shift_in + w) < inputs)
        {
         float inp = IsNaNOrInf(matrix_i[v * inputs + shift_in + w], 0.0f);
         if(inp == 0.0f)
            continue;
         sum += IsNaNOrInf(inp * matrix_w[shift_weight + w], 0.0f);
        }
//---
   matrix_o[v * outputs + i] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_gr
/// Kernel of the Convolution neuron to transfer gradient
/// to previous layer (#CNeuronConvOCL)
//+------------------------------------------------------------------+
/// \brief Gradient propagation to the previous layer for the multi-window
/// convolution (#CNeuronConvOCL). Each work-item handles one input element
/// of one variable: it locates the window that consumed the element and
/// sums the contributions of all window_out filters of that window.
/// Bug fix: the break condition used `>= id`, which attributed the first
/// element of every window to the preceding one (and produced an
/// out-of-range in-window index); it must be `> id`.
/// \param matrix_w      [in]  weights, per window: window_out*(win+1) values
/// \param matrix_i      [in]  forward-pass inputs (for Deactivation)
/// \param matrix_ig     [out] gradients w.r.t. the inputs
/// \param matrix_og     [in]  gradients from the next layer
/// \param windows_in    [in]  the size of each input window
/// \param outputs       number of outputs per variable
/// \param windows_total number of parallel windows
/// \param window_out    number of filters per window
/// \param activation    activation of the previous layer to undo
__kernel void CalcHiddenGradientMultWinConv(__global const float *matrix_w,
                                            __global const float *matrix_i,
                                            __global float *matrix_ig,
                                            __global const float *matrix_og,
                                            __global const int *windows_in,
                                            const int outputs,
                                            const int windows_total,
                                            const int window_out,
                                            const int activation
                                           )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t inputs = get_global_size(0);
//--- total input width of one convolution step (sum of all window sizes)
   int step = 0;
   for(int w = 0; w < windows_total; w++)
      step += windows_in[w];
//--- step index and position of this input within the step
   int steps = (int)(i / step);
   int id = i % step;
//--- locate the window owning this input element
   int window = 0;
   int before = 0;
   int window_in = 0;
   for(int w = 0; w < windows_total; w++)
     {
      window_in = windows_in[w];
      if((before + window_in) > id)   // id belongs to window w
         break;
      window = w + 1;
      before += window_in;
     }
//--- first weight touching this input in the owning window's filter block
   int shift_weight = (before + window) * window_out + id - before;
//--- first output of the owning window in the current step
   int shift_out = (steps * windows_total + window) * window_out + v * outputs;
   float sum = 0;
//--- accumulate the gradient over all filters of the window
   for(int w = 0; w < window_out; w++)
     {
      float grad = IsNaNOrInf(matrix_og[shift_out + w], 0.0f);
      if(grad == 0.0f)
         continue;
      sum += IsNaNOrInf(grad * matrix_w[shift_weight + w * (window_in + 1)], 0);
     }
//--- undo the previous layer's activation
   matrix_ig[v * inputs + i] = Deactivation(sum, matrix_i[v * inputs + i], activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating
/// Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Convolution
/// Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
/// \brief Adam weight update for the multi-window convolution
/// (#CNeuronConvOCL): each work-item owns one weight, accumulates its
/// gradient over all convolution steps and variables (via a work-group
/// reduction over the variable axis), then applies the Adam step.
/// NOTE(review): the update adds `l * mt` to the weight -- this library's
/// sign convention (gradients carry the step direction); confirm against
/// the other *Adam kernels in the file.
/// NOTE(review): when `bias` is true but sh_out >= outputs the code falls
/// through into the input-weight path instead of skipping -- looks like a
/// missed `continue`; verify.
__kernel void UpdateWeightsMultWinConvAdam(__global float *matrix_w,
                                           __global const float *matrix_og,
                                           __global const float *matrix_i,
                                           __global float *matrix_m,
                                           __global float *matrix_v,
                                           __global const int *windows_in,
                                           const int windows_total,
                                           const int window_out,
                                           const int inputs,
                                           const int outputs,
                                           const float l,
                                           const float b1,
                                           const float b2
                                          )
  {
   const size_t i = get_global_id(0); // weight shift
   const size_t v = get_local_id(1); // variable
   const size_t variables = get_local_size(1);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- outputs produced per convolution step
   int step_out = window_out * windows_total;
//--- locate the window this weight belongs to and the matching offsets
   int step_in = 0;
   int shift_in = 0;
   int shift_out = 0;
   int window = 0;
   int number_w = 0;
//---
   for(int w = 0; w < windows_total; w++)
     {
      int win = windows_in[w];
      if((step_in + w)*window_out <= i &&
         (step_in + win + w + 1)*window_out > i)
        {
         shift_in = step_in;                         // inputs before this window
         shift_out = (step_in + w + 1) * window_out; // next window's first output
         window = win;                               // size of the owning window
         number_w = w;                               // index of the owning window
        }
      step_in += win;                                // total inputs per step
     }
//--- a weight is the bias when it is the last of its filter's (window+1) slots
   bool bias = ((i - (shift_in + number_w) * window_out) % (window + 1) == window);
   int t = (i - (shift_in + number_w) * window_out) / (window + 1);  // filter index
   shift_out += t + v * outputs;
   shift_in += (i - (shift_in + number_w) * window_out) % (window + 1) + v * inputs;
//--- accumulate the weight's gradient over all convolution steps
   float grad = 0;
   int total = (inputs + step_in - 1) / step_in;     // number of steps
//---
   for(int t = 0; t < total; t++)
     {
      int sh_out = t * step_out + shift_out;
      if(bias && sh_out < outputs)
        {
         grad += IsNaNOrInf(matrix_og[sh_out], 0);   // bias: gradient only
         continue;
        }
      //---
      int sh_in = t * step_in + shift_in;
      if(sh_in >= inputs)
         break;
      float grad_out = IsNaNOrInf(matrix_og[sh_out], 0.0f);
      if(grad_out == 0.0f)
         continue;
      float inp = IsNaNOrInf(matrix_i[sh_in], 0.0f);
      if(inp == 0.0f)
         continue;
      grad += IsNaNOrInf(grad_out * inp, 0);
     }
//--- sum over the variable axis (work-group reduction)
   grad = LocalSum(grad, 1, temp);
//--- first work-item applies the Adam step (moments clamped for stability)
   if(v == 0)
     {
      float mt = IsNaNOrInf(clamp(b1 * matrix_m[i] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0);
      float vt = IsNaNOrInf(clamp(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f);
      float weight = matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0);
      matrix_w[i] = weight;
      matrix_m[i] = mt;
      matrix_v[i] = vt;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Masked grouped-query attention over complex-valued
/// queries/keys/values. Scores are |exp(q.k / koef)| * mask, normalized to
/// sum to 1 over the keys of the work-group; the output is the
/// score-weighted sum of the complex values.
/// Bug fix: the score exponent used the scalar `score` (still zero at that
/// point) instead of the accumulated complex dot product `score2`, so every
/// unmasked score collapsed to |exp(0)| * mask = mask regardless of q and k.
/// NOTE(review): the mask is indexed per head (masks[shift_s]) here, while
/// MaskAttention uses a per-query mask -- confirm the intended buffer shape.
__kernel void MaskAttentionComplex(__global const float2* __attribute__((aligned(8)))q, ///<[in] Matrix of Querys
                                   __global const float2* __attribute__((aligned(8)))kv, ///<[in] Matrix of Keys
                                   __global float *scores, ///<[out] Matrix of Scores
                                   __global const float *masks, ///<[in] Mask Matrix
                                   __global float2* __attribute__((aligned(8)))out, ///<[out] Matrix of attention
                                   const int dimension, ///< Dimension of Key
                                   const int heads_kv
                                  )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_local_size(1);
   const int heads = get_global_size(2);
//---
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_k = dimension * (2 * heads_kv * k + h_kv);
   const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k;
   const float mask = IsNaNOrInf(masks[shift_s], 0);
   const uint ls = min((uint)kunits, (uint)LOCAL_ARRAY_SIZE);
   float2 koef = (float2)(fmax((float)sqrt((float)dimension), (float)1), 0);
   __local float2 temp[LOCAL_ARRAY_SIZE];
//--- Score: complex dot product, then magnitude of the complex exponential
   float score = 0;
   float2 score2 = (float2)0;
   if(ComplexAbs(mask) >= 0.01f)
     {
      for(int d = 0; d < dimension; d++)
         score2 += IsNaNOrInf2(ComplexMul(q[shift_q + d], kv[shift_k + d]), (float2)0);
      // bug fix: use the accumulated complex dot product score2 (the scalar
      // `score` was passed before, which is still zero here)
      score = IsNaNOrInf(ComplexAbs(ComplexExp(ComplexDiv(score2, koef))) * mask, 0);
     }
//--- sum of exp: fold into `ls` scratch slots, then tree-reduce
//---
   for(int i = 0; i < kunits; i += ls)
     {
      if(k >= i && k < (i + ls))
         temp[k % ls].x = (i == 0 ? 0 : temp[k % ls].x) + score;
      BarrierLoc
     }
//---
   uint count = ls;
//---
   do
     {
      count = (count + 1) / 2;
      if(k < ls)
         temp[k].x += (k < count && (k + count) < kunits ? temp[k + count].x : 0);
      if(k + count < ls)
         temp[k + count].x = 0;
      BarrierLoc
     }
   while(count > 1);
//--- score normalized to a probability
   if(temp[0].x > 0)
      score = score / temp[0].x;
   scores[shift_s] = score;
//--- out: score-weighted sum of complex values, one dimension at a time
//---
   for(int d = 0; d < dimension; d++)
     {
      float2 val = (score > 0 ? ComplexMul(kv[shift_v + d], (float2)(score, 0)) : (float2)0);
      //---
      for(int i = 0; i < kunits; i += ls)
        {
         if(k >= i && k < (i + ls))
            temp[k % ls] = (i == 0 ? (float2)0 : temp[k % ls]) + val;
         BarrierLoc
        }
      //---
      uint count = ls;
      //---
      do
        {
         count = (count + 1) / 2;
         if(k < ls)
            temp[k] += (k < count && (k + count) < kunits ? temp[k + count] : (float2)0);
         if((k + count) < ls)
            temp[k + count] = (float2)0;
         BarrierLoc
        }
      while(count > 1);
      //---
      if(k == 0)
         out[shift_q + d] = temp[0];
      BarrierLoc
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Backward pass for MaskAttentionComplex: propagates the complex
/// output gradient to queries, keys, values and the mask through the
/// softmax Jacobian sc*(delta_kv - sc).
/// NOTE(review): mask_g divides by the mask value m without a zero guard;
/// IsNaNOrInf downstream maps inf/NaN to 0, but the intent for a zero mask
/// should be confirmed.
/// NOTE(review): in the Key-gradient section the score is read as
/// scores[shift_sc + k] with shift_sc = scr*kunits*heads; the head offset
/// hq*kunits (cf. shift_score) appears to be missing -- only correct for
/// hq == 0. The same pattern exists in MaskAttentionGradients; verify the
/// intended layout.
__kernel void MaskAttentionGradientsComplex(__global const float2* __attribute__((aligned(8)))q, __global float2* __attribute__((aligned(8)))q_g,
                                            __global const float2* __attribute__((aligned(8)))kv, __global float2* __attribute__((aligned(8)))kv_g,
                                            __global const float *scores,
                                            __global const float *mask, __global float *mask_g,
                                            __global const float2* __attribute__((aligned(8)))gradient,
                                            const int kunits, const int heads_kv
                                           )
  {
//--- init
   const int q_id = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (q_id * heads + h) * kunits;
   const int shift_g = h * dimension + d;
   float2 koef = (float2)(fmax(sqrt((float)dimension), (float)1), 0);
//--- Calculating Value's gradients: dV = sum over queries of score * dOut
   int step_score = kunits * heads;
   if(h < heads_kv)
     {
      //---
      for(int v = q_id; v < kunits; v += qunits)
        {
         float2 grad = (float2)0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + v;
            for(int g = 0; g < qunits; g++)
              {
               float sc = IsNaNOrInf(scores[shift_score + g * step_score], 0);
               if(sc > 0)
                  grad += ComplexMul(gradient[shift_g + dimension * (hq - h + g * heads)],
                                     (float2)(sc, 0));
              }
           }
         int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d;
         kv_g[shift_v] = grad;
        }
     }
//--- Calculating Query's gradients (and the mask gradient per key)
   float2 grad = 0;
   float2 out_g = IsNaNOrInf2(gradient[shift_g + q_id * dimension], (float2)0);
   int shift_val = (heads_kv + h_kv) * dimension + d;
   int shift_key = h_kv * dimension + d;
//---
   for(int k = 0; (k < kunits && ComplexAbs(out_g) != 0); k++)
     {
      float2 sc_g = 0;
      float2 sc = (float2)(scores[shift_s + k], 0);
      for(int v = 0; v < kunits; v++)
         sc_g += IsNaNOrInf2(ComplexMul(
                                ComplexMul((float2)(scores[shift_s + v], 0),
                                           out_g * kv[shift_val + 2 * v * heads_kv * dimension]),
                                ((float2)(k == v, 0) - sc)), (float2)0);
      // mask gradient: score contribution divided by the mask value
      float m = mask[shift_s + k];
      mask_g[shift_s + k] = IsNaNOrInf(sc.x / m * sc_g.x + sc.y / m * sc_g.y, 0);
      grad += IsNaNOrInf2(ComplexMul(sc_g, kv[shift_key + 2 * k * heads_kv * dimension]), (float2)0);
     }
   q_g[shift_q] = IsNaNOrInf2(ComplexDiv(grad, koef), (float2)0);
//--- Calculating Key's gradients (first heads_kv head-slots only)
   if(h < heads_kv)
     {
      //---
      for(int k = q_id; k < kunits; k += qunits)
        {
         int shift_k = dimension * (2 * heads_kv * k + h_kv) + d;
         grad = 0;
         for(int hq = h; hq < heads; hq++)
           {
            int shift_score = hq * kunits + k;
            float2 val = IsNaNOrInf2(kv[shift_k + heads_kv * dimension], (float2)0);
            for(int scr = 0; scr < qunits; scr++)
              {
               float2 sc_g = (float2)0;
               int shift_sc = scr * kunits * heads;
               float2 sc = (float2)(IsNaNOrInf(scores[shift_sc + k], 0), 0);
               if(ComplexAbs(sc) == 0)
                  continue;
               for(int v = 0; v < kunits; v++)
                  sc_g += IsNaNOrInf2(
                             ComplexMul(
                                ComplexMul((float2)(scores[shift_sc + v], 0),
                                           gradient[shift_g + scr * dimension]),
                                ComplexMul(val, ((float2)(k == v, 0) - sc))),
                             (float2)0);
               grad += IsNaNOrInf2(ComplexMul(sc_g, q[(h + scr * heads) * dimension + d]), (float2)0);
              }
           }
         kv_g[shift_k] = IsNaNOrInf2(ComplexDiv(grad, koef), (float2)0);
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Forward pass of the CS-LSTM cell: unpacks the four gate
/// pre-activations from one float4, updates the memory cell in place and
/// writes the hidden output.
/// Gate layout in the float4: s0 forget, s1 input, s2 candidate, s3 output.
/// The forget gate uses the non-standard form f = 1 - tanh(1 - 1/sigma(z)^2).
/// \param concatenated [in]     W*x + U*h + b for all four gates
/// \param memory       [in/out] cell state, updated in place
/// \param output       [out]    hidden state
__kernel void CSLSTM_FeedForward(__global const float4* __attribute__((aligned(16))) concatenated,
                                 __global float *memory,
                                 __global float *output)
  {
   const uint pos = (uint)get_global_id(0);
   const uint hidden = (uint)get_global_size(0);        // hidden size
   const uint series = (uint)get_global_id(1);
   const uint series_total = (uint)get_global_size(1);  // variables
//--- flattened element index
   const uint offset = pos + hidden * series;
   const float4 gates = concatenated[offset];
//--- forget gate: f = 1 - tanh(1 - 1/sigmoid(z)^2)
   const float sig_f = fActivation(gates.s0, ActFunc_SIGMOID);
   const float f_gate = 1 - fActivation(1 - 1 / (sig_f * sig_f), ActFunc_TANH);
//--- input gate, candidate and output gate
   const float i_gate = fActivation(fActivation(gates.s1, ActFunc_SIGMOID), ActFunc_TANH);
   const float cand = fActivation(gates.s2, ActFunc_TANH);
   const float o_gate = fActivation(gates.s3, ActFunc_SIGMOID);
//--- cell update and hidden output
   const float cell = IsNaNOrInf(memory[offset] * f_gate + i_gate * cand, 0);
   const float hidden_out = IsNaNOrInf(o_gate * fActivation(cell, ActFunc_TANH), 0);
//--- store the new state
   memory[offset] = cell;
   output[offset] = hidden_out;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Backward pass of the CS-LSTM cell: reconstructs the forward-pass
/// gates from the stored pre-activations, recovers the previous memory
/// state from the updated one, and emits the gradients of the four gate
/// pre-activations as one float4.
/// NOTE(review): prev_mem is reconstructed as (mem - ig*nc) / fg; a
/// near-zero forget gate amplifies rounding error here (IsNaNOrInf only
/// catches inf/NaN) -- confirm this matches the host-side expectation.
__kernel void CSLSTM_CalcHiddenGradient(__global const float4* __attribute__((aligned(16))) concatenated, // Input from forward pass (W*x + U*h + b)
                                        __global float4* __attribute__((aligned(16))) grad_concat, // Output: gradients w.r.t. gate pre-activations
                                        __global const float* memory, // Updated memory (after forward pass)
                                        __global const float* grad_output // dL/dOutput from the next layer
                                       )
  {
   uint id = get_global_id(0); // Index within sequences
   uint total = get_global_size(0); // Total size of sequences
   uint idv = get_global_id(1); // Index over independent univariate sequences (e.g., features or channels) in a multivariate time series
   uint shift = id + total * idv; // Flattened index
//---
   float4 concat = concatenated[shift]; // Pre-activation values for all 4 gates
// --- Forward reconstruction of gates (must mirror CSLSTM_FeedForward) ---
   float fg_s = fActivation(concat.s0, ActFunc_SIGMOID);
   float fg = 1.0f - fActivation(1.0f - 1.0f / (fg_s * fg_s), ActFunc_TANH); // Forget gate (ft)
   float ig_s = fActivation(concat.s1, ActFunc_SIGMOID);
   float ig = fActivation(ig_s, ActFunc_TANH); // Input gate (it)
   float nc = fActivation(concat.s2, ActFunc_TANH); // Candidate (ct~)
   float og = fActivation(concat.s3, ActFunc_SIGMOID); // Output gate (ot)
   float mem = memory[shift]; // New memory state (ct)
   float mem_t = fActivation(mem, ActFunc_TANH); // tanh(ct)
// --- Reconstruct previous memory state (t-1) from ct = fg*c(t-1) + ig*nc ---
   float prev_mem = IsNaNOrInf((mem - ig * nc) / fg, 0);
// --- Gradients computation ---
   float out_g = grad_output[shift];
   float og_g = Deactivation(out_g * mem_t, og, ActFunc_SIGMOID);
   float mem_g = Deactivation(out_g * og, mem_t, ActFunc_TANH);
   float nc_g = Deactivation(mem_g * ig, nc, ActFunc_TANH);
   float ig_g = Deactivation(Deactivation(mem_g * nc, ig, ActFunc_TANH), ig_s, ActFunc_SIGMOID);
// dL/dfg = dL/dct * mem_(t-1)
   float fg_g = mem_g * prev_mem;
// Derivative of the complex forget gate:
// f(z) = 1 - tanh(1 - 1 / sigma(z)^2)
   float fg_s_g = 2 / (fg_s * fg_s * fg_s) * Deactivation(-fg_g, fg, ActFunc_TANH);
   fg_g = Deactivation(fg_s_g, fg_s, ActFunc_SIGMOID);
// --- Write back gradients (order matches the forward gate layout) ---
   grad_concat[shift] = (float4)(fg_g, ig_g, nc_g, og_g);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief ProbSparse-style query importance: for each (query, head) pair
/// the dot products with a sampled subset of keys (via index_keys) are
/// reduced to max - mean, a measure of how "peaked" the query's attention
/// is. Result per (query, head) goes to querys_imp.
/// Bug fixes vs. the previous revision:
///  - the chunk-selection condition used `||` instead of `&&`, so nearly
///    every work-item wrote the scratch slot in every chunk (data race);
///  - BarrierLoc was placed inside the divergent `if`, which is undefined
///    behavior per the OpenCL spec (all work-items must reach a barrier);
///    it is now outside the conditional, matching the file's other kernels.
/// \param querys      [in]  query vectors, `dimension` per (query, head)
/// \param keys_values [in]  complex-packed keys (s0) and values (s1)
/// \param index_keys  [in]  sampled key indices per (slot, head)
/// \param querys_imp  [out] importance score per (query, head)
/// \param dimension   vector dimension
__kernel void ProbAttentionQeuryImp(__global const float* querys,
                                    __global const float2* __attribute__((aligned(8))) keys_values,
                                    __global const float* index_keys,
                                    __global float* querys_imp,
                                    const int dimension
                                   )
  {
   const size_t id_q = get_global_id(0);
   const size_t total_q = get_global_size(0);
   const size_t ind_k = get_local_id(1);
   const size_t total_ind = get_local_size(1);
   const size_t id_h = get_global_id(2);
   const size_t total_h = get_global_size(2);
//--- scratch: [.][0] accumulates the sum, [.][1] tracks the maximum
   __local float temp[LOCAL_ARRAY_SIZE][2];
   const int ls = min((int)total_ind, (int)LOCAL_ARRAY_SIZE);
//--- resolve the sampled key index for this slot
   const int shift_q = dimension * (id_q * total_h + id_h);
   const int id_k = index_keys[ind_k * total_h + id_h];
   const int shift_k = dimension * (id_k * total_h + id_h);
//--- dot product of the query with the sampled key (s0 holds keys)
   float sum = 0;
   for(int d = 0; d < dimension; d++)
      sum += IsNaNOrInf(querys[shift_q + d] * keys_values[shift_k + d].s0, 0);
//--- fold all slots into `ls` scratch entries, chunk by chunk
//--- (bug fix: `&&` instead of `||`, barrier hoisted out of the branch)
   int id_t = ind_k % ls;
   for(int i = 0; i < total_ind; i += ls)
     {
      if(i <= ind_k && (i + ls) > ind_k)
        {
         temp[id_t][0] = IsNaNOrInf((i == 0 ? 0 : temp[id_t][0]) + sum, 0);
         temp[id_t][1] = (i == 0 ? IsNaNOrInf(sum, MIN_VALUE) : fmax(temp[id_t][1], IsNaNOrInf(sum, MIN_VALUE)));
        }
      BarrierLoc
     }
//--- tree reduction: sum into [0][0], maximum into [0][1]
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(ind_k < count && (ind_k + count) < ls)
        {
         temp[ind_k][0] += temp[ind_k + count][0];
         temp[ind_k + count][0] = 0;
         temp[ind_k][1] = fmax(temp[ind_k + count][1], temp[ind_k][1]);
        }
      BarrierLoc
     }
   while(count > 1);
//--- importance = max - mean of the sampled scores
   if(ind_k == 0)
      querys_imp[id_q * total_h + id_h] = IsNaNOrInf(temp[0][1] - temp[0][0] / total_ind, MIN_VALUE);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Converts per-query importance scores into a top-k index list.
/// Each (query, head) work-item ranks its own score against the others of
/// the same head (ties broken in favor of the smaller query index, so ranks
/// are unique) and, if ranked inside the top-k, writes its own query id at
/// that rank slot -- race-free because ranks never collide.
/// \param importance [in]  importance per (query, head)
/// \param indexes    [out] top_k query ids per head
/// \param top_k      number of queries to keep
__kernel void TopKImportanceToIndex(__global const float* importance,
                                    __global float* indexes,
                                    const int top_k
                                   )
  {
   const size_t query = get_global_id(0);
   const size_t queries = get_global_size(0);
   const size_t head = get_global_id(1);
   const size_t heads = get_global_size(1);
//--- my own importance score
   const float my_imp = importance[query * heads + head];
//--- rank = number of competitors ahead of me
   int rank = 0;
   for(int rival = 0; rival < queries; rival++)
     {
      if(rival == query)
         continue;
      float rival_imp = importance[rival * heads + head];
      // strictly larger always wins; equal scores win only with a smaller id
      if(rival_imp > my_imp || (rival < query && rival_imp >= my_imp))
         rank++;
      if(rank >= top_k)
         break;             // already out of the top-k, stop counting
     }
//--- inside the top-k -> publish my query id at my rank slot
   if(rank < top_k)
      indexes[rank * heads + head] = (float)query;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Sparse (index-driven) attention: each selected query -- resolved
/// indirectly through `indexes` -- attends over all keys of its head.
/// Softmax-normalized scores are stored and the score-weighted sum of
/// values is written to `out`. KV packs keys in .s0 and values in .s1.
/// Bug fix: the output used the scratch slot temp[0] instead of the value
/// returned by LocalSum; now stores `val`, consistent with MaskAttention.
__kernel void QIndexAttention(__global const float *q, ///<[in] Matrix of Querys
                              __global const float2* kv, ///<[in] Matrix of Keys
                              __global float *scores, ///<[out] Matrix of Scores
                              __global const float *indexes, ///<[in] Querys Indexes
                              __global float *out, ///<[out] Matrix of attention
                              const int dimension, ///< Dimension of Key
                              const int heads_kv
                             )
  {
//--- init
   const int ind_q = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int total_q = get_global_size(0);
   const int total_k = get_local_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
//--- resolve the actual query id through the index buffer
   const int q_id = (int)(indexes[ind_q * heads + h] + 0.001f);
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_kv = dimension * (heads_kv * k + h_kv);
   const int shift_s = total_k * (ind_q * heads + h) + k;
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score: dot product with the key; a negative index marks an unused
//--- slot, which is pushed to MIN_VALUE so softmax zeroes it out
   float score = 0;
   if(q_id >= 0)
     {
      //---
      for(int d = 0; d < dimension; d++)
         score += IsNaNOrInf(q[shift_q + d] * kv[shift_kv + d].s0, 0);
     }
   else
      score = MIN_VALUE;
//--- numerically stable softmax over the keys of the work-group
   score = IsNaNOrInf(exp(score - LocalMax(score, 1, temp)), 0);
   score = IsNaNOrInf(score / LocalSum(score, 1, temp), 0);
   scores[shift_s] = score;
   BarrierLoc
//--- out: score-weighted sum of values (bug fix: store the LocalSum
//--- return value `val` rather than re-reading scratch slot temp[0])
   for(int d = 0; d < dimension; d++)
     {
      float val = LocalSum(kv[shift_kv + d].s1 * score, 1, temp);
      if(k == 0)
         out[dimension * (ind_q * heads + h) + d] = val;
      BarrierLoc
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// \brief Backward pass for QIndexAttention: propagates the output gradient
/// through the softmax Jacobian sc*(delta_kv - sc) back to the (indirectly
/// indexed) queries, keys (kv.s0) and values (kv.s1).
/// NOTE(review): in the Key-gradient section the score is read as
/// scores[shift_sc + k] with shift_sc = scr*kunits*heads; the head offset
/// hq*kunits (cf. shift_score) appears to be missing -- only correct for
/// hq == 0. The same pattern exists in MaskAttentionGradients; verify.
__kernel void QIndexAttentionGradients(__global const float* q,
                                       __global float* q_g,
                                       __global const float2* kv,
                                       __global float2* kv_g,
                                       __global const float* indexes,
                                       __global const float* scores,
                                       __global const float* gradient,
                                       const int kunits, const int heads_kv
                                      )
  {
//--- init
   const int ind_q = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
//--- the actual query id is resolved through the index buffer
   const int q_id = (int)(indexes[ind_q * heads + h] + 0.001f);
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (ind_q * heads + h) * kunits;
   const int shift_g = h * dimension + d;
//--- Calculating Value's gradients: dV = sum over queries of score * dOut
   int step_score = kunits * heads;
   if(h < heads_kv)
     {
      //---
      for(int v = ind_q; v < kunits; v += qunits)
        {
         float grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + v;
            for(int g = 0; g < qunits; g++)
               grad += IsNaNOrInf(gradient[shift_g + dimension * (hq - h + g * heads)], 0) *
                       scores[shift_score + g * step_score];
           }
         int shift_v = dimension * (heads_kv * v + h) + d;
         kv_g[shift_v].s1 = IsNaNOrInf(grad, 0);   // .s1 holds value gradients
        }
     }
//--- Calculating Query's gradients via the softmax Jacobian
   float grad = 0;
   float out_g = IsNaNOrInf(gradient[shift_g + ind_q * dimension], 0);
   int shift_kv = h_kv * dimension + d;
//---
   for(int k = 0; (k < kunits && out_g != 0); k++)
     {
      float sc_g = 0;
      float sc = scores[shift_s + k];
      if(sc == 0)
         continue;
      for(int v = 0; v < kunits; v++)
         sc_g += scores[shift_s + v] * out_g * kv[shift_kv + v * heads_kv * dimension].s1 *
                 ((float)(k == v) - sc);
      grad += sc_g * kv[shift_kv + k * heads_kv * dimension].s0;
     }
   q_g[shift_q] = grad;
//--- Calculating Key's gradients (first heads_kv head-slots only)
   if(h < heads_kv)
     {
      //---
      for(int k = ind_q; k < kunits; k += qunits)
        {
         int shift_k = dimension * (heads_kv * k + h_kv) + d;
         grad = 0;
         for(int hq = h; hq < heads; hq++)
           {
            int shift_score = hq * kunits + k;
            float val = kv[shift_k + heads_kv * dimension].s1;
            for(int scr = 0; scr < qunits; scr++)
              {
               float sc_g = 0;
               int shift_sc = scr * kunits * heads;
               float sc = scores[shift_sc + k];
               if(sc == 0)
                  continue;
               for(int v = 0; v < kunits; v++)
                  sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] *
                          val * ((float)(k == v) - sc);
               // the query is resolved through the same index buffer
               grad += IsNaNOrInf(sc_g * q[(hq + (int)(indexes[scr * heads + hq] + 0.001f) * heads) * dimension + d], 0);
              }
           }
         kv_g[shift_k].s0 = IsNaNOrInf(grad, 0);   // .s0 holds key gradients
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Adds a time-based sinusoidal positional code to every stored pair.
/// One work-item per (series id, frequency, period) element:
/// phase = PI * (time[id] / period[p]) * 2^(freq + 1); sin(phase) is
/// added to the first component and cos(phase) to the second.
__kernel void TSPositonEncoder(__global const float2* __attribute__((aligned(8))) data,
                               __global const float* time,
                               __global float2* __attribute__((aligned(8))) output,
                               __global const float* period
                              )
  {
   const int id = get_global_id(0);
   const int freq = get_global_id(1);
   const int p = get_global_id(2);
   const int freqs = get_global_size(1);
   const int periods = get_global_size(2);
//--- flat offset of the element handled by this work-item
   const int pos = (id * freqs + freq) * periods + p;
//--- oscillator phase for this frequency/period pair
   const float scaled_t = time[id] / period[p];
   const float phase = M_PI_F * scaled_t * pow(2.0f, freq + 1);
//--- add the (sin, cos) code onto the stored pair
   float2 inp = data[pos];
   inp.s0 += sin(phase);
   inp.s1 += cos(phase);
   output[pos] = inp;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Multi-window 1D convolution with zero padding: forward pass.
/// One work-item per (output position, window, variable); each of the
/// window_out filters owns `win` taps plus a bias, and the receptive
/// field is centred on id * step (out-of-range taps contribute zero).
__kernel void FeedForwardMultWinConvWPad(__global const float *matrix_w,
                                         __global const float *matrix_i,
                                         __global float *matrix_o,
                                         __global const int *windows_in,
                                         const int inputs,
                                         const int step,
                                         const int window_out,
                                         const int activation
                                        )
  {
   const size_t id = get_global_id(0);
   const size_t id_w = get_global_id(1);
   const size_t v = get_global_id(2);
   const size_t outputs = get_global_size(0);
//--- receptive field of the selected window, centred on id * step
   const int win = windows_in[id_w];
   const int start = (int)id * step - win / 2;
   const int base_in = v * inputs;
//--- weights of earlier windows precede ours inside matrix_w
   int w_off = 0;
   for(int k = 0; k < id_w; k++)
      w_off += (windows_in[k] + 1) * window_out;
//--- one filter at a time; advance past its win taps + bias afterwards
   for(int f = 0; f < window_out; f++, w_off += win + 1)
     {
      float acc = matrix_w[w_off + win];     // bias term
      for(int k = 0; k < win; k++)
        {
         int src = start + k;
         if(src < 0 || src >= inputs)
            continue;                        // zero padding outside the series
         acc += IsNaNOrInf(matrix_i[base_in + src] * matrix_w[w_off + k], 0);
        }
      matrix_o[(v * outputs + id) * window_out + f] = fActivation(acc, activation);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Padded multi-window convolution: input-gradient pass.
/// Each (input element, local filter slice, variable) work-item sums the
/// contribution of every filter covering that input; the partial sums are
/// reduced across the local dimension with LocalSum.
__kernel void CalcHiddenGradientMultWinConvWPad(__global const float *matrix_w,
                                                __global const float *matrix_i,
                                                __global float *matrix_ig,
                                                __global const float *matrix_og,
                                                __global const int *windows_in,
                                                const int outputs,
                                                const int step,
                                                const int window_out,
                                                const int filters,
                                                const int activation
                                               )
  {
   const size_t id_x = get_global_id(0);      // input element
   const size_t loc = get_local_id(1);        // slice of filters handled locally
   const size_t v = get_global_id(2);         // variable (series)
   const size_t inputs = get_global_size(0);
   const size_t size_loc = get_local_size(1);
   const size_t windows_total = filters / window_out;
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- accumulate the gradient flowing into input id_x from every filter
   float grad = 0;
   for(int id_loc = loc; id_loc < filters; id_loc += size_loc)
     {
      const size_t id_win = id_loc / window_out;   // window index
      const size_t id_f = id_loc % window_out;     // filter inside the window
      int window_in = windows_in[id_win];
      int shift_weight = id_f * (window_in + 1);
      for(int w = 0; w < id_win; w++)
         shift_weight += (windows_in[w] + 1) * window_out;
//--- first output that may cover id_x; bug fix: keep the arithmetic signed —
//--- with size_t operands the subtraction underflowed whenever id_x < window_in
      int shift_out = max(((int)id_x - window_in) / step, 0);
//--- bug fix: centre offset must mirror the forward pass (window_in / 2 there);
//--- (window_in + 1) / 2 misaligned gradients for odd window sizes
      int mid_win = window_in / 2;
      for(int out = shift_out; out < outputs; out++)
        {
         int shift_in = out * step - mid_win;
         if(shift_in > id_x)
            break;                        // later outputs start past id_x
         int shift_w = id_x - shift_in;
         if(shift_w >= window_in)
            continue;                     // this output's window ends before id_x
         int shift_g = ((v * outputs + out) * windows_total + id_win) * window_out + id_f;
         grad += IsNaNOrInf(matrix_w[shift_w + shift_weight] * matrix_og[shift_g], 0);
        }
     }
//--- reduce the partial sums across the local dimension
   grad = LocalSum(grad, 1, temp);
//---
   if(loc == 0)
      matrix_ig[v * inputs + id_x] = Deactivation(grad, matrix_i[v * inputs + id_x], activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Padded multi-window convolution: Adam weight update.
/// One work-item per (weight index, variable); per-variable gradients are
/// summed in local memory, then the first work-item applies Adam.
__kernel void UpdateWeightsMultWinConvAdamWPad(__global float *matrix_w,
                                               __global const float *matrix_og,
                                               __global const float *matrix_i,
                                               __global float *matrix_m,
                                               __global float *matrix_v,
                                               __global const int *windows_in,
                                               const int windows_total,
                                               const int window_out,
                                               const int inputs,
                                               const int step,
                                               const int outputs,
                                               const float l,
                                               const float b1,
                                               const float b2
                                              )
  {
   const size_t i = get_global_id(0);  // weight shift
   const size_t v = get_local_id(1);   // variable
   const size_t variables = get_local_size(1);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const uint ls = min((uint)variables, (uint)LOCAL_ARRAY_SIZE);
//---
   int step_out = window_out * windows_total;
//--- locate the window this weight belongs to; bug fix: stop at the first
//--- match — without the break, shift_before stops advancing after a match,
//--- so any later window also satisfying the test overwrote window/number_w
   int shift_before = 0;
   int window = 0;
   int number_w = 0;
   for(int w = 0; w < windows_total; w++)
     {
      int win = windows_in[w];
      if(shift_before <= i &&
         (win + 1)*window_out > (i - shift_before))
        {
         window = win;
         number_w = w;
         break;
        }
      shift_before += (win + 1) * window_out;
     }
//--- decode weight position: input tap (== window means bias), filter index
   int shift_in = (i - shift_before) % (window + 1);
   int shift_in_var = v * inputs;
   bool bias = (shift_in == window);
//--- bug fix: centre offset must mirror the forward pass (window / 2 there);
//--- (window + 1) / 2 misaligned gradients for odd window sizes
   int mid_win = window / 2;
   int id_f = (i - shift_before) / (window + 1);
   int shift_out = number_w * window_out + id_f;
   int shift_out_var = v * outputs * step_out;
//--- gradient of this weight over every output position
   float grad = 0;
   if(!bias)
     {
      for(int out = 0; out < outputs; out++)
        {
         int in = out * step - mid_win + shift_in;
         if(in >= inputs)
            break;                        // all later taps fall past the series
         if(in < 0)
            continue;                     // zero-padded region contributes nothing
         //---
         grad += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out * step_out] * matrix_i[shift_in_var + in], 0);
        }
     }
   else
     {
      for(int out = 0; out < outputs; out++)
         grad += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out * step_out], 0);
     }
//--- stage partial gradients of all variables into the local array
   for(int s = 0; s < (int)variables; s += ls)
     {
      if(v >= s && v < (s + ls))
         temp[v % ls] = (s == 0 ? 0 : temp[v % ls]) + grad;
      BarrierLoc
     }
//--- tree reduction of the local array
   uint count = ls;
   do
     {
      count = (count + 1) / 2;
      if(v < count && (v + count) < ls)
        {
         temp[v] += temp[v + count];
         temp[v + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
//--- Adam moments and weight update, performed by the first work-item only
   if(v == 0)
     {
      grad = temp[0];
      float mt = IsNaNOrInf(clamp(b1 * matrix_m[i] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0);
      float vt = IsNaNOrInf(clamp(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f);
      float weight = matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0);
      matrix_w[i] = weight;
      matrix_m[i] = mt;
      matrix_v[i] = vt;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Concatenates each value with its forward difference.
/// Output row layout: [values of all variables | differences of all variables];
/// the difference looks `step` positions ahead and is zero at the tail.
__kernel void ConcatDiff(__global const float* data,
                         __global float* output,
                         const int step)
  {
   const size_t i = get_global_id(0);
   const size_t v = get_local_id(1);
   const size_t inputs = get_local_size(0);
   const size_t variables = get_local_size(1);
//--- current value of variable v at position i
   const int base = i * variables;
   const float value = data[base + v];
//--- forward difference to the element `step` positions ahead (0 at the tail)
   float delta = 0;
   if(step > 0 && (i + step) < inputs)
      delta = IsNaNOrInf(value - data[base + step * variables + v], 0);
//--- write [value | difference] into the doubled-width output row
   output[2 * base + v] = value;
   output[2 * base + variables + v] = delta;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Masked multi-window convolution: forward pass.
/// One work-item per (unit, filter, variable). Every window's contribution
/// (taps + bias) is scaled by its mask gate; gates below FLT_EPSILON are
/// skipped entirely.
__kernel void FeedForwardMaskMultWinConv(__global const float *matrix_w,
                                         __global const float *matrix_i,
                                         __global const float *masks,
                                         __global float *matrix_o,
                                         const int inputs,
                                         const int window_in,
                                         const int windows_total,
                                         const int activation
                                        )
  {
   const size_t u = get_global_id(0);
   const size_t w = get_global_id(1);
   const size_t v = get_global_id(2);
   const size_t units = get_global_size(0);
   const size_t window_out = get_global_size(1);
   const size_t variables = get_global_size(2);
//--- offsets of this unit's inputs, mask row and weight row
   const int in_base = u * window_in * windows_total;
   const int in_var = v * units * window_in * windows_total;
   const int out_pos = (u + v * units) * window_out + w;
   const int mask_base = (u + v * units) * windows_total;
   const int w_base = (v * window_out * windows_total + w) * (window_in + 1);
   const int w_step = window_out * (window_in + 1);
//--- accumulate the masked contribution of every window
   float acc = 0;
   for(int win = 0; win < windows_total; win++)
     {
      const float gate = IsNaNOrInf(masks[mask_base + win], 0);
      if(gate < FLT_EPSILON)
         continue;                           // window switched off by its mask
      const int in_pos = in_base + win * window_in;
      const int w_pos = w_base + win * w_step;
      for(int k = 0; k < window_in; k++)
         if((in_pos + k) < (inputs / variables))
            acc += IsNaNOrInf(matrix_i[in_var + in_pos + k], 0) *
                   matrix_w[w_pos + k] * gate;
      acc += matrix_w[w_pos + window_in] * gate;  // masked bias
     }
//---
   matrix_o[out_pos] = fActivation(acc, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Masked multi-window convolution: gradients for inputs and for the masks.
/// Each work-item serves one (unit, window, variable) triple: it writes the
/// input gradients of its window slice and the gradient of its mask gate.
__kernel void CalcHiddenGradientMaskMultWinConv(__global const float *matrix_w,
                                                __global const float *matrix_i,
                                                __global float *matrix_ig,
                                                __global const float *matrix_og,
                                                __global const float *masks,
                                                __global float *masks_g,
                                                const int outputs,
                                                const int window_in,
                                                const int window_out,
                                                const int activation
                                               )
  {
   const size_t u = get_global_id(0);       // unit
   const size_t w_in = get_global_id(1);    // window index
   const size_t v = get_global_id(2);       // variable
   const size_t units = get_global_size(0);
   const size_t windows_total = get_global_size(1);
   const size_t variables = get_global_size(2);
//--- offsets of this triple's input slice, outputs, mask gate and weights
   const int shift_in = (u + v * units) * window_in * windows_total + w_in * window_in;
   const int shift_out = u * window_out;
   const int shift_out_var = v * units * window_out;
   const int shift_mask = (u + v * units) * windows_total + w_in;
   const int shift_weight = (v * window_out * windows_total + w_in * window_out) * (window_in + 1);
//--- input gradients: zero when the mask gate is (near) closed
   const float m = IsNaNOrInf(masks[shift_mask], 0);
   for(int i = 0; i < window_in; i++)
     {
      float sum = 0;
      if(m >= FLT_EPSILON)
        {
         for(int out = 0; out < window_out; out++)
           {
            if((shift_out + out) >= (outputs / variables))
               continue;
            sum += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out] *
                              matrix_w[shift_weight + out * (window_in + 1) + i] *
                              m, 0);
           }
        }
      matrix_ig[shift_in + i] = Deactivation(sum, matrix_i[shift_in + i], activation);
     }
//--- mask gradient: d(out)/d(mask) is the pre-mask response (bias + taps),
//--- computed regardless of the current gate state
//--- NOTE(review): unlike the loop above, this one does not guard
//--- (shift_out + out) against outputs / variables — confirm outputs is
//--- always an exact multiple of units * window_out * variables
   float sum = 0;
   for(int out = 0; out < window_out; out++)
     {
      int shift_weight_loc = out * (window_in + 1) + shift_weight;
      float temp = matrix_w[shift_weight_loc + window_in];
      for(int i = 0; i < window_in; i++)
         temp += IsNaNOrInf(matrix_i[shift_in + i], 0) * matrix_w[shift_weight_loc + i];
      sum += IsNaNOrInf(temp * matrix_og[shift_out_var + shift_out + out], 0);
     }
   masks_g[shift_mask] = IsNaNOrInf(sum, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Masked multi-window convolution: Adam weight update.
/// One work-item per (weight slot, filter, variable); the gradient is summed
/// over all units whose mask gate is open, then Adam is applied.
__kernel void UpdateWeightsMaskMultWinConvAdam(__global float *matrix_w,
                                               __global const float *matrix_og,
                                               __global const float *matrix_i,
                                               __global const float *masks,
                                               __global float *matrix_m,
                                               __global float *matrix_v,
                                               const int windows_total,
                                               const int inputs,
                                               const int outputs,
                                               const float l,
                                               const float b1,
                                               const float b2
                                              )
  {
   const size_t id_in = get_global_id(0);  // input shift
   const size_t id_out = get_global_id(1); // filter shift
   const size_t id_v = get_global_id(2);   // variable
   const size_t window_in = get_global_size(0) / windows_total - 1;
   const size_t window_out = get_global_size(1);
   const size_t variables = get_global_size(2);
//--- decode the weight's window, input tap and bias flag
   const int w_id = id_in / (window_in + 1);   // window index
   const int shift_in = id_in - w_id;          // input offset (bias slots removed)
   const int step_in = window_in * windows_total;
   const int units = outputs / window_out;
   const int shift_in_var = id_v * inputs;
   const int shift_out_var = id_v * outputs;
   const int shift_mask_var = id_v * units * windows_total;
   const int shift_weight = ((id_v * windows_total + w_id) * window_out + id_out) *
                            (window_in + 1) + id_in % (window_in + 1);
   const bool bias = (id_in % (window_in + 1) == window_in);
//--- accumulate the gradient over all units with an open mask gate.
//--- bug fix: the loop previously tested the loop-invariant
//--- `shift_in < inputs` and `continue`d on it — an inverted check on the
//--- wrong variable that skipped nearly every iteration; the intended guard
//--- is the per-unit tap position against the input length
   float grad = 0;
   for(int u = 0; u < units; u++)
     {
      const int shift_in_loc = shift_in + u * step_in;
      if(!bias && shift_in_loc >= inputs)
         continue;                       // input tap outside the series
      float m = IsNaNOrInf(masks[shift_mask_var + u * windows_total + w_id], 0);
      if(m < FLT_EPSILON)
         continue;                       // window masked out for this unit
      float inp = (bias ? 1 : IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc], 0));
      grad += IsNaNOrInf(inp * m * matrix_og[shift_out_var + u * window_out + id_out], 0);
     }
//--- Adam moments and weight update
   float mt = IsNaNOrInf(clamp(b1 * matrix_m[shift_weight] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0);
   float vt = IsNaNOrInf(clamp(b2 * matrix_v[shift_weight] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f);
   float weight = matrix_w[shift_weight] + IsNaNOrInf(l * mt / sqrt(vt), 0);
   matrix_w[shift_weight] = weight;
   matrix_m[shift_weight] = mt;
   matrix_v[shift_weight] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Finds the dominant spectral bin of each sequence's FFT.
/// NOTE(review): bins are scanned from i = 1 (DC term skipped) and the
/// stored index is i + 1; if no bin beats zero energy the result stays 0 —
/// confirm consumers that divide by this value handle both conventions.
__kernel void MainFreq(__global const float* freq_r,
                       __global const float* freq_im,
                       __global float *main_freq,
                       int dimension
                      )
  {
   if(dimension <= 0)
      return;
//---
   size_t n = get_global_id(0);
   const int shift = n * dimension;        // start of this sequence's spectrum
//---
   float max_f = 0;                        // best energy seen so far
   float max_id = 0;                       // bin reported for the best energy
   float energy;
//--- scan bins 1..dimension-1 for the largest complex magnitude
   for(int i = 1; i < dimension; i++)
     {
      float2 freq = (float2)(freq_r[shift + i], freq_im[shift + i]);
      energy = ComplexAbs(freq);
      if(max_f < energy)
        {
         max_f = energy;
         max_id = i + 1;                   // stored offset by 1 past the bin index
        }
     }
   main_freq[n] = max_id;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Adaptive convolution: forward pass.
/// The receptive window of each variable is derived from its dominant
/// frequency (main_freq), rounded up to cover at least one step and clipped
/// to the trained kernel size window_in. The last unit is right-aligned so
/// the tail of the series is always covered.
__kernel void FeedForwardAdaptConv(__global const float *matrix_w,
                                   __global const float *matrix_i,
                                   __global float *matrix_o,
                                   __global const float *main_freq,
                                   const int inputs,
                                   const int window_in,
                                   const int activation
                                  )
  {
   const size_t u = get_global_id(0);      // unit (output position)
   const size_t f = get_global_id(1);      // filter
   const size_t v = get_global_id(2);      // variable
   const size_t units = get_global_size(0);
   const size_t filters = get_global_size(1);
   const size_t variables = get_global_size(2);
//--- adaptive window: roughly one dominant period of the variable
   const int freq = main_freq[v];
   int window = (inputs / variables + freq - 1) / freq;
   const int step = (int)(inputs / variables + units + 1) / (units + 2);
   if(window < step)
      window = (int)((step + window - 1) / window) * window;   // round up to a multiple covering `step`
   if(window > window_in)
      window = window_in;
//--- last unit is anchored to the end of the series
   const int shift_in = (u < (units - 1) ? u * step : inputs / variables - window);
   const int shift_in_var = v * inputs / variables;
   const int shift_out = (u + v * units) * filters + f;
   const int shift_weight = (v * filters + f) * (window_in + 1);
//--- bias + windowed dot product (only `window` of the window_in taps used)
   float sum = matrix_w[shift_weight + window_in];
   for(int i = 0; i < window; i++)
      if((shift_in + i) < (inputs / variables))
         sum += IsNaNOrInf(matrix_i[shift_in_var + shift_in + i], 0) *
                matrix_w[shift_weight + i];
//---
   matrix_o[shift_out] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Adaptive convolution: input-gradient pass.
/// One work-item per (input element, variable); sums the gradient of every
/// unit whose adaptive window covers the element, including the
/// right-aligned last unit.
__kernel void CalcHiddenGradientAdaptConv(__global const float *matrix_w,
                                          __global const float *matrix_i,
                                          __global float *matrix_ig,
                                          __global const float *matrix_og,
                                          __global const float *main_freq,
                                          const int outputs,
                                          const int window_in,
                                          const int window_out,
                                          const int activation
                                         )
  {
   const size_t inp = get_global_id(0);        // element within one variable
   const size_t v = get_global_id(1);          // variable
   const size_t inputs = get_global_size(0);   // inputs PER VARIABLE here
   const size_t variables = get_global_size(1);
//--- recompute the adaptive window exactly as the forward pass does.
//--- bug fix: `inputs` is already the per-variable count in this kernel
//--- (see shift_in, step and the tail test below), so dividing it by
//--- `variables` again shrank the window and desynchronized it from the
//--- forward pass
   const int units = outputs / (window_out * variables);
   const int freq = main_freq[v];
   int window = (inputs + freq - 1) / freq;
   const int step = (int)(inputs + units + 1) / (units + 2);
   if(window < step)
      window = (int)((step + window - 1) / window) * window;
   if(window > window_in)
      window = window_in;
//---
   const int shift_in = v * inputs + inp;
   int u = inp / step;
   int shift_out_var = v * (outputs / variables);
   int shift_weight_var = (v * window_out) * (window_in + 1);
//--- accumulate over regular (left-aligned) units covering `inp`
//--- NOTE(review): u starts at inp / step, so when window > step earlier
//--- overlapping units are not visited — confirm against forward coverage
   float sum = 0;
   while(u * step <= inp && u < (units - 1))
     {
      int pos = inp - u * step;
      if(pos >= window)
        {
         u++;
         continue;
        }
      int shift_out = u * window_out;
      int shift_weight = pos + shift_weight_var;
      for(int out = 0; out < window_out; out++)
        {
         if((shift_out + out) >= (outputs / variables))
            continue;
         sum += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out] *
                           matrix_w[shift_weight + out * (window_in + 1)], 0);
        }
      u++;
     }
//--- the last unit is right-aligned to the end of the series
   if(inp >= (inputs - window))
     {
      int pos = inp + window - inputs;
      int shift_out = (units - 1) * window_out;
      int shift_weight = pos + shift_weight_var;
      for(int out = 0; out < window_out; out++)
        {
         if((shift_out + out) >= (outputs / variables))
            continue;
         sum += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out] *
                           matrix_w[shift_weight + out * (window_in + 1)], 0);
        }
     }
   matrix_ig[shift_in] = Deactivation(sum, matrix_i[shift_in], activation);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Adaptive convolution: Adam update of the filter weights.
/// The adaptive window is recomputed per variable with the same formulas as
/// the forward pass; taps beyond the active window get no update (the bias
/// slot, id_in == window_in, always does).
__kernel void UpdateWeightsAdaptConvAdam(__global float *matrix_w,
                                         __global const float *matrix_og,
                                         __global const float *matrix_i,
                                         __global float *matrix_m,
                                         __global float *matrix_v,
                                         __global float *main_freq,
                                         const int inputs,
                                         const int outputs,
                                         const float l,
                                         const float b1,
                                         const float b2
                                        )
  {
   const size_t id_in = get_global_id(0);  // input shift
   const size_t id_out = get_global_id(1); // filter shift
   const size_t id_v = get_global_id(2);   // variable
   const size_t window_in = get_global_size(0) - 1;  // last slot is the bias
   const size_t window_out = get_global_size(1);
   const size_t variables = get_global_size(2);
//--- recompute the adaptive window (same formulas as the forward pass)
   const int units = outputs / (window_out * variables);
   const int freq = main_freq[id_v];
   int window = (inputs / variables + freq - 1) / freq;
   const int step = (int)(inputs / variables + units + 1) / (units + 2);
   if(window < step)
      window = (int)((step + window - 1) / window) * window;
   if(window > window_in)
      window = window_in;
//--- taps outside the active window receive no gradient
   if(id_in != window_in &&
      id_in >= window)
      return;
//---
   const int shift_in_var = id_v * inputs / variables;
   const int shift_out_var = id_v * outputs / variables;
   const int shift_weight = (id_v * window_out + id_out) *
                            (window_in + 1) + id_in;
   const bool bias = (id_in == window_in);
//--- gradient over the regular (left-aligned) units
   float grad = 0;
   for(int u = 0; u < (units - 1); u++)
     {
      const int shift_in_loc = id_in + u * step;
      if(shift_in_loc >= (inputs / variables))
         continue;
      float inp = (bias ? 1 : IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc], 0));
      grad += IsNaNOrInf(inp * matrix_og[shift_out_var + u * window_out + id_out], 0);
     }
//--- the last unit is right-aligned to the end of the series
     {
      const int shift_in_loc = id_in + inputs / variables - window;
      if(shift_in_loc < (inputs / variables))
        {
         float inp = (bias ? 1 : IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc], 0));
         grad += IsNaNOrInf(inp * matrix_og[shift_out_var + (units - 1) * window_out + id_out], 0);
        }
     }
//--- Adam moments and weight update
   float mt = IsNaNOrInf(clamp(b1 * matrix_m[shift_weight] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0);
   float vt = IsNaNOrInf(clamp(b2 * matrix_v[shift_weight] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f);
   float weight = matrix_w[shift_weight] + IsNaNOrInf(l * mt / sqrt(vt), 0);
   matrix_w[shift_weight] = weight;
   matrix_m[shift_weight] = mt;
   matrix_v[shift_weight] = vt;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Rotary position encoding: multiplies every input pair by the unit's
/// rotation pair (complex product). position_emb is indexed only by
/// (unit, dimension), i.e. shared across variables.
__kernel void RoPE(__global const float2* __attribute__((aligned(8))) inputs,
                   __global const float2* __attribute__((aligned(8))) position_emb,
                   __global float2* __attribute__((aligned(8))) outputs
                  )
  {
   const size_t id_d = get_global_id(0); // dimension
   const size_t id_u = get_global_id(1); // unit
   const size_t id_v = get_global_id(2); // variable
   const size_t dimension = get_global_size(0);
   const size_t units = get_global_size(1);
//--- element processed by this work-item and its rotation
   const int idx = (id_v * units + id_u) * dimension + id_d;
   const float2 x = inputs[idx];
   const float2 rot = position_emb[id_u * dimension + id_d];
//--- (x.s0 + i*x.s1) * (rot.s0 + i*rot.s1)
   outputs[idx] = (float2)(x.s0 * rot.s0 - x.s1 * rot.s1,
                           x.s0 * rot.s1 + x.s1 * rot.s0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of RoPE: multiplies the output gradient by the conjugate
/// of the rotation applied in the forward pass.
__kernel void CalcHiddenGradRoPE(__global float2* __attribute__((aligned(8))) inputs_gr,
                                 __global const float2* __attribute__((aligned(8))) position_emb,
                                 __global const float2* __attribute__((aligned(8))) outputs_gr
                                )
  {
   const size_t id_d = get_global_id(0); // dimension
   const size_t id_u = get_global_id(1); // unit
   const size_t id_v = get_global_id(2); // variable
   const size_t dimension = get_global_size(0);
   const size_t units = get_global_size(1);
//--- gradient element and the unit's rotation
   const int idx = (id_v * units + id_u) * dimension + id_d;
   const float2 g = outputs_gr[idx];
   const float2 rot = position_emb[id_u * dimension + id_d];
//--- (g.s0 + i*g.s1) * conj(rot.s0 + i*rot.s1)
   float2 res;
   res.s0 = g.s0 * rot.s0 + g.s1 * rot.s1;
   res.s1 = g.s1 * rot.s0 - g.s0 * rot.s1;
   inputs_gr[idx] = res;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Element-wise scaled difference of two matrices; each buffer applies its
/// own per-row stride on top of the dense (row, column) index.
__kernel void DifMatrix(__global const float *matrix1, ///<[in] First matrix
                        __global const float *matrix2, ///<[in] Second matrix
                        __global float *matrix_out, ///<[out] Output matrix
                        const float multiplyer, ///< Multiplyer for output
                        const int shift_in1, ///< Shift for input 1
                        const int shift_in2, ///< Shift for input 2
                        const int shift_out ///< Shift for output
                       )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int dimension = get_global_size(1);
//--- dense index plus each buffer's row shift
   const int idx = i * dimension + d;
   const float dif = matrix1[i * shift_in1 + idx] - matrix2[i * shift_in2 + idx];
   matrix_out[i * shift_out + idx] = IsNaNOrInf(dif * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of DifMatrix: out = (m1 - m2) * k,
/// so matrix1 receives +k*g and matrix2 receives -k*g.
__kernel void DifMatrixGrad(__global float *matrix1, ///<[in] First matrix
                            __global float *matrix2, ///<[in] Second matrix
                            __global const float *matrix_out, ///<[out] Output matrix
                            const float multiplyer, ///< Multiplyer for output
                            const int shift_in1, ///< Shift for input 1
                            const int shift_in2, ///< Shift for input 2
                            const int shift_out ///< Shift for output
                           )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int dimension = get_global_size(1);
//--- scaled output gradient, routed with opposite signs
   const int idx = i * dimension + d;
   const float g = IsNaNOrInf(matrix_out[i * shift_out + idx] * multiplyer, 0);
   matrix1[i * shift_in1 + idx] = g;
   matrix2[i * shift_in2 + idx] = -g;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Computes (I + M) * k element-wise: adds 1 on the main diagonal before
/// scaling. Both buffers apply a per-row stride on top of the dense index.
__kernel void IdentitySumMatrix(__global const float *matrix_in,
                                __global float *matrix_out,
                                const float multiplyer,
                                const int shift_in,
                                const int shift_out
                               )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int dimension = get_global_size(1);
//--- 1 on the diagonal, 0 elsewhere
   const int idx = i * dimension + d;
   const float diag = (i == d ? 1.0f : 0.0f);
   matrix_out[i * shift_out + idx] =
      IsNaNOrInf((diag + matrix_in[i * shift_in + idx]) * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Computes (I - M) * k element-wise: subtracts the matrix from the
/// identity before scaling.
__kernel void IdentityDifMatrix(__global const float *matrix_in,
                                __global float *matrix_out,
                                const float multiplyer,
                                const int shift_in,
                                const int shift_out
                               )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int dimension = get_global_size(1);
//--- 1 on the diagonal, 0 elsewhere
   const int idx = i * dimension + d;
   const float diag = (i == d ? 1.0f : 0.0f);
   matrix_out[i * shift_out + idx] =
      IsNaNOrInf((diag - matrix_in[i * shift_in + idx]) * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of IdentityDifMatrix: out = (I - M) * k, so dL/dM = -k * g
/// (the identity term carries no gradient into M).
__kernel void IdentityDifMatrixGrad(__global float *matrix_in,
                                    __global const float *matrix_out,
                                    const float multiplyer,
                                    const int shift_in,
                                    const int shift_out
                                   )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int dimension = get_global_size(1);
//--- negated, scaled output gradient
   const int idx = i * dimension + d;
   matrix_in[i * shift_in + idx] =
      IsNaNOrInf(-multiplyer * matrix_out[i * shift_out + idx], 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Broadcast-adds a row vector onto every matrix row, then scales the sum.
__kernel void SumVecMatrix(__global const float *vector_in,
                           __global const float *matrix_in,
                           __global float *matrix_out,
                           const float multiplyer, ///< Multiplyer for output
                           const int shift_in1, ///< Shift for input 1
                           const int shift_in2, ///< Shift for input 2
                           const int shift_out ///< Shift for output
                          )
  {
   const int r = get_global_id(0);
   const int c = get_global_id(1);
   const int v = get_global_id(2);
   const int rows = get_global_size(0);
   const int cols = get_global_size(1);
//--- matrix element and the vector entry broadcast over its column
   const int m_pos = RCtoFlat(r, c, rows, cols, v);
   const int v_pos = RCtoFlat(0, c, 1, cols, v);
   matrix_out[m_pos] = IsNaNOrInf((vector_in[v_pos] + matrix_in[m_pos]) * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of SumVecMatrix: every matrix element receives the scaled
/// output gradient; the broadcast vector receives the column average,
/// accumulated once by the row-0 work-item.
__kernel void SumVecMatrixGrad(__global float *vector_in,
                               __global float *matrix_in,
                               __global const float *matrix_out,
                               const float multiplyer, ///< Multiplyer for output
                               const int shift_in1, ///< Shift for input 1
                               const int shift_in2, ///< Shift for input 2
                               const int shift_out ///< Shift for output
                              )
  {
   const int r = get_global_id(0);
   const int c = get_global_id(1);
   const int v = get_global_id(2);
   const int rows = get_global_size(0);
   const int cols = get_global_size(1);
//--- pass the scaled gradient straight into the matrix input
   int flat = RCtoFlat(r, c, rows, cols, v);
   float grad = IsNaNOrInf(matrix_out[flat] * multiplyer, 0);
   matrix_in[flat] = grad;
//--- column average for the vector, computed only once (row 0)
   if(r > 0)
      return;
   for(int row = 1; row < rows; row++)
     {
      flat += cols;
      grad += IsNaNOrInf(matrix_out[flat] * multiplyer, 0);
     }
   vector_in[RCtoFlat(0, c, 1, cols, v)] = IsNaNOrInf(grad / rows, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Graph-style interpolation attention over `total` nodes.
/// Builds a soft adjacency from the GL embeddings, projects inputs through
/// W into H, scores node pairs with the attention vector A (LocalSoftMax
/// over the local dimension) and emits the attention-weighted combination
/// of H rows, LReLU-activated.
__kernel void InterpolationAttention(__global const float* matrix_in,
                                     __global const float* W,
                                     __global const float* A,
                                     __global const float* GL,
                                     __global float* Adj,
                                     __global float* H,
                                     __global float* Atten,
                                     __global float* matrix_out,
                                     const int dimension
                                    )
  {
   const size_t i = get_global_id(0);      // target node
   const size_t j = get_local_id(1);       // peer node (local dimension)
   const size_t total = get_global_size(0);
   const size_t total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * dimension;
   const int shift_j = j * dimension;
   const int shift_adj = i * total_loc + j;
//--- soft adjacency: softmax over the clamped GL-embedding dot products
   float adj = 0;
   for(int d = 0; d < dimension; d++)
      adj += IsNaNOrInf(GL[shift_i + d] * GL[shift_j + d], 0);
   adj = max(IsNaNOrInf(adj, 0), 0.0f);
   adj = LocalSoftMax(adj, 1, Temp);
   Adj[shift_adj] = adj;
   adj += (float)(i == j);                 // self-loop
//--- H row of node i, filled cooperatively by the local group.
//--- NOTE(review): work-items taking the `break` skip the barrier other
//--- items still execute — divergent barriers are undefined in OpenCL;
//--- confirm dimension is always a multiple of the local size
   for(int id_h = 0; id_h < dimension; id_h += total_loc)
     {
      if(j >= (dimension - id_h))
         break;
      float h = 0;
      for(int w = 0; w < dimension; w++)
         h += IsNaNOrInf(matrix_in[shift_i + w] * W[(id_h + j) * dimension + w], 0);
      H[shift_i + id_h + j] = h;
      BarrierLoc
     }
//--- pairwise attention logit e(i, j); a tiny default when not adjacent
   float e = 1e-12f;
   if(adj > 0)
     {
      e = 0;
      for(int a = 0; a < dimension; a++)
         e += IsNaNOrInf(H[shift_i + a] * A[a], 0) + IsNaNOrInf(H[shift_j + a] * A[dimension + a], 0);
     }
   e = LocalSoftMax(e, 1, Temp);
   Atten[shift_adj] = e;
//--- Scale output by attention
   for(int d = 0; d < dimension; d += total_loc)
     {
      if(j >= (dimension - d))
         break;
      float out = 0;
      int shift_h = d + j;
      int shift_att = i * total_loc;
      int shift_out = i * dimension + shift_h;
      for(int n = 0; n < total_loc; n++)
         out += IsNaNOrInf(H[shift_h + n * dimension] * Atten[shift_att + n], 0);
      matrix_out[shift_out] = fActivation(out, ActFunc_LReLU);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of InterpolationAttention: propagates the output gradient
/// through the attention weighting, the attention logits (A), the H
/// projection (into matrix_in) and the soft adjacency (into GL).
__kernel void InterpolationAttentionGrad(__global const float* matrix_in,
                                         __global float* matrix_in_gr,
                                         __global const float* W,
                                         __global float* W_gr,
                                         __global const float* A,
                                         __global float* A_gr,
                                         __global const float* GL,
                                         __global float* GL_gr,
                                         __global float* Adj,
                                         __global float* H,
                                         __global float* H_gr,
                                         __global float* Atten,
                                         __global float* matrix_out_gr,
                                         const int dimension
                                        )
  {
   const size_t i = get_global_id(0);      // target node
   const size_t j = get_local_id(1);       // peer node (local dimension)
   const size_t total = get_global_size(0);
   const size_t total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * dimension;
   const int shift_j = j * dimension;
   const int shift_adj = i * total_loc + j;
//--- H Gradient (through the attention-weighted output)
   for(int d = 0; d < dimension; d += total_loc)
     {
      if(j >= (dimension - d))
         break;
      float h_grad = 0;
      int shift_h = shift_i + d + j;
      int shift_att = i;
      int shift_out = d + j;
      for(int n = 0; n < total_loc; n++)
        {
         float gr = matrix_out_gr[shift_out + n * dimension];
         h_grad += IsNaNOrInf(
                      Deactivation(gr, gr, ActFunc_LReLU) * Atten[shift_att + n * total_loc], 0);
        }
      H_gr[shift_h] = h_grad;
      BarrierLoc
     }
//--- Attention Gradient (softmax backprop over the local dimension)
   float att_grad = 0;
   for(int d = 0; d < dimension; d++)
     {
      float gr = matrix_out_gr[shift_i + d];
      gr = Deactivation(gr, gr, ActFunc_LReLU);
      att_grad += IsNaNOrInf(gr * H[shift_j + d], 0);
     }
   att_grad = LocalSoftMaxGrad(Atten[shift_adj], att_grad, 1, Temp);
//--- Add H Gradient and accumulate A's gradient
//--- NOTE(review): A_gr / GL_gr / H_gr are read-modify-written by every
//--- work-group without atomics — confirm groups are serialized upstream
   for(int d = 0; d < dimension; d++)
     {
      float h_grad = att_grad * A[d];
      h_grad = LocalSum(h_grad, 1, Temp);
      if(j == 0)
         H_gr[shift_i + d] += h_grad;
      h_grad = att_grad * A[dimension + d];
      h_grad = LocalSum(h_grad, 1, Temp);
      if(j == 0)
         H_gr[shift_j + d] += h_grad;
      float a_grad = att_grad * H[shift_i + d];
      a_grad = LocalSum(a_grad, 1, Temp);
      A_gr[d] += a_grad;
      a_grad = att_grad * H[shift_j + d];
      a_grad = LocalSum(a_grad, 1, Temp);
      A_gr[dimension + d] += a_grad;
     }
//--- Inputs' Gradient
//--- bug fix: the guard used (dimension + d), which never triggers and let
//--- (d + j) overrun `dimension`; mirror the (dimension - d) bound used by
//--- the other cooperative loops in this pair of kernels
   for(int d = 0; d < dimension; d += total_loc)
     {
      if(j >= (dimension - d))
         break;
      float grad = 0;
      for(int w = 0; w < dimension; w++)
         grad += IsNaNOrInf(H_gr[shift_i + w] * W[(d + j) + dimension * w], 0);
      matrix_in_gr[shift_i + d + j] = grad;
      BarrierLoc
     }
//--- Adj Gradient (softmax backprop, then into the GL embeddings)
   float grad = LocalSoftMaxGrad(Adj[shift_adj], att_grad, 1, Temp);
   for(int d = 0; d < dimension; d++)
     {
      GL_gr[shift_i + d] += IsNaNOrInf(grad * GL[shift_j + d], 0);
      GL_gr[shift_j + d] += IsNaNOrInf(grad * GL[shift_i + d], 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Per-window normalization: each local group of `period` samples is reduced
/// to zero mean / unit stdev; the (mean, stdev) pair is saved for backprop.
__kernel void PeriodNorm(__global const float* inputs,
                         __global float2* mean_stdevs,
                         __global float* outputs,
                         const int total_inputs
                        )
  {
   const size_t i = get_global_id(0);        // window index
   const size_t p = get_local_id(1);         // position inside the window
   const size_t v = get_global_id(2);        // variable (series)
   const size_t windows = get_global_size(0);
   const size_t period = get_local_size(1);
   const size_t variable = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * period + p;       // element inside the series
   const int shift_v = v * total_inputs;     // start of this variable's series
   const int shift_ms = v * windows + i;     // slot for the saved statistics
//--- out-of-range positions contribute 0 to the reductions.
//--- NOTE(review): mean/stdev still divide by the full `period`, so a tail
//--- window shorter than `period` is biased by the zero padding — confirm
   float val = 0;
   if((shift_i) < total_inputs)
      val = IsNaNOrInf(inputs[shift_v + shift_i], 0);
   float mean = IsNaNOrInf(LocalSum(val, 1, Temp) / period, 0);
   val -= mean;
   BarrierLoc
   float stdev = LocalSum(val * val, 1, Temp) / period;
   stdev = IsNaNOrInf(sqrt(stdev), 1);
//--- store the statistics and the normalized value
   mean_stdevs[shift_ms] = (float2)(mean, stdev);
   if((shift_i) < total_inputs)
      outputs[shift_v + shift_i] = IsNaNOrInf(val / stdev, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of PeriodNorm: gradients w.r.t. the raw inputs, combining
/// the output gradient with the gradients flowing into the saved mean/stdev.
__kernel void PeriodNormGrad(__global const float* inputs,
                             __global float* inputs_gr,
                             __global const float2* mean_stdevs,
                             __global const float2* mean_stdevs_gr,
                             __global const float* outputs,
                             __global const float* outputs_gr,
                             const int total_inputs
                            )
  {
   const size_t i = get_global_id(0);        // window index
   const size_t p = get_local_id(1);         // position inside the window
   const size_t v = get_global_id(2);        // variable (series)
   const size_t windows = get_global_size(0);
   const size_t period = get_local_size(1);
   const size_t variable = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * period + p;
   const int shift_v = v * total_inputs;
   const int shift_ms = v * windows + i;
//--- saved statistics of the window and the gradients flowing into them
   float out = 0;
   float out_gr = 0;
   const float2 mean_stdev = mean_stdevs[shift_ms];
   const float2 mean_stdev_gr = mean_stdevs_gr[shift_ms];
   if((shift_i) < total_inputs)
     {
      out = IsNaNOrInf(outputs[shift_v + shift_i], 0);
      out_gr = IsNaNOrInf(outputs_gr[shift_v + shift_i], 0);
     }
//--- bug fix: the incoming mean/stdev gradients (mean_stdevs_gr) were loaded
//--- but the saved statistics themselves (mean_stdevs) were added instead
   float mean_gr = LocalSum(out_gr, 1, Temp) / period + IsNaNOrInf(mean_stdev_gr.x, 0);
   BarrierLoc
   float stdev_gr = out * LocalSum(IsNaNOrInf(out * out_gr, 0), 1, Temp) / period + IsNaNOrInf(mean_stdev_gr.y, 0);
   float inp_gr = (out_gr - mean_gr - stdev_gr) / IsNaNOrInf(mean_stdev.y, 1);
//---
   if((shift_i) < total_inputs)
      inputs_gr[shift_v + shift_i] = IsNaNOrInf(inp_gr, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Adaptive spatial normalization: for every element `i` of variable `v`
//--- the mean and stdev are computed across ALL variables, weighted by the
//--- attention row of `v`, and the element is normalized with them.
__kernel void AdaptSpatialNorm(__global const float* inputs,    ///<[in]  values, one row of `total_inputs` per variable
                               __global const float* attention, ///<[in]  [variables x variables] attention weights
                               __global float2* mean_stdevs,    ///<[out] per-element (mean, stdev)
                               __global float* outputs          ///<[out] normalized values
                              )
  {
   const size_t i = get_global_id(0);       // element inside the variable row
   const size_t a = get_local_id(1);        // lane of the weighted reduction
   const size_t v = get_global_id(2);       // variable index
   const size_t total_inputs = get_global_size(0);
   const size_t total_local = get_local_size(1);
   const size_t variables = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_v = v * total_inputs;
   const int shift_out = shift_v + i;
//--- attention-weighted first and second moments across variables
   float mean = 0, stdev = 0;
   for(uint l = 0; l < variables; l += total_local)
     {
      //--- bug fix: guard the tail when `variables` is not a multiple of the
      //--- local size (the backward kernel already performs this check);
      //--- without it the last pass read past the attention/inputs rows
      if((a + l) >= variables)
         break;
      const int shift_at = v * variables + (a + l);
      float val = IsNaNOrInf(inputs[(a + l) * total_inputs + i], 0);
      float att = IsNaNOrInf(attention[shift_at], 0);
      mean += val * att;
      stdev += val * val * att;
     }
   mean = LocalSum(mean, 1, Temp);
   BarrierLoc
   stdev = LocalSum(stdev, 1, Temp);
//--- Var = E[x^2] - E[x]^2; only lane 0 publishes the results
   if(a == 0)
     {
      stdev -= mean * mean;
      stdev = IsNaNOrInf(sqrt(stdev), 1);
      if(stdev <= 0)
         stdev = 1;                          // keep the division below well-defined
      mean_stdevs[shift_out] = (float2)(mean, stdev);
      outputs[shift_out] = IsNaNOrInf((inputs[shift_out] - mean) / stdev, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of AdaptSpatialNorm. Runs two independent phases, each a
//--- work-group reduction over the opposite axis:
//---   1) gradient w.r.t. the inputs    (sum over variables),
//---   2) gradient w.r.t. the attention (sum over elements).
//--- NOTE(review): the analytic dy/dmean/dstd expressions below were not
//--- re-derived here — verify the signs/terms against the forward kernel.
__kernel void AdaptSpatialNormGrad(__global const float* inputs,
                                   __global float* inputs_gr,
                                   __global const float* attention,
                                   __global float* attention_gr,
                                   __global const float2* mean_stdevs,
                                   __global const float2* mean_stdevs_gr,
                                   __global const float* outputs_gr,
                                   const uint total_inputs
                                  )
  {
   const size_t i = get_global_id(0);             // main
   const size_t loc = get_local_id(1);            // local to sum
   const size_t v = get_global_id(2);             // variable
   const size_t total_main = get_global_size(0);  // total
   const size_t total_loc = get_local_size(1);    // local dimension
   const size_t variables = get_global_size(2);   // total variables
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- Inputs gradient: each lane accumulates the contribution of a subset of
//--- variables, then the group reduces and lane 0 writes the result
     {
      int shift_in = v * total_inputs + i;
      float grad = 0;
      if(i < total_inputs)
        {
         float x = IsNaNOrInf(inputs[shift_in], 0);
         for(int l = 0; l < variables; l += total_loc)
           {
            if((l + loc) >= variables)
               break;
            int shift_out = i + (l + loc) * total_inputs;
            // attention weight with which variable v entered (l+loc)'s statistics
            float att = IsNaNOrInf(attention[(l + loc) * variables + v], 0);
            float out_gr = IsNaNOrInf(outputs_gr[shift_out], 0);
            float2 ms = mean_stdevs[shift_out];
            float2 ms_gr = mean_stdevs_gr[shift_out];
            // dy: derivative of the normalized output w.r.t. x through mean and stdev
            float dy = (1 - att) * (1 / ms.y - (x - ms.x) * att * x / (ms.y * ms.y * ms.y));
            // dmean/dstd: gradients arriving directly at the stored statistics
            float dmean = IsNaNOrInf(ms_gr.x * att, 0);
            float dstd = IsNaNOrInf(ms_gr.y * x * (att - att * att) / ms.y, 0);
            grad += IsNaNOrInf(dy * out_gr + dmean + dstd, 0);
           }
        }
      grad = LocalSum(grad, 1, Temp);
      if(loc == 0 && i < total_inputs)
         inputs_gr[shift_in] = grad;
      BarrierLoc
     }
//--- Attention gradient: here `i` indexes the attention column and each lane
//--- accumulates over a subset of the elements
     {
      int shift_att = v * variables + i;
      float grad = 0;
      if(i < variables)
        {
         float att = IsNaNOrInf(attention[shift_att], 0);
         for(int l = 0; l < total_inputs; l += total_loc)
           {
            if((l + loc) >= total_inputs)
               break;
            int shift_out = (l + loc) + v * total_inputs;
            int shift_in = (l + loc) + i * total_inputs;
            float x = IsNaNOrInf(inputs[shift_in], 0);
            float out_gr = IsNaNOrInf(outputs_gr[shift_out], 0);
            float2 ms = mean_stdevs[shift_out];
            float2 ms_gr = mean_stdevs_gr[shift_out];
            // derivative of the normalized output w.r.t. the attention weight
            float dy = -x / ms.y - (x - ms.x) * x * x * (1 - 2 * att) / (2 * ms.y * ms.y * ms.y);
            float dmean = IsNaNOrInf(ms_gr.x * x, 0);
            float dstd = IsNaNOrInf(ms_gr.y * x * x * (1 - 2 * att) / (2 * ms.y), 0);
            grad += IsNaNOrInf(dy * out_gr + dmean + dstd, 0);
           }
        }
      grad = LocalSum(grad, 1, Temp);
      if(loc == 0 && i < variables)
         attention_gr[shift_att] = grad;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Attention-weighted segment normalization: the inputs of variable `v`
//--- are split into segments of `segment_size` elements; each segment is
//--- normalized with an attention-weighted mean/stdev computed over its own
//--- elements. Means and stdevs are stored for the backward pass.
__kernel void AttentNorm(__global const float* inputs,    ///<[in]  values, one row of `total_inputs` per variable
                         __global const float* attention, ///<[in]  per-variable weights, `segment_size` per variable
                         __global float* means,           ///<[out] per-segment mean
                         __global float* stdevs,          ///<[out] per-segment stdev
                         __global float* outputs,         ///<[out] normalized values
                         const int total_inputs,          ///< number of elements per variable row
                         const int segment_size           ///< elements per segment
                        )
  {
   const size_t s = get_global_id(0);       // segment index
   const size_t i = get_local_id(1);        // lane inside the segment
   const size_t v = get_global_id(2);       // variable index
   const size_t total_segments = get_global_size(0);
   const size_t total_local = get_local_size(1);
   const size_t variables = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift = v * total_inputs + s * segment_size + i;
//--- attention-weighted first and second moments over the segment;
//--- `val` caches the lane's first (l == 0) element for reuse below
   float mean = 0, stdev = 0;
   float val = 0;
   for(uint l = 0; l < segment_size; l += total_local)
     {
      if((l + i) >= segment_size ||
         (s * segment_size + l + i) >= total_inputs)
         break;
      float val_l = IsNaNOrInf(inputs[shift + l], 0);
      if(l == 0)
         val = val_l;
      float att = IsNaNOrInf(attention[v * segment_size + l + i], 0);
      mean += val_l * att;
      stdev += val_l * val_l * att;
     }
   mean = LocalSum(mean, 1, Temp);
   BarrierLoc
   stdev = LocalSum(stdev, 1, Temp);
//--- Var = E[x^2] - E[x]^2; guard against non-positive / NaN stdev
   stdev -= mean * mean;
   stdev = IsNaNOrInf(sqrt(stdev), 1);
   if(stdev <= 0)
      stdev = 1;
//--- lane 0 stores the segment statistics
   if(i == 0)
     {
      int shift_ms = v * total_segments + s;
      means[shift_ms] = mean;
      stdevs[shift_ms] = stdev;
     }
//--- normalize every element of the segment (re-reading for l > 0)
   for(uint l = 0; l < segment_size; l += total_local)
     {
      if((l + i) >= segment_size ||
         (s * segment_size + l + i) >= total_inputs)
         break;
      if(l > 0)
         val = inputs[shift + l];
      outputs[shift + l] = IsNaNOrInf((val - mean) / stdev, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of AttentNorm: phase 1 computes the gradient w.r.t. the
//--- inputs (reduction over the elements of the owning segment), phase 2 the
//--- gradient w.r.t. the attention weights (reduction over all elements).
//--- NOTE(review): the statistics are addressed as `v * segment_size + s`
//--- here, while the forward kernel stores them at `v * total_segments + s`;
//--- these only coincide when segment_size == total_segments — verify the
//--- host-side launch geometry.
__kernel void AttentNormGrad(__global const float* inputs,
                             __global float* inputs_gr,
                             __global const float* attention,
                             __global float* attention_gr,
                             __global const float* means,
                             __global const float* stdevs,
                             __global const float* means_gr,
                             __global const float* outputs_gr,
                             const int total_inputs,
                             const int segment_size
                            )
  {
   const size_t i = get_global_id(0);             // main
   const size_t loc = get_local_id(1);            // local to sum
   const size_t v = get_global_id(2);             // variable
   const size_t total_main = get_global_size(0);  // total
   const size_t total_loc = get_local_size(1);    // local dimension
   const size_t variables = get_global_size(2);   // total variables
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- Inputs gradient
     {
      const int s = i / segment_size;             // segment owning element i
      const int shift_in = v * total_inputs + i;
      const int shift_ms = v * segment_size + s;
      float grad = 0;
      //--- lane 0 stages the shared per-element scalars in local memory so
      //--- the whole group can read them after the barrier
      if(loc == 0 && i < total_inputs)
        {
         Temp[0] = IsNaNOrInf(inputs[shift_in], 0);
         Temp[1] = IsNaNOrInf(means[shift_ms], 0);
         Temp[2] = IsNaNOrInf(stdevs[shift_ms], 1);
         Temp[3] = IsNaNOrInf(means_gr[shift_ms], 0);
         // (v - s) * segment_size + i == v * segment_size + (i - s * segment_size),
         // i.e. the attention weight of element i's offset inside its segment
         Temp[4] = IsNaNOrInf(attention[(v - s) * segment_size + i], 0);
        }
      BarrierLoc
      if(i < total_inputs)
        {
         float x = Temp[0];
         float mean = Temp[1];
         float stdev = Temp[2];
         float mean_gr = Temp[3];
         float att = Temp[4];
         //--- accumulate d(output_j)/d(x_i) over the elements j of the segment
         for(int l = 0; l < segment_size; l += total_loc)
           {
            if((l + loc) >= segment_size ||
               (i * segment_size + loc + l) >= total_inputs)
               break;
            float out_gr = IsNaNOrInf(outputs_gr[v * total_inputs + s * segment_size + loc + l], 0);
            bool same = (i - s * segment_size) == (loc + l);   // j == i ?
            float xl = x;
            if(!same)
               xl = IsNaNOrInf(inputs[v * total_inputs + s * segment_size + loc + l], 0);
            // derivative of the normalized output through mean and stdev
            float dy = ((int)same - att) * (1 / stdev - (xl - mean) * att * x / (stdev * stdev * stdev));
            float dmean = (same ? IsNaNOrInf(mean_gr * att, 0) : 0);
            grad += IsNaNOrInf(dy * out_gr + dmean, 0);
           }
        }
      grad = LocalSum(grad, 1, Temp);
      if(loc == 0 && i < total_inputs)
         inputs_gr[shift_in] = grad;
      BarrierLoc
     }
//--- Attention gradient: here `i` indexes the attention weight of variable v
     {
      float grad = 0;
      int shift_att = v * segment_size + i;
      if(i < segment_size)
        {
         float att = IsNaNOrInf(attention[shift_att], 0);
         for(int l = 0; l < total_inputs; l += total_loc)
           {
            if((l + loc) >= total_inputs)
               break;
            int shift_out = (l + loc) + v * total_inputs;
            int s = (l + loc) / segment_size;
            int shift_in = v * total_inputs + s * segment_size + i;
            float x = IsNaNOrInf(inputs[shift_in], 0);
            float out_gr = IsNaNOrInf(outputs_gr[shift_out], 0);
            float mean = means[v * segment_size + s];
            float stdev = stdevs[v * segment_size + s];
            float mean_gr = means_gr[v * segment_size + s];
            bool same = (i - s * segment_size) == (loc + l);
            float xl = x;
            if(!same)
               xl = IsNaNOrInf(inputs[shift_out], 0);
            // derivative of the normalized output w.r.t. the attention weight
            float dy = -x / stdev - (xl - mean) * x * x * (1 - 2 * att) / (2 * stdev * stdev * stdev);
            float dmean = IsNaNOrInf(mean_gr * x, 0);
            grad += IsNaNOrInf(dy * out_gr + dmean, 0);
           }
        }
      grad = LocalSum(grad, 1, Temp);
      if(loc == 0 && i < segment_size)
         attention_gr[shift_att] = grad;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- One step of building the Chebyshev polynomials of the (square) support
//--- matrix S:  T0 = I,  T1 = S,  Tk = 2*S*T(k-1) - T(k-2).
//--- `step` is the number of polynomials requested so far; the kernel
//--- produces slice `step - 1` of `outputs` (slices 0..2 on the first calls).
__kernel void ChebStep(__global const float* support, ///<[in]  square support matrix S
                       __global float* outputs,       ///<[out] stack of polynomial matrices, one slice per order
                       const int step                 ///< number of polynomials requested
                      )
  {
   const size_t l = get_local_id(0);           // reduction lane
   const size_t r = get_global_id(1);          // row
   const size_t c = get_global_id(2);          // column
   const size_t total_l = get_local_size(0);
   const size_t total_r = get_global_size(1);
   const size_t total_c = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- only square supports are valid
   if(step <= 0 || total_r != total_c)
      return;
//--- bootstrap: the first three polynomials have closed forms
   if(step <= 3)
     {
      const float diag = (r == c ? 1.0f : 0.0f);
      if(l == 0)
         outputs[RCtoFlat(r, c, total_r, total_c, 0)] = diag;    // T0 = I
      if(step < 2)
         return;
      if(l == 0)
        {
         const float s = IsNaNOrInf(support[RCtoFlat(r, c, total_r, total_c, 0)], 0);
         outputs[RCtoFlat(r, c, total_r, total_c, 1)] = s;       // T1 = S
        }
      if(step < 3)
         return;
      //--- T2 = 2*S*S - I
      float out = 0;
      for(int t = 0; t < total_c; t += total_l)
        {
         //--- bug fix: guard the tail when total_c is not a multiple of the
         //--- local size (the generic branch below already had this check);
         //--- without it the last pass read past the support matrix
         if((t + l) >= total_c)
            continue;
         const float s1 = IsNaNOrInf(support[RCtoFlat(r, t + l, total_r, total_c, 0)], 0);
         const float s2 = IsNaNOrInf(support[RCtoFlat(t + l, c, total_r, total_c, 0)], 0);
         out += IsNaNOrInf(s1 * s2, 0);
        }
      out = 2 * LocalSum(out, 0, Temp);
      if(l == 0)
        {
         out -= diag;
         outputs[RCtoFlat(r, c, total_r, total_c, 2)] = IsNaNOrInf(out, 0);
        }
      return;
     }
//--- generic recurrence: T(step-1) = 2*S*T(step-2) - T(step-3)
   float out = 0;
   for(int t = 0; t < total_c; t += total_l)
     {
      if((t + l) >= total_c)
         continue;
      const float s1 = IsNaNOrInf(support[RCtoFlat(r, t + l, total_r, total_c, 0)], 0);
      const float s2 = IsNaNOrInf(outputs[RCtoFlat(t + l, c, total_r, total_c, step - 2)], 0);
      out += IsNaNOrInf(s1 * s2, 0);
     }
   out = 2 * LocalSum(out, 0, Temp);
   if(l == 0)
     {
      out -= IsNaNOrInf(outputs[RCtoFlat(r, c, total_r, total_c, step - 3)], 0);
      outputs[RCtoFlat(r, c, total_r, total_c, step - 1)] = IsNaNOrInf(out, 0);
     }
   return;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of one Chebyshev recurrence step
//--- (Tk = 2*S*T(k-1) - T(k-2)). Distributes the gradient of slice `step` of
//--- `outputs_g` to the support matrix and to the two preceding slices.
//--- NOTE(review): here the gradient is read from slice index `step` while
//--- the forward kernel writes slice `step - 1` for the same parameter —
//--- the host apparently passes a shifted `step`; confirm the convention.
__kernel void ChebStepGrad(__global const float* support,
                           __global float* support_g,
                           __global const float* outputs,
                           __global float* outputs_g,
                           const int step
                          )
  {
   const size_t l = get_local_id(0);           // reduction lane
   const size_t r = get_global_id(1);          // row
   const size_t c = get_global_id(2);          // column
   const size_t total_l = get_local_size(0);
   const size_t total_r = get_global_size(1);
   const size_t total_c = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- only square supports are valid
   if(step < 1 || total_r != total_c)
      return;
//---
   if(step >= 2)
     {
      //--- d/dT(k-2) = -grad (the "- T(k-2)" term of the recurrence)
      float grad = IsNaNOrInf(outputs_g[RCtoFlat(r, c, total_r, total_c, step)], 0);
      if(l == 0)
         outputs_g[RCtoFlat(r, c, total_r, total_c, step - 2)] -= grad;
      //--- support grad: 2 * grad(k) x T(k-1)^T, accumulated into slice 1
      grad = 0;
      for(int t = 0; t < total_c; t += total_l)
        {
         if((t + l) >= total_c)
            continue;
         const float s2 = IsNaNOrInf(outputs[RCtoFlat(c, t + l, total_r, total_c, step - 2)], 0);
         grad += IsNaNOrInf(outputs_g[RCtoFlat(r, t + l, total_r, total_c, step)] * s2, 0);
        }
      grad = LocalSum(grad, 0, Temp);
      if(l == 0)
         outputs_g[RCtoFlat(r, c, total_r, total_c, 1)] += grad;
      BarrierLoc
      //--- T(k-1) grad: S^T x grad(k)
      grad = 0;
      for(int t = 0; t < total_c; t += total_l)
        {
         if((t + l) >= total_c)
            continue;
         const float s2 = IsNaNOrInf(support[RCtoFlat(t + l, r, total_r, total_c, 0)], 0);
         grad += IsNaNOrInf(outputs_g[RCtoFlat(t + l, c, total_r, total_c, step)] * s2, 0);
        }
      grad = LocalSum(grad, 0, Temp);
      if(l == 0)
         outputs_g[RCtoFlat(r, c, total_r, total_c, step - 1)] += grad;
     }
//--- at the last step the accumulated slice-1 gradient becomes the support
//--- gradient (T1 = S)
   if(step <= 2)
     {
      if(l == 0)
         support_g[RCtoFlat(r, c, total_r, total_c, 0)] = outputs_g[RCtoFlat(r, c, total_r, total_c, 1)];
      return;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Samples significant neighbors for one point (`main`): each lane owns
//--- one candidate and one random candidate, de-duplicates them across the
//--- work-group, computes squared Euclidean distances to `main`, ranks the
//--- survivors by distance and writes them into the `neighbors` list at
//--- their rank position.
__kernel void SignificantNeighborsSampling(__global const float *data,         ///<[in]  [total_main x dimension] point coordinates
                                           __global const float *candidates,   ///<[in]  candidate indices, one per lane
                                           __global const float *random_cands, ///<[in]  random candidate indices, one per lane
                                           __global float *neighbors,          ///<[out] [total_main x total_slave] ranked neighbor indices
                                           const int dimension                 ///< coordinates per point
                                          )
  {
   const size_t main = get_global_id(0);
   const size_t slave = get_local_id(1);
   const int total_main = (int)get_global_size(0);
   const int total_slave = (int)get_local_size(1);
//---
   __local int Idx[LOCAL_ARRAY_SIZE];
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int ls = min(total_slave, (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_main = RCtoFlat(main, 0, total_main, dimension, 0);
   int cand = (int)candidates[slave];
   int rand_cand = (int)random_cands[slave];
//--- drop the random candidate when it duplicates this lane's candidate
   if(rand_cand == cand)
      rand_cand = -1;
//--- de-duplicate against the candidates of the other lanes (window by window)
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Idx[slave - l] = cand;
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i >= (slave - l))
            continue;
         if(cand == Idx[i])
            cand = -1;
         if(rand_cand == Idx[i])
            rand_cand = -1;
        }
      BarrierLoc
     }
//--- de-duplicate against the random candidates of the other lanes
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Idx[slave - l] = rand_cand;
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i >= (slave - l))
            continue;
         if(cand == Idx[i])
            cand = -1;
         if(rand_cand == Idx[i])
            rand_cand = -1;
        }
      BarrierLoc
     }
//--- offsets are computed unconditionally but only dereferenced when the
//--- corresponding index is valid (>= 0)
   const int shift_cand = RCtoFlat(cand, 0, total_main, dimension, 0);
   const int shift_rand_cand = RCtoFlat(rand_cand, 0, total_main, dimension, 0);
//--- squared Euclidean distances from `main` to both survivors
   float dist_cand = 0;
   float dist_rand_cand = 0;
   for(int d = 0; d < dimension; d++)
     {
      float value = IsNaNOrInf(data[shift_main + d], 0);
      if(main != cand && cand >= 0)
        {
         float delta = value - IsNaNOrInf(data[shift_cand + d], 0);
         dist_cand += delta * delta;
        }
      if(main != rand_cand && rand_cand >= 0)
        {
         float delta = value - IsNaNOrInf(data[shift_rand_cand + d], 0);
         dist_rand_cand += delta * delta;
        }
     }
//--- rank by distance; a tie between this lane's own pair is broken in favor
//--- of the candidate
   int cand_position = 0;
   int rand_position = (int)(dist_cand >= dist_rand_cand);
//--- rank against the candidate distances of the other lanes.
//--- Bug fix: the tie-break branches below compared with `<` (duplicating
//--- the preceding condition and making the else branch dead code); equal
//--- distances are meant to be ordered by lane index, i.e. `==`.
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Temp[slave - l] = (cand >= 0 ? IsNaNOrInf(dist_cand, -1) : -1);
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i == (slave - l))
            continue;
         if(Temp[i] < 0)
            continue;
         if(cand >= 0)
           {
            if(Temp[i] < dist_cand)
               cand_position++;
            else
               if(Temp[i] == dist_cand && i < (slave - l))
                  cand_position++;
           }
         if(rand_cand >= 0)
           {
            if(Temp[i] < dist_rand_cand)
               rand_position++;
            else
               if(Temp[i] == dist_rand_cand && i < (slave - l))
                  rand_position++;
           }
        }
      BarrierLoc
     }
//--- rank against the random-candidate distances of the other lanes
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Temp[slave - l] = (rand_cand >= 0 ? IsNaNOrInf(dist_rand_cand, -1) : -1);
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i == (slave - l))
            continue;
         if(Temp[i] < 0)
            continue;
         if(cand >= 0)
           {
            if(Temp[i] < dist_cand)
               cand_position++;
            else
               if(Temp[i] == dist_cand && i < (slave - l))
                  cand_position++;
           }
         if(rand_cand >= 0)
           {
            if(Temp[i] < dist_rand_cand)
               rand_position++;
            else
               if(Temp[i] == dist_rand_cand && i < (slave - l))
                  rand_position++;
           }
        }
      BarrierLoc
     }
//--- store the survivors at their rank position
   if(cand >= 0 && cand_position < total_slave)
     {
      const int shift_dist_cand = RCtoFlat(main, cand_position, total_main, total_slave, 0);
      neighbors[shift_dist_cand] = cand;
     }
   if(rand_cand >= 0 && rand_position < total_slave)
     {
      const int shift_dist_cand = RCtoFlat(main, rand_position, total_main, total_slave, 0);
      neighbors[shift_dist_cand] = rand_cand;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Sparse multi-head attention scores: combines a per-query term
//--- (first `total_heads` columns of `data`) with a per-key term (second
//--- `total_heads` columns, addressed through the `indexes` neighbor list),
//--- zeroes scores below a sparsity threshold and applies a softmax over
//--- the work-group.
__kernel void SparseMHScores(__global const float* data,    ///<[in]  [total_mains x 2*total_heads] precomputed score parts
                             __global const float* indexes, ///<[in]  [total_mains x total_slaves] neighbor indices
                             __global float* scores,        ///<[out] normalized scores
                             const float sparse             ///< [0.0 .. 1.0) coefficient of sparse
                            )
  {
   const int main = (int)get_global_id(0);
   const int slave = (int)get_local_id(1);
   const int head = (int)get_global_id(2);
   const int total_mains = (int)get_global_size(0);
   const int total_slaves = (int)get_local_size(1);
   const int total_heads = (int)get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- raw score = query part + key part of the addressed neighbor;
//--- invalid neighbor indices keep only the query part
   float value = IsNaNOrInf(data[RCtoFlat(main, head, total_mains, 2 * total_heads, 0)], 0);
   int slave_id = (int)indexes[RCtoFlat(main, slave, total_mains, total_slaves, 0)];
   if(slave_id < total_mains && slave_id >= 0)
      value += IsNaNOrInf(data[RCtoFlat(slave_id, head + total_heads, total_mains, 2 * total_heads, 0)], 0);
//--- sparsity threshold is a `sparse` fraction of the [min, max] score range;
//--- scores below it are dropped before the softmax normalization
   const float max_value = LocalMax(value, 1, Temp);
   const float min_value = LocalMin(value, 1, Temp);
   const float threshold = (max_value - min_value) * sparse + min_value;
   value = (threshold <= value ? IsNaNOrInf(exp(value - max_value), 0) : 0);   // exp shifted by max for stability
   const float sum = LocalSum(value, 1, Temp);
   value = IsNaNOrInf(value / sum, 0);
//---
   scores[RCtoFlat(slave, head, total_slaves, total_heads, main)] = value;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of SparseMHScores: propagates the score gradients through
//--- the softmax back to the query part (per `main`) and the key part
//--- (summed over all `main` rows that reference this lane's neighbor).
__kernel void SparseMHScoresGrad(__global float* data_gr,         ///<[out] [total_mains x 2*total_heads] gradient of score parts
                                 __global const float* indexes,   ///<[in]  neighbor indices used by the forward pass
                                 __global const float* scores,    ///<[in]  normalized scores of the forward pass
                                 __global const float* scores_gr  ///<[in]  gradient w.r.t. the scores
                                )
  {
   const int main = (int)get_global_id(0);
   const int slave = (int)get_local_id(1);
   const int head = (int)get_global_id(2);
   const int total_mains = (int)get_global_size(0);
   const int total_slaves = (int)get_local_size(1);
   const int total_heads = (int)get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const uint ls = min((uint)total_slaves, (uint)LOCAL_ARRAY_SIZE);
//--- Calc grad by main (query part): softmax Jacobian applied window by window
     {
      float value = IsNaNOrInf(scores[RCtoFlat(slave, head, total_slaves, total_heads, main)], 0);
      int slave_id = (int)indexes[RCtoFlat(main, slave, total_mains, total_slaves, 0)];
      const float sc_gr = IsNaNOrInf(scores_gr[RCtoFlat(slave, head, total_slaves, total_heads, main)], 0);
      //---
      float grad = 0;
      for(uint d = 0; d < total_slaves; d += ls)
        {
         if(slave >= d && slave < (d + ls))
            Temp[slave - d] = IsNaNOrInf(sc_gr, 0);
         BarrierLoc
         // softmax derivative: dL/dv_j = sum_i dL/ds_i * (delta_ij - s_j)
         for(uint l = 0; l < min(ls, (uint)(total_slaves - d)); l++)
            grad += IsNaNOrInf(Temp[l] * ((float)((d + l) == slave && slave_id == main) - value), 0);
         BarrierLoc
        }
      grad = LocalSum(grad, 1, Temp);
      if(slave == 0)
         data_gr[RCtoFlat(main, head, total_mains, 2 * total_heads, 0)] = grad;
     }
//--- Calc grad by slave (key part): accumulate over every `main` row
     {
      float grad = 0;
      for(uint d = 0; d < total_mains; d++)
        {
         float value = IsNaNOrInf(scores[RCtoFlat(slave, head, total_slaves, total_heads, d)], 0);
         const float sc_gr = IsNaNOrInf(scores_gr[RCtoFlat(slave, head, total_slaves, total_heads, d)], 0);
         int slave_id = (int)indexes[RCtoFlat(d, slave, total_mains, total_slaves, 0)];
         //---
         float gr = IsNaNOrInf(sc_gr * ((float)(slave_id == d) - value), 0);
         gr = LocalSum(gr, 1, Temp);
         if(slave == 0)
            grad += gr;                        // only lane 0 accumulates the reduced value
        }
      if(slave == 0)
         data_gr[RCtoFlat(main, head + total_heads, total_mains, 2 * total_heads, 0)] = IsNaNOrInf(grad, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Multiplies a sparse matrix (stored as per-row index/value pairs) by a
//--- dense one: result[r, c] = sum over the sparse entries of row r of
//--- value * full[index, c]. One work-group lane handles one sparse entry.
__kernel void SparseMatMult(__global const float *sparse_index, ///<[in]  [sparse_rows x sparse_cols] dense-row indices
                            __global const float *sparse_data,  ///<[in]  matching sparse values
                            __global const float *full,         ///<[in]  [full_rows x full_cols] dense matrix
                            __global float *result,             ///<[out] [sparse_rows x full_cols] product
                            const int full_rows                 ///< rows of the dense matrix
                           )
  {
   const size_t sparse_row = get_global_id(0);
   const size_t sparse_col = get_local_id(1);
   const size_t full_col = get_global_id(2);
   const size_t sparse_rows = get_global_size(0);
   const size_t sparse_cols = get_local_size(1);
   const size_t full_cols = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- this lane's sparse entry and the dense row it points to
   const int entry = RCtoFlat(sparse_row, sparse_col, sparse_rows, sparse_cols, 0);
   const int dense_row = sparse_index[entry];
//--- entries pointing outside the dense matrix contribute nothing
   float partial = 0;
   if(dense_row >= 0 && dense_row < full_rows)
     {
      const int dense = RCtoFlat(dense_row, full_col, full_rows, full_cols, 0);
      partial = IsNaNOrInf(sparse_data[entry] * full[dense], 0);
     }
//--- reduce the partial products across the work-group; lane 0 publishes
   partial = LocalSum(partial, 1, Temp);
   if(sparse_col == 0)
      result[RCtoFlat(sparse_row, full_col, sparse_rows, full_cols, 0)] = partial;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of SparseMatMult: computes the gradient of the sparse
//--- values (phase 1, reduction over the dense columns) and of the dense
//--- matrix (phase 2, scan over the sparse rows). The kernel is launched on
//--- a grid covering the larger of the two shapes, hence the range guards.
__kernel void SparseMatMultGrad(__global const float *sparse_index, ///<[in]  dense-row indices of the sparse entries
                                __global const float *sparse_data,  ///<[in]  sparse values
                                __global float *sparse_gr,          ///<[out] gradient of the sparse values
                                __global const float *full,         ///<[in]  dense matrix of the forward pass
                                __global float *full_gr,            ///<[out] gradient of the dense matrix
                                __global const float *result_gr,    ///<[in]  gradient of the product
                                const int sparse_rows,
                                const int sparse_cols,
                                const int full_rows,
                                const int full_cols
                               )
  {
   const size_t row_id = get_global_id(0);
   const size_t local_id = get_local_id(1);
   const size_t col_id = get_global_id(2);
   const size_t total_rows = get_global_size(0);
   const size_t total_local = get_local_size(1);
   const size_t total_cols = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- Calce sparse gradient: d(sparse[r,c]) = sum_i result_gr[r,i] * full[idx,i]
   if(row_id < sparse_rows && col_id < sparse_cols)
     {
      float grad = 0;
      int shift_sparse = 0;
      //--- lane 0 shares the dense-row index through local memory
      if(local_id == 0)
        {
         shift_sparse = RCtoFlat(row_id, col_id, sparse_rows, sparse_cols, 0);
         Temp[0] = sparse_index[shift_sparse];
        }
      BarrierLoc
      uint full_row = (uint)Temp[0];           // negative indices wrap to large uints and fail the check below
      if(full_row < (uint)full_rows)
         for(int i = local_id; i < full_cols; i += total_local)
           {
            int shift_result = RCtoFlat(row_id, i, sparse_rows, full_cols, 0);
            int shift_full = RCtoFlat(full_row, i, full_rows, full_cols, 0);
            grad += IsNaNOrInf(result_gr[shift_result] * full[shift_full], 0);
           }
      grad = LocalSum(grad, 1, Temp);
      if(local_id == 0)
         sparse_gr[shift_sparse] = grad;       // shift_sparse is only valid on lane 0, which is the only writer
     }
//--- Calce full gradient: for every sparse row, find the entry pointing at
//--- this dense row (if any) and accumulate its value times the result grad
   if(row_id < full_rows && col_id < full_cols)
     {
      float grad = 0;
      for(int r = 0; r < sparse_rows; r ++)
        {
         float s = 0;
         for(int c = local_id; c < sparse_cols; c += total_local)
           {
            int shift_sparse = RCtoFlat(r, c, sparse_rows, sparse_cols, 0);
            if((int)sparse_index[shift_sparse] == (int)row_id)
              {
               s = sparse_data[shift_sparse];
               break;                          // at most one match per row is used
              }
           }
         s = LocalSum(s, 1, Temp);             // broadcast the found value (or 0) to the group
         if(s != 0 && local_id == 0)
           {
            int shift_result = RCtoFlat(r, col_id, sparse_rows, full_cols, 0);
            grad += IsNaNOrInf(s * result_gr[shift_result], 0);
           }
        }
      if(local_id == 0)
        {
         int shift_full = RCtoFlat(row_id, col_id, full_rows, full_cols, 0);
         full_gr[shift_full] = grad;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Random-walk normalization of an adjacency-like matrix: every row is
//--- scaled by 1 / (row sum + 1), i.e. by the inverse of its degree with an
//--- implicit self-loop. The inverse degrees are also stored separately.
__kernel void RandomWalk(__global const float *data,   ///<[in]  [total_rows x total_cols] source matrix
                         __global float *inv_diag,     ///<[out] per-row inverse degree
                         __global float *norm,         ///<[out] row-normalized matrix
                         const int total_cols          ///< columns of the matrix
                        )
  {
   const size_t row_id = get_global_id(0);
   const size_t local_id = get_local_id(1);
   const size_t total_rows = get_global_size(0);
   const size_t total_local = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- accumulate this row's sum, strided across the work-group lanes
   float row_sum = 0;
   for(int col = local_id; col < total_cols; col += total_local)
      row_sum += IsNaNOrInf(data[RCtoFlat(row_id, col, total_rows, total_cols, 0)], 0);
//--- inverse degree with the self-loop (+1); fall back to 1 on NaN/Inf
   const float inv_d = IsNaNOrInf(1.0f / (LocalSum(row_sum, 1, Temp) + 1.0f), 1.0f);
   if(local_id == 0)
      inv_diag[row_id] = inv_d;
//--- scale every element of the row by the inverse degree
   for(int col = local_id; col < total_cols; col += total_local)
     {
      const int pos = RCtoFlat(row_id, col, total_rows, total_cols, 0);
      norm[pos] = IsNaNOrInf(data[pos] * inv_d, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Concatenates up to three sources into one output row: the raw data plus
//--- one or two embeddings selected per row by its label value
//--- (embedding row = (label / frame) % period). One work-item copies one
//--- value of one source.
__kernel void ConcatByLabel(__global const float* data,       ///<[in]  [total_rows x dimension_data] raw data
                            __global const float* label,      ///<[in]  per-row labels selecting the embedding rows
                            __global const float* embedding1, ///<[in]  [period1 x dimension_emb1] first embedding table
                            __global const float* embedding2, ///<[in]  [period2 x dimension_emb2] second embedding table
                            __global float *output,           ///<[out] concatenated rows
                            const int dimension_data,
                            const int dimension_emb1,
                            const int dimension_emb2,
                            const int frame1,
                            const int frame2,
                            const int period1,
                            const int period2
                           )
  {
   const size_t row_id = get_global_id(0);
   const size_t col_id = get_global_id(1);
   const size_t buffer_id = get_global_id(2);     // which source this work-item copies
   const size_t total_rows = get_global_size(0);
   const size_t total_cols = get_global_size(1);
   const size_t total_buffers = get_global_size(2);
//--- output row width depends on how many sources are concatenated
   int out_width;
   if(total_buffers == 1)
      out_width = dimension_data;
   else if(total_buffers == 2)
      out_width = dimension_data + dimension_emb1;
   else if(total_buffers == 3)
      out_width = dimension_data + dimension_emb1 + dimension_emb2;
   else
      return;
//--- pick the source buffer and compute source/destination offsets
   __global const float *src;
   int src_width;
   int src_offset, dst_offset;
   if(buffer_id == 0)
     {
      src = data;
      src_width = dimension_data;
      src_offset = RCtoFlat(row_id, col_id, total_rows, src_width, 0);
      dst_offset = RCtoFlat(row_id, col_id, total_rows, out_width, 0);
     }
   else if(buffer_id == 1)
     {
      src = embedding1;
      src_width = dimension_emb1;
      // label selects the embedding row, wrapped to the table period
      int emb_row = ((int)IsNaNOrInf(label[row_id] / frame1, 0)) % period1;
      src_offset = RCtoFlat(emb_row, col_id, period1, src_width, 0);
      dst_offset = RCtoFlat(row_id, dimension_data + col_id, total_rows, out_width, 0);
     }
   else // buffer_id == 2
     {
      src = embedding2;
      src_width = dimension_emb2;
      int emb_row = ((int)IsNaNOrInf(label[row_id] / frame2, 0)) % period2;
      src_offset = RCtoFlat(emb_row, col_id, period2, src_width, 0);
      dst_offset = RCtoFlat(row_id, dimension_data + dimension_emb1 + col_id, total_rows, out_width, 0);
     }
//--- columns beyond the source width belong to another buffer's slice
   if(col_id < src_width)
      output[dst_offset] = IsNaNOrInf(src[src_offset], 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Backward pass of ConcatByLabel: splits the gradient of the concatenated
//--- output back into the raw-data gradient (direct copy) and the embedding
//--- gradients (sum over all rows whose label selected that embedding row).
__kernel void ConcatByLabelGrad(__global float* data_gr,        ///<[out] gradient of the raw data
                                __global const float* label,    ///<[in]  per-row labels
                                __global float* embedding1_gr,  ///<[out] gradient of the first embedding table
                                __global float* embedding2_gr,  ///<[out] gradient of the second embedding table
                                __global float *output_gr,      ///<[in]  gradient of the concatenated output
                                const int dimension_data,
                                const int dimension_emb1,
                                const int dimension_emb2,
                                const int frame1,
                                const int frame2,
                                const int period1,
                                const int period2,
                                const int units                 ///< number of valid data rows
                               )
  {
   const size_t row_id = get_global_id(0);
   const size_t col_id = get_global_id(1);
   const size_t buffer_id = get_global_id(2);
   const size_t total_rows = get_global_size(0);
   const size_t total_cols = get_global_size(1);
   const size_t total_buffers = get_global_size(2);
//---
   __global float *buffer;
   int dimension_in, dimension_out;
   int shift_in, shift_out, shift_col;
   int period, frame, rows;
//--- output row width depends on how many sources were concatenated
   switch(total_buffers)
     {
      case 1:
         dimension_out = dimension_data;
         break;
      case 2:
         dimension_out = dimension_data + dimension_emb1;
         break;
      case 3:
         dimension_out = dimension_data + dimension_emb1 + dimension_emb2;
         break;
      default:
         return;
     }
//---
   switch(buffer_id)
     {
      case 0:
         //--- raw-data slice: straight copy of the gradient.
         //--- Bug fix: this branch previously indexed with `dimension_in`,
         //--- which is only assigned in cases 1/2 and was read uninitialized
         //--- here; the data slice width is `dimension_data`.
         if(col_id < dimension_data && row_id < units)
           {
            shift_in = RCtoFlat(row_id, col_id, total_rows, dimension_data, 0);
            shift_out = RCtoFlat(row_id, col_id, total_rows, dimension_out, 0);
            data_gr[shift_in] = IsNaNOrInf(output_gr[shift_out], 0);
           }
         return;
      case 1:
         rows = period1;
         buffer = embedding1_gr;
         dimension_in = dimension_emb1;
         shift_in = RCtoFlat(row_id, col_id, period1, dimension_in, 0);
         shift_col = dimension_data;                     // column offset of this slice in the output
         period = period1;
         frame = frame1;
         break;
      case 2:
         rows = period2;
         buffer = embedding2_gr;
         dimension_in = dimension_emb2;
         shift_in = RCtoFlat(row_id, col_id, period2, dimension_in, 0);
         shift_col = dimension_data + dimension_emb1;
         period = period2;
         frame = frame2;
         break;
     }
//--- embedding gradient: sum the output gradients of every data row whose
//--- label mapped to this embedding row
   if(row_id >= rows || col_id >= dimension_in)
      return;
   float grad = 0;
   for(uint r = 0; r < total_rows; r ++)
     {
      int row = ((int)IsNaNOrInf(label[r] / frame, 0)) % period;
      if(row != row_id)
         continue;
      shift_out = RCtoFlat(r, shift_col + col_id, total_rows, dimension_out, 0);
      grad += IsNaNOrInf(output_gr[shift_out], 0);
     }
   buffer[shift_in] = IsNaNOrInf(grad, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
//--- Mixed global/local attention forward pass. Heads alternate by parity:
//--- even heads attend over all `total_kv` key/value pairs (global), odd
//--- heads attend only over the `total_mask` entries selected per query by
//--- `label` and weighted by `mask` (local). `kv` packs key (.s0) and value
//--- (.s1) per element.
__kernel void GlobalLocalAttention(__global const float *q,
                                   __global const float2* kv,
                                   __global float *scores,
                                   __global const float* mask,
                                   __global const float* label,
                                   __global float *out,
                                   const int dimension,
                                   const int total_kv,
                                   const int total_mask
                                  )
  {
//--- init
   const int q_id = get_global_id(0);
   const int local_id = get_local_id(1);       // key index (even heads) or mask slot (odd heads)
   const int h_id = get_global_id(2);
   const int total_q = get_global_size(0);
   const int total_local = get_local_size(1);
   const int total_heads = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score
   int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
   if(h_id % 2 == 0)
     {
      //--- global head: score against every key/value pair
      const int shift_kv = RCtoFlat(h_id, 0, total_heads, dimension, local_id);
      const int shift_s = RCtoFlat(h_id / 2, local_id, total_heads / 2, total_kv + total_mask, q_id);
      float score = 0;
      if(local_id < total_kv)
        {
         for(int d = 0; d < dimension; d++)
            score += IsNaNOrInf(q[shift_q + d] * kv[shift_kv + d].s0, 0);
        }
      else
         score = MIN_VALUE;                    // padding lanes are excluded from the softmax
      //--- norm score
      score = LocalSoftMax(score, 1, temp);
      if(local_id < total_kv)
         scores[shift_s] = score;
      //--- out: score-weighted sum of the values, one dimension at a time
      for(int d = 0; d < dimension; d++)
        {
         float val = (local_id < total_kv ? kv[shift_kv + d].s1 * score : 0);
         val = LocalSum(val, 1, temp);
         if(local_id == 0)
            out[shift_q + d] = val;
        }
     }
   else
     {
      //--- local head: score only against the labeled neighbors of the query
      int kv_id = -1;
      float score = 0;
      int shift_kv = -1;
      float m = 0;
      const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, q_id);
      if(local_id < total_mask)
        {
         const int l = RCtoFlat(q_id, local_id, total_q, total_mask, 0);
         kv_id = IsNaNOrInf(label[l], -1);     // index of the attended kv pair, -1 = none
         m = IsNaNOrInf(mask[l], 0);
         shift_kv = RCtoFlat(h_id, 0, total_heads, dimension, kv_id);
         if(kv_id >= 0)
            for(int d = 0; d < dimension; d++)
               score += IsNaNOrInf(q[shift_q + d] * kv[shift_kv + d].s0, 0);
         else
            score = MIN_VALUE;
        }
      else
         score = MIN_VALUE;
      //--- norm score, scaled by the mask weight.
      //--- NOTE(review): for lanes with local_id >= total_mask (and for
      //--- kv_id < 0) m stays 0, so score * m == 0 instead of MIN_VALUE —
      //--- those lanes enter the softmax denominator with exp(0 - max)
      //--- rather than being suppressed; verify this is intended.
      score = LocalSoftMax(score * m, 1, temp);
      if(local_id < total_mask)
         scores[shift_s] = score;
      //--- out: invalid lanes (kv_id < 0) contribute nothing to the values
      for(int d = 0; d < dimension; d++)
        {
         float val = (kv_id >= 0 ? IsNaNOrInf(kv[shift_kv + d].s1, 0) * score : 0);
         val = LocalSum(val, 1, temp);
         if(local_id == 0)
            out[shift_q + d] = val;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass for global/local attention. Heads alternate between the
/// dense (global) branch for even h_id and the sparse (mask/label driven)
/// local branch for odd h_id; each score row stores total_kv global columns
/// followed by total_mask local columns.
/// Fixes vs. the previous revision:
///  - in the local Key-gradient section the mask/label row is addressed by
///    the loop variable q_id (it was erroneously global_id, i.e. the key
///    index), matching the local Value-gradient section;
///  - reads through kv_id-derived offsets and the mask_gr store are guarded
///    against kv_id < 0 / lanes beyond total_mask, mirroring the forward
///    pass "kv_id >= 0" check.
__kernel void GlobalLocalAttentionGrad(__global const float *q,
                                       __global float *q_gr,
                                       __global const float *kv,
                                       __global float *kv_gr,
                                       __global float *scores,
                                       __global const float *mask,
                                       __global float *mask_gr,
                                       __global const float *label,
                                       __global float *out_gr,
                                       const int dimension,
                                       const int total_q,
                                       const int total_kv,
                                       const int total_mask
                                      )
  {
//--- init
   const int global_id = get_global_id(0);
   const int local_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_global = get_global_size(0);
   const int total_local = get_local_size(1);
   const int total_heads = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//---
   if(h_id % 2 == 0)
     {
      //--- Value Gradient global_id -> v_id, local_id -> q_id
      for(int d = 0; d < dimension; d++)
        {
         const int shift_v = RCtoFlat(h_id, 2 * d + 1, total_heads, 2 * dimension, global_id);
         float grad = 0;
         for(int q_id = local_id; q_id < total_q; q_id += total_local)
           {
            int shift_s = RCtoFlat(h_id / 2, global_id, total_heads / 2, total_kv + total_mask, q_id);
            int shift_q = RCtoFlat(h_id, d, total_heads, dimension, q_id);
            grad += IsNaNOrInf(scores[shift_s] * out_gr[shift_q], 0);
           }
         grad = LocalSum(grad, 1, temp);
         kv_gr[shift_v] = grad;
        }
      //--- Query Gradient global_id -> q_id, local_id -> k_id/v_id
      if(global_id < total_q)
        {
         //--- 1. Score grad: d(out)/d(score[q, k]) = v[k]
         float grad_s = 0;
         const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, local_id);
         const int shift_s = RCtoFlat(h_id / 2, local_id, total_heads / 2, total_kv + total_mask, global_id);
         int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
         if(local_id < total_kv)
            for(int d = 0; d < dimension; d++)
               grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0);
         //--- 2. SoftMax grad
         grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
         //--- 3. Query grad: d(score)/d(q) = k, reduced over keys
         const int shift_k = shift_v - 1;
         for(int d = 0; d < dimension; d++)
           {
            float grad = 0;
            if(local_id < total_kv)
               grad = kv[shift_k + 2 * d] * grad_s;
            grad = LocalSum(grad, 1, temp);
            if(local_id == 0)
               q_gr[shift_q + d] = grad;
           }
        }
      //--- Key Gradient global_id -> k_id, local_id -> score_id/v_id/dimension
      if(global_id < total_kv)
        {
         float grad = 0;
         for(int q_id = 0; q_id < total_q; q_id++)
           {
            //--- 1. Score grad local_id -> score_id/v_id
            float grad_s = 0;
            const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, local_id);
            const int shift_s = RCtoFlat(h_id / 2, local_id, total_heads / 2, total_kv + total_mask, q_id);
            int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
            if(local_id < total_kv)
               for(int d = 0; d < dimension; d++)
                  grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0);
            //--- 2. SoftMax grad; broadcast the k == global_id column to all lanes
            grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
            BarrierLoc
            if(global_id == local_id)
               temp[0] = grad_s;
            BarrierLoc
            grad_s = temp[0];
            //--- 3. Key grad local_id -> dimension
            shift_q = RCtoFlat(h_id, local_id, total_heads, dimension, q_id);
            if(local_id < dimension)
               grad += IsNaNOrInf(q[shift_q] * grad_s, 0);
           }
         const int shift_k = RCtoFlat(h_id, 2 * local_id, total_heads, 2 * dimension, global_id);
         if(local_id < dimension)
            kv_gr[shift_k] = IsNaNOrInf(grad, 0);
        }
     }
   else
     {
      //--- Value Gradient global_id -> v_id, local_id -> mask_index/dimension
      if(global_id < total_kv)
        {
         float grad = 0;
         for(int q_id = 0; q_id < total_q; q_id++)
           {
            //--- 1. kv_id
            int kv_id = -1;
            float m = 0;
            const int l = RCtoFlat(q_id, local_id, total_q, total_mask, 0);
            const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, q_id);
            //--- Check for use current Value
            if(local_id < total_mask)
               kv_id = (int)label[l];
            if(local_id == 0)
               temp[0] = 0;
            BarrierLoc
            if(kv_id == global_id)
               temp[0] = scores[shift_s];
            BarrierLoc
            //--- temp[0] is shared, so the continue is uniform over the group
            if(temp[0] == 0)
               continue;
            //--- Value grad
            int shift_q = RCtoFlat(h_id, local_id, total_heads, dimension, q_id);
            if(local_id < dimension)
               grad += IsNaNOrInf(temp[0] * out_gr[shift_q], 0);
           }
         const int shift_v = RCtoFlat(h_id, 2 * local_id + 1, total_heads, 2 * dimension, global_id);
         if(local_id < dimension)
            kv_gr[shift_v] = IsNaNOrInf(grad, 0);
        }
      //--- Query Gradient global_id -> q_id, local_id -> mask label
      if(global_id < total_q)
        {
         //--- 1. kv_id;
         int kv_id = -1;
         float m = 0;
         const int l = RCtoFlat(global_id, local_id, total_q, total_mask, 0);
         if(local_id < total_mask)
           {
            kv_id = (int)IsNaNOrInf(label[l], -1);
            m = IsNaNOrInf(mask[l], 0);
           }
         //--- 2. Score grad
         float grad_s = 0;
         const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, kv_id);
         const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, global_id);
         int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
         //--- guard kv reads against unlabeled lanes (kv_id < 0)
         if(local_id < total_mask && kv_id >= 0)
            for(int d = 0; d < dimension; d++)
               grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0);
         //--- 3. SoftMax grad
         float score = IsNaNOrInf(scores[shift_s], 0);
         grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
         //--- store the mask gradient only from lanes owning a valid mask cell
         if(local_id < total_mask)
            mask_gr[l] = IsNaNOrInf(grad_s * score, 0);
         grad_s *= m;
         //--- 4. Query grad
         const int shift_k = shift_v - 1;
         for(int d = 0; d < dimension; d++)
           {
            float grad = 0;
            if(local_id < total_mask && kv_id >= 0)
               grad = kv[shift_k + 2 * d] * grad_s;
            grad = LocalSum(grad, 1, temp);
            if(local_id == 0)
               q_gr[shift_q + d] = grad;
           }
        }
      //--- Key Gradient global_id -> k_id, local_id -> score_id/v_id/dimension
      if(global_id < total_kv)
        {
         float grad = 0;
         for(int q_id = 0; q_id < total_q; q_id++)
           {
            //--- 1. kv_id;
            int kv_id = -1;
            float m = 0;
            //--- FIX: the mask/label row belongs to the current query q_id
            //--- (was indexed by global_id, i.e. the key index)
            const int l = RCtoFlat(q_id, local_id, total_q, total_mask, 0);
            if(local_id < total_mask)
              {
               kv_id = (int)label[l];
               if(kv_id == global_id)
                  m = mask[l];
              }
            //--- LocalSum broadcasts the group total, so the continue is uniform
            m = LocalSum(m, 1, temp);
            if(m == 0)
               continue;
            //--- 2. Score grad local_id -> score_id/v_id
            float grad_s = 0;
            const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, kv_id);
            const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, q_id);
            int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
            if(local_id < total_mask && kv_id >= 0)
               for(int d = 0; d < dimension; d++)
                  grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0);
            //--- 3. SoftMax grad
            grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
            BarrierLoc
            if(global_id == local_id)
               temp[0] = grad_s * m;
            BarrierLoc
            grad_s = temp[0];
            //--- 4. Key grad local_id -> dimension
            shift_q = RCtoFlat(h_id, local_id, total_heads, dimension, q_id);
            if(local_id < dimension)
               grad += IsNaNOrInf(q[shift_q] * grad_s, 0);
           }
         const int shift_k = RCtoFlat(h_id, 2 * local_id, total_heads, 2 * dimension, global_id);
         if(local_id < dimension)
            kv_gr[shift_k] = IsNaNOrInf(grad, 0);
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Sparse SoftMax: each work-group normalizes one row, keeping only the
/// `out_dimension` largest elements. The rank ("position") of every element
/// is found through a tiled comparison in local memory; elements ranked
/// outside the top `out_dimension` are pushed to MIN_VALUE before the
/// group-wide SoftMax, so they get (near-)zero probability and are not
/// stored. Kept probabilities are written in rank order together with the
/// originating column index.
__kernel void SparseSoftMax(__global const float *data,
                            __global float *outputs,
                            __global float *indexes,
                            const int out_dimension
                           )
  {
   const size_t row = get_global_id(0);
   const size_t col_in = get_local_id(1);
   const int total_rows = (int)get_global_size(0);
   const int total_cols_in = (int)get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int ls = min(total_cols_in, (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_in = RCtoFlat(row, col_in, total_rows, total_cols_in, 0);
//--- calc position: count elements ranked above this one, tile by tile
   float value = IsNaNOrInf(data[shift_in], MIN_VALUE);
   int position = 0;
   for(int l = 0; l < total_cols_in; l += ls)
     {
      if(col_in >= l && col_in < (l + ls))
         Temp[col_in - l] = value;
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i == (col_in - l))
            continue;
         if(Temp[i] > value)
            position++;
         else
            // ties are broken by column index so ranks stay unique
            if(Temp[i] == value && i < (col_in - l))
               position++;
        }
      BarrierLoc
     }
//--- SoftMax: suppress everything ranked outside the top `out_dimension`
   if(position >= out_dimension)
      value = MIN_VALUE;
   value = LocalSoftMax(value, 1, Temp);
//--- result: store probability and source column at the element's rank
   const int shift_out = RCtoFlat(row, position, total_rows, out_dimension, 0);
   if(position < out_dimension)
     {
      outputs[shift_out] = value;
      indexes[shift_out] = (float)col_in;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of SparseSoftMax. Each input column looks up the rank
/// ("position") it was stored at in the forward pass by scanning the saved
/// `indexes` in local-memory tiles; dropped columns keep value = grad = 0
/// and only receive the shared correction term from LocalSoftMaxGrad.
__kernel void SparseSoftMaxGrad(__global float *data_gr,
                                __global const float *outputs,
                                __global const float *outputs_gr,
                                __global const float *indexes,
                                const int out_dimension
                               )
  {
   const size_t row = get_global_id(0);
   const size_t col_in = get_local_id(1);
   const int total_rows = (int)get_global_size(0);
   const int total_cols_in = (int)get_local_size(1);
//---
   __local int Ind[LOCAL_ARRAY_SIZE];
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int ls = min(total_cols_in, (int)LOCAL_ARRAY_SIZE);
//--- look position: find which output slot (if any) holds this column
   float value = 0;
   float grad = 0;
   int position = -1;
   int idx = -1;
   const int shift_idx = RCtoFlat(row, col_in, total_rows, out_dimension, 0);
   if(col_in < out_dimension)
      idx = (int)IsNaNOrInf(indexes[shift_idx], -1.0f);
   for(int l = 0; l < out_dimension; l += ls)
     {
      if(col_in >= l && col_in < (l + ls))
         Ind[col_in - l] = idx;
      BarrierLoc
      for(int i = 0; (i < ls && position < 0); i++)
        {
         if(Ind[i] == col_in)
            position = l + i;
        }
      BarrierLoc
     }
//--- SoftMax Grad: kept columns load their saved probability and incoming
//--- gradient, the rest stay zero
   if(position < out_dimension && position >= 0)
     {
      const int shift_out = RCtoFlat(row, position, total_rows, out_dimension, 0);
      value = IsNaNOrInf(outputs[shift_out], 0);
      grad = IsNaNOrInf(outputs_gr[shift_out], 0);
     }
   grad = LocalSoftMaxGrad(value, grad, 1, Temp);
//--- result
   const int shift_in = RCtoFlat(row, col_in, total_rows, total_cols_in, 0);
   data_gr[shift_in] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Convert a float membrane potential into a signed unit spike.
/// A value whose magnitude reaches its per-element level fires a spike of
/// +/-1 and is replaced by the signed residual |value| - level; otherwise
/// the output is 0 and the stored value is left untouched.
__kernel void FloatToSpike(__global float* values,
                           __global const float* levels,
                           __global float* outputs
                          )
  {
   const size_t i = get_global_id(0);
   const float x = IsNaNOrInf(values[i], 0.0f);
   const float threshold = (x != 0.0f ? IsNaNOrInf(levels[i], 0.0f) : 0.0f);
//--- below-threshold (or zero) input produces no spike and keeps the value
   if(x == 0.0f || fabs(x) < threshold)
     {
      outputs[i] = 0.0f;
      return;
     }
//--- emit a signed unit spike and keep the residual potential
   const float s = (float)sign(x);
   outputs[i] = s;
   values[i] = IsNaNOrInf(s * (fabs(x) - threshold), 0.0f);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of FloatToSpike.
/// The value gradient is the plain pass-through of the output gradient;
/// the level gradient is -sign(value) * grad for elements with a non-zero
/// gradient (lowering the level raises the signed residual), else 0.
__kernel void FloatToSpikeGrad(__global const float* values,
                               __global float* values_gr,
                               __global float* levels_gr,
                               __global const float* gradients
                              )
  {
   const size_t i = get_global_id(0);
//--- straight-through estimate for the input gradient
   const float g = IsNaNOrInf(gradients[i], 0.0f);
   values_gr[i] = g;
//--- threshold gradient only where a gradient actually flows
   float level_grad = 0.0f;
   if(fabs(g) > 0.0f)
     {
      const float x = IsNaNOrInf(values[i], 0.0f);
      level_grad = (float)(-sign(x) * g);
     }
   levels_gr[i] = level_grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Forward pass of spiking multi-head self-attention over an interleaved
/// QKV buffer (per token: query, key and value rows back to back). One
/// work-group handles one (query, head) pair; the local id enumerates
/// keys/values. A learnable per-token bias is added on the score diagonal;
/// zero activations (absent spikes) are skipped in the dot products.
/// NOTE(review): masking keeps pairs with q_id <= k_id, i.e. attention to
/// the "future" side — confirm against the caller's intended direction.
__kernel void SpikeMHAttention(__global const float *qkv,
                               __global const float *diag_bias,
                               __global float *scores,
                               __global float *out,
                               const int dimension,
                               const int mask_future
                              )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_q = get_global_size(0);
   const int total_heads = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Shifts (token t occupies rows 3t .. 3t+2 of the interleaved buffer)
   const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, 3 * q_id);
   const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 3 * k_id + 1);
   const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * k_id + 2);
   const int shift_s = RCtoFlat(h_id, k_id, total_heads, total_q, q_id);
   const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
//--- Score
   float score = 0;
   if(mask_future == 0 || q_id <= k_id)
     {
      for(int d = 0; d < dimension; d++)
        {
         // skip inactive (zero) spike components to save multiplications
         float q = IsNaNOrInf(qkv[shift_q + d], 0);
         if(q == 0)
            continue;
         float k = IsNaNOrInf(qkv[shift_k + d], 0);
         if(k == 0)
            continue;
         score += q * k;
        }
     }
   else
      score = MIN_VALUE;
   if(q_id == k_id)
      score += IsNaNOrInf(diag_bias[q_id], 0);
//--- norm score: SoftMax across the work-group (over keys)
   score = LocalSoftMax(score, 1, temp);
   scores[shift_s] = score;
//--- out: probability-weighted sum of values, reduced across the group
   for(int d = 0; d < dimension; d++)
     {
      float val = 0;
      if(score > 0)
        {
         float v = IsNaNOrInf(qkv[shift_v + d], 0);
         if(v != 0)
            val = v * score;
        }
      val = LocalSum(val, 1, temp);
      if(k_id == 0)
         out[shift_out + d] = val;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of SpikeMHAttention (interleaved QKV layout: token t uses
/// rows 3t..3t+2; `gradients` mirrors the forward `out` buffer with one
/// densely packed row per query token). The three sections reuse the same
/// work-items with different index meanings.
/// Fix: the Key-gradient section previously read the output gradient with
/// the qkv offset (token 3*q_id) instead of the output-buffer offset
/// (token q_id), as the Value/Query sections already do via shift_out.
__kernel void SpikeMHAttentionGrad(__global const float *qkv,
                                   __global float *qkv_gr,
                                   __global const float *diag_bias,
                                   __global float *diag_bias_gr,
                                   __global const float *scores,
                                   __global const float *gradients,
                                   const int dimension,
                                   const int mask_future
                                  )
  {
//--- init
   const int global_id = get_global_id(0);
   const int local_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_global = get_global_size(0);
   const int total_local = get_local_size(1);
   const int total_heads = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Value Gradient global_id -> v_id, local_id -> q_id
   {
      //--- Shifts
      const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * global_id + 2);
      const int shift_s = RCtoFlat(h_id, global_id, total_heads, total_global, local_id);
      const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, local_id);
      for(int d = 0; d < dimension; d++)
        {
         //--- d(out[q])/d(v) = score[q, v]; reduce over queries
         float grad = 0;
         if(mask_future == 0 || local_id <= global_id)
           {
            float score = IsNaNOrInf(scores[shift_s], 0);
            if(score > 0)
               grad = IsNaNOrInf(score * gradients[shift_out + d], 0);
           }
         grad = LocalSum(grad, 1, temp);
         if(local_id == 0)
            qkv_gr[shift_v + d] = grad;
        }
   }
//--- Query Gradient global_id -> q_id, local_id -> k_id/v_id
   {
      //--- Shifts
      const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, 3 * global_id);
      const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 3 * local_id + 1);
      const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * local_id + 2);
      const int shift_s = RCtoFlat(h_id, local_id, total_heads, total_local, global_id);
      const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
      //--- 1. Score grad: d(out)/d(score[q, k]) = v[k]
      float grad_s = 0;
      if(mask_future == 0 || global_id <= local_id)
         for(int d = 0; d < dimension; d++)
           {
            float val = IsNaNOrInf(qkv[shift_v + d], 0);
            if(val == 0)
               continue;
            grad_s += IsNaNOrInf(val * gradients[shift_out + d], 0);
           }
      //--- 2. SoftMax grad; the diagonal lane also owns the bias gradient
      grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
      if(global_id == local_id)
         diag_bias_gr[global_id] = grad_s;
      //--- 3. Query grad: d(score)/d(q) = k, reduced over keys
      for(int d = 0; d < dimension; d++)
        {
         float grad = 0;
         if(mask_future == 0 || global_id <= local_id)
           {
            float key = IsNaNOrInf(qkv[shift_k + d], 0);
            if(key != 0)
               grad = key * grad_s;
           }
         grad = LocalSum(grad, 1, temp);
         if(local_id == 0)
            qkv_gr[shift_q + d] = grad;
        }
   }
//--- Key Gradient global_id -> k_id, local_id -> score_id/v_id/dimension
   {
      //--- Shifts
      const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 3 * global_id + 1);
      const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * local_id + 2);
      float grad = 0;
      for(int q_id = 0; q_id < total_local; q_id++)
        {
         //--- 1. Score grad local_id -> score_id/v_id
         float grad_s = 0;
         const int shift_s = RCtoFlat(h_id, local_id, total_heads, total_local, q_id);
         //--- query row inside the interleaved qkv buffer
         const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, 3 * q_id);
         //--- FIX: matching row of the densely packed output-gradient buffer
         const int shift_og = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
         if(mask_future == 0 || q_id <= local_id)
            for(int d = 0; d < dimension; d++)
              {
               float val = IsNaNOrInf(qkv[shift_v + d], 0);
               if(val == 0)
                  continue;
               grad_s += IsNaNOrInf(val * gradients[shift_og + d], 0);
              }
         //--- 2. SoftMax grad; broadcast the k == global_id column to all lanes
         grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
         BarrierLoc
         if(global_id == local_id)
            temp[0] = grad_s;
         BarrierLoc
         grad_s = temp[0];
         //--- 3. Key grad local_id -> dimension
         if(local_id < dimension)
           {
            float query = IsNaNOrInf(qkv[shift_q + local_id], 0);
            if(query != 0)
               grad += IsNaNOrInf(query * grad_s, 0);
           }
        }
      if(local_id < dimension)
         qkv_gr[shift_k + local_id] = IsNaNOrInf(grad, 0);
   }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Splits every input value into head-specific streams gated by learnable
/// time and spatial masks. Heads 0..2 cache the raw input and the two mask
/// values in local memory; after the barrier each head scales the shared
/// input by its gate:
///   head 0 -> raw input, head 1 -> m_time, head 2 -> (1 - m_time),
///   head 3 -> m_spatial, head 4 -> (1 - m_spatial).
/// NOTE(review): the writer switch uses cases 0..2 while the consumer
/// switch uses cases 1..4, so total_heads is presumably 5 — confirm with
/// the host-side work-group setup. The input offset passes 1 as the last
/// RCtoFlat argument — verify the intended page/layout.
__kernel void STFS(__global const float* inputs,
                   __global const float* mask_time,
                   __global const float* mask_spatial,
                   __global float* outputs
                  )
  {
   const size_t time_id = get_global_id(0);
   const size_t spat_id = get_global_id(1);
   const size_t head = get_local_id(2);
   const size_t total_times = get_global_size(0);
   const size_t total_spats = get_global_size(1);
   const size_t total_heads = get_local_size(2);
//---
   __local float temp[3];
//---
   const int shift_in = RCtoFlat(time_id, spat_id, total_times, total_spats, 1);
   const int shift_out = RCtoFlat(time_id, spat_id, total_times, total_spats, head);
//--- each of the first three heads loads one shared operand
   switch(head)
     {
      case 0:
         temp[0] = IsNaNOrInf(inputs[shift_in], 0);
         break;
      case 1:
         temp[1] = IsNaNOrInf(mask_time[time_id], 0);
         break;
      case 2:
         temp[2] = IsNaNOrInf(mask_spatial[spat_id], 0);
         break;
     }
   BarrierLoc
//--- gate the shared input per head (head 0 passes it through unscaled)
   float out = temp[0];
   if(out != 0)
      switch(head)
        {
         case 1:
            out *= temp[1];
            break;
         case 2:
            out *= (1 - temp[1]);
            break;
         case 3:
            out *= temp[2];
            break;
         case 4:
            out *= (1 - temp[2]);
            break;
        }
//---
   outputs[shift_out] = IsNaNOrInf(out, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of STFS: applies the same per-head gates to the output
/// gradients and reduces them over the head dimension into the single
/// input-gradient cell. Gradients for the masks themselves are not
/// produced here.
/// NOTE(review): LocalSum is called with second argument 2 and the small
/// temp[5] buffer — presumably an offset that preserves the cached masks
/// in temp[1]/temp[2]; confirm the helper's signature.
__kernel void STFSGrad(__global float* inputs_gr,
                       __global const float* mask_time,
                       __global const float* mask_spatial,
                       __global const float* outputs_gr
                      )
  {
   const size_t time_id = get_global_id(0);
   const size_t spat_id = get_global_id(1);
   const size_t head = get_local_id(2);
   const size_t total_times = get_global_size(0);
   const size_t total_spats = get_global_size(1);
   const size_t total_heads = get_local_size(2);
//---
   __local float temp[5];
//---
   const int shift_in = RCtoFlat(time_id, spat_id, total_times, total_spats, 1);
   const int shift_out = RCtoFlat(time_id, spat_id, total_times, total_spats, head);
//--- heads 0 and 1 cache the two mask values for the whole group
   switch(head)
     {
      case 0:
         temp[1] = IsNaNOrInf(mask_time[time_id], 0);
         break;
      case 1:
         temp[2] = IsNaNOrInf(mask_spatial[spat_id], 0);
         break;
     }
   BarrierLoc
//--- gate the incoming gradient exactly as the forward pass gated the input
   float grad = IsNaNOrInf(outputs_gr[shift_out], 0);
   if(grad != 0)
      switch(head)
        {
         case 1:
            grad *= temp[1];
            break;
         case 2:
            grad *= (1 - temp[1]);
            break;
         case 3:
            grad *= temp[2];
            break;
         case 4:
            grad *= (1 - temp[2]);
            break;
        }
//--- sum over heads and store once per input cell
   grad = LocalSum(grad, 2, temp);
   BarrierLoc
   if(head == 0)
      inputs_gr[shift_in] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Shift a FIFO history stack one step deeper and push the current inputs
/// into slot 0. The stack rows are processed in chunks of the local size
/// from the deepest chunk down to slot 0; each iteration reads its source
/// value before the barrier and writes the shifted value after it, so no
/// lane overwrites a not-yet-read source.
/// Fix: the temporary was declared `int`, silently truncating the float
/// values shifted through the stack; it is now `float`.
/// NOTE(review): the inputs offset passes 1 as the last RCtoFlat argument —
/// mirrors the original; confirm the intended page/layout.
__kernel void AddToStack(__global const float* inputs,
                         __global float* stack,
                         const int stack_size)
  {
   const size_t id = get_global_id(0);
   const size_t loc_id = get_local_id(1);
   const size_t var = get_global_id(2);
   const size_t dimension = get_global_size(0);
   const size_t total_loc = get_local_size(1);
   const size_t variables = get_global_size(2);
//--- number of local-size chunks needed to cover the stack
   const int total = (stack_size - 1) / total_loc;
   for(int i = total; i >= 0; i--)
     {
      //--- value that will move into position (i * total_loc + loc_id):
      //--- slot 0 takes the fresh input, every other slot takes slot - 1
      float inp = 0;
      if(i == 0 && loc_id == 0)
         inp = IsNaNOrInf(inputs[RCtoFlat(var, id, variables, dimension, 1)], 0);
      else
         if((i * total_loc + loc_id) < stack_size)
           {
            int shift = RCtoFlat(i * total_loc + loc_id - 1, id, stack_size, dimension, var);
            inp = IsNaNOrInf(stack[shift], 0);
           }
      BarrierLoc
      //--- write after the barrier so all reads of this chunk are done
      if((i * total_loc + loc_id) < stack_size)
        {
         int shift = RCtoFlat(i * total_loc + loc_id, id, stack_size, dimension, var);
         stack[shift] = inp;
        }
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Multi-scale temporal aggregation: level 0 passes the raw input through;
/// each further level l folds the previous level's aggregate in at weight
/// 1/2^l and adds the next window of 2^l stack entries (indices
/// 2^l - 1 .. 2^(l+1) - 2), each at weight 1/2^l. Entries beyond the stack
/// depth are skipped, so aggregates near the history start are partial.
/// NOTE(review): the stack offset uses `variables * levels` as the row
/// count while AddToStack stores rows by `stack_size` — confirm the two
/// layouts agree on the host side.
__kernel void AggregationByTime(__global const float* inputs,
                                __global const float* stack,
                                __global float* outputs,
                                const int stack_size,
                                const int levels
                               )
  {
   const size_t id = get_global_id(0);
   const size_t var = get_global_id(1);
   const size_t dimension = get_global_size(0);
   const size_t variables = get_global_size(1);
//--- level 0: pass the current value through
   float val = IsNaNOrInf(inputs[RCtoFlat(var, id, variables, dimension, 0)], 0);
   outputs[RCtoFlat(var, id, variables, dimension, 0)] = val;
   for(int l = 1; l < levels; l++)
     {
      int total = 1 << l;
      int start = total - 1;
      //--- carry the previous level's aggregate at weight 1/total
      val /= total;
      //--- add the next window of history entries at weight 1/total each
      for(int s = 0; s < total; s++)
        {
         if(s + start >= stack_size)
            continue;
         val += IsNaNOrInf(stack[RCtoFlat(var, id, variables * levels, dimension, start + s)] / total, 0);
        }
      outputs[RCtoFlat(var, id, variables, dimension, l)] = IsNaNOrInf(val, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of AggregationByTime with respect to the current input.
/// The input entered level l with weight 1/2^l, so its gradient is the
/// correspondingly weighted sum of the per-level output gradients.
__kernel void AggregationByTimeGrad(__global float* inputs_gr,
                                    __global const float* outputs_gr,
                                    const int levels
                                   )
  {
   const size_t id = get_global_id(0);
   const size_t var = get_global_id(1);
   const size_t dimension = get_global_size(0);
   const size_t variables = get_global_size(1);
//--- accumulate the weighted per-level gradients
   float acc = 0;
   for(int l = 0; l < levels; l++)
     {
      const int window = 1 << l;
      const float g = outputs_gr[RCtoFlat(var, id, variables, dimension, l)];
      acc += IsNaNOrInf(g / window, 0);
     }
   inputs_gr[RCtoFlat(var, id, variables, dimension, 0)] = IsNaNOrInf(acc, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// GRU cell forward pass. XH holds six rows of pre-activations per unit:
/// rows 0..2 come from the input (z, r, h parts), rows 3..5 from the
/// previous hidden state. Output: (1 - z) * prev + z * tanh(r * hh + xh).
__kernel void GRU(__global const float* XH,
                  __global const float* prev_state,
                  __global float* outputs
                 )
  {
   const size_t id = get_global_id(0);
   const size_t d = get_global_id(1);
   const size_t units = get_global_size(0);
   const size_t dimension = get_global_size(1);
//--- pre-activations: rows 0..2 input part, rows 3..5 hidden part
   const float xz = IsNaNOrInf(XH[RCtoFlat(0, d, 6, dimension, id)], 0);
   const float xr = IsNaNOrInf(XH[RCtoFlat(1, d, 6, dimension, id)], 0);
   const float xh = IsNaNOrInf(XH[RCtoFlat(2, d, 6, dimension, id)], 0);
   const float hz = IsNaNOrInf(XH[RCtoFlat(3, d, 6, dimension, id)], 0);
   const float hr = IsNaNOrInf(XH[RCtoFlat(4, d, 6, dimension, id)], 0);
   const float hh = IsNaNOrInf(XH[RCtoFlat(5, d, 6, dimension, id)], 0);
   const int out_pos = RCtoFlat(id, d, units, dimension, 0);
   const float hidden = IsNaNOrInf(prev_state[out_pos], 0);
//--- reset and update gates
   const float reset = fActivation(xr + hr, ActFunc_SIGMOID);
   const float update = fActivation(xz + hz, ActFunc_SIGMOID);
//--- candidate state and convex mix with the previous state
   const float candidate = fActivation(reset * hh + xh, ActFunc_TANH);
   const float result = (1 - update) * hidden + update * candidate;
//---
   outputs[out_pos] = IsNaNOrInf(result, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// GRU cell backward pass: recomputes the forward gate activations from
/// the saved pre-activations and propagates the output gradient back to
/// the six XH pre-activation rows (the prev-state gradient is not
/// produced here). Layout matches the forward kernel.
__kernel void GRU_Grad(__global const float* XH,
                       __global float * XH_gr,
                       __global const float* prev_state,
                       __global const float* outputs_gr
                      )
  {
   const size_t id = get_global_id(0);
   const size_t d = get_global_id(1);
   const size_t units = get_global_size(0);
   const size_t dimension = get_global_size(1);
//--- re-read the forward pre-activations (rows 0..2 input, 3..5 hidden)
   const float xz = IsNaNOrInf(XH[RCtoFlat(0, d, 6, dimension, id)], 0);
   const float xr = IsNaNOrInf(XH[RCtoFlat(1, d, 6, dimension, id)], 0);
   const float xh = IsNaNOrInf(XH[RCtoFlat(2, d, 6, dimension, id)], 0);
   const float hz = IsNaNOrInf(XH[RCtoFlat(3, d, 6, dimension, id)], 0);
   const float hr = IsNaNOrInf(XH[RCtoFlat(4, d, 6, dimension, id)], 0);
   const float hh = IsNaNOrInf(XH[RCtoFlat(5, d, 6, dimension, id)], 0);
   const int out_pos = RCtoFlat(id, d, units, dimension, 0);
   const float hidden = IsNaNOrInf(prev_state[out_pos], 0);
   const float out_grad = IsNaNOrInf(outputs_gr[out_pos], 0);
//--- recompute the forward gate activations
   const float reset = fActivation(xr + hr, ActFunc_SIGMOID);
   const float update = fActivation(xz + hz, ActFunc_SIGMOID);
   const float candidate = fActivation(reset * hh + xh, ActFunc_TANH);
//--- out = (1 - update) * hidden + update * candidate
   const float cand_grad = IsNaNOrInf(out_grad * update, 0);
   const float update_grad = IsNaNOrInf(out_grad * (candidate - hidden), 0);
//--- back through tanh of the candidate
   const float xh_grad = Deactivation(cand_grad, candidate, ActFunc_TANH);
   const float hh_grad = IsNaNOrInf(xh_grad * reset, 0);
   const float reset_grad = IsNaNOrInf(xh_grad * hh, 0);
//--- back through the sigmoid gates; the input and hidden parts of each
//--- gate share the same pre-activation gradient
   const float xz_grad = Deactivation(update_grad, update, ActFunc_SIGMOID);
   const float xr_grad = Deactivation(reset_grad, reset, ActFunc_SIGMOID);
//--- store gradients in the same 6-row layout as XH
   XH_gr[RCtoFlat(0, d, 6, dimension, id)] = IsNaNOrInf(xz_grad, 0);
   XH_gr[RCtoFlat(1, d, 6, dimension, id)] = IsNaNOrInf(xr_grad, 0);
   XH_gr[RCtoFlat(2, d, 6, dimension, id)] = IsNaNOrInf(xh_grad, 0);
   XH_gr[RCtoFlat(3, d, 6, dimension, id)] = IsNaNOrInf(xz_grad, 0);
   XH_gr[RCtoFlat(4, d, 6, dimension, id)] = IsNaNOrInf(xr_grad, 0);
   XH_gr[RCtoFlat(5, d, 6, dimension, id)] = IsNaNOrInf(hh_grad, 0);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Scale each vector by its per-vector scalar: out[v, d] = s[v] * in[v, d].
__kernel void ScalarToVector(__global const float* scalar,
                             __global const float* vector_in,
                             __global float* vector_out
                            )
  {
   const size_t vec = get_global_id(0);
   const size_t d = get_global_id(1);
   const size_t vectors = get_global_size(0);
   const size_t dimension = get_global_size(1);
//--- multiply the component by its vector's scalar gain
   const int idx = RCtoFlat(vec, d, vectors, dimension, 0);
   const float gain = IsNaNOrInf(scalar[vec], 0.0f);
   const float component = IsNaNOrInf(vector_in[idx], 0.0f);
   vector_out[idx] = IsNaNOrInf(gain * component, 0.0f);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of ScalarToVector (out = scalar * vector).
/// d(out)/d(vector) = scalar per component; d(out)/d(scalar) is the dot
/// product of the vector with the output gradient, reduced across the
/// work-group. Lane 0 broadcasts the scalar through local memory and
/// stores the reduced scalar gradient.
__kernel void ScalarToVectorGrad(__global const float* scalar,
                                 __global float* scalar_gr,
                                 __global const float* vector_in,
                                 __global float* vector_in_gr,
                                 __global float* vector_out_gr,
                                 const int dimension
                                )
  {
   const size_t vec = get_global_id(0);
   const size_t loc = get_local_id(1);
   const size_t vectors = get_global_size(0);
   const size_t total_loc = get_local_size(1);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- broadcast the per-vector scalar to all lanes
   if(loc == 0)
      temp[0] = IsNaNOrInf(scalar[vec], 0.0f);
   BarrierLoc
   float sc = temp[0];
   float sc_gr = 0;
   for(int d = loc; d < dimension; d += total_loc)
     {
      int shift = RCtoFlat(vec, d, vectors, dimension, 0);
      float v = IsNaNOrInf(vector_in[shift], 0.0f);
      float grad = IsNaNOrInf(vector_out_gr[shift], 0.0f);
      vector_in_gr[shift] = IsNaNOrInf(grad * sc, 0.0f);
      sc_gr += IsNaNOrInf(v * grad, 0.0f);
     }
//--- reduce the partial dot products and store once
   sc_gr = LocalSum(sc_gr, 1, temp);
   if(loc == 0)
      scalar_gr[vec] = sc_gr;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// First difference of a value stream: flow = value - prev_value, then the
/// current value is stored as the new prev_value for the next call.
__kernel void CalcFlow(__global const float* value,
                       __global float* prev_value,
                       __global float* flow
                      )
  {
   const size_t id = get_global_id(0);
   const size_t total = get_global_size(0);
//--- difference between the current and stored value
   const float current = IsNaNOrInf(value[id], 0);
   const float previous = IsNaNOrInf(prev_value[id], 0);
   flow[id] = IsNaNOrInf(current - previous, 0);
//--- remember the current value for the next invocation
   prev_value[id] = current;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Dot-product correlation between element `main` and a partner shifted by
/// +/- shifts[sh >> 1] (even sh -> negative direction flag off, odd sh ->
/// the mirrored shift). The feature dot product is tiled over the local
/// dimension and reduced with LocalSum; out-of-range partners give zero.
/// The early return is uniform over the work-group (main/sh are shared),
/// so no lane skips the LocalSum barrier while others enter it.
__kernel void DilatedCorrelation(__global const float* feature,
                                 __global const int* shifts,
                                 __global float* correlations,
                                 const int dimension
                                )
  {
   const size_t main = get_global_id(0);
   const size_t loc = get_local_id(1);
   const size_t sh = get_global_id(2);
   const size_t units = get_global_size(0);
   const size_t total_loc = get_local_size(1);
   const size_t total_corr = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- partner index: pairs of correlation channels share one shift value
   const int slave = main + shifts[sh >> 1] * ((sh & 1) ? -1 : 1);
   if(slave < 0 || slave >= units)
     {
      if(loc == 0)
         correlations[RCtoFlat(main, sh, units, total_corr, 0)] = 0;
      return;
     }
//--- tiled dot product over the feature dimension
   float result = 0.0f;
   for(int d = loc; d < dimension; d += total_loc)
     {
      float value_main = IsNaNOrInf(feature[RCtoFlat(main, d, units, dimension, 0)], 0);
      float value_slave = IsNaNOrInf(feature[RCtoFlat(slave, d, units, dimension, 0)], 0);
      result += IsNaNOrInf(value_main * value_slave, 0);
     }
   result = LocalSum(result, 1, temp);
//---
   if(loc == 0)
      correlations[RCtoFlat(main, sh, units, total_corr, 0)] = result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of DilatedCorrelation: corr[main, sh] = <f[main], f[slave]>,
/// so f_gr[id, d] receives corr_gr[id, sh] * f[slave, d] where id is the
/// anchor, plus corr_gr[main, sh] * f[main, d] for pairs where id is the
/// shifted partner (main = id - sign * offset). Correlation channels are
/// distributed over the local dimension; partial sums reduced by LocalSum.
__kernel void DilatedCorrelationGrad(__global const float* feature,
                                     __global float* feature_gr,
                                     __global const int* shifts,
                                     __global const float* corr_gr,
                                     const int total_corr
                                    )
  {
   const size_t id = get_global_id(0);
   const size_t loc = get_local_id(1);
   const size_t d = get_global_id(2);
   const size_t units = get_global_size(0);
   const size_t total_loc = get_local_size(1);
   const size_t dimension = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//---
   float result = 0.0f;
   for(int sh = loc; sh < total_corr; sh += total_loc)
     {
      const int offset = shifts[sh >> 1];
      const int sign = (sh & 1) ? -1 : +1;
      // id — main
      int slave = id + sign * offset;
      if(slave >= 0 && slave < units)
        {
         float g = corr_gr[RCtoFlat(id, sh, units, total_corr, 0)];
         result += IsNaNOrInf(g * feature[RCtoFlat(slave, d, units, dimension, 0)], 0.0f);
        }
      // id — slave
      int main = id - sign * offset;
      if(main >= 0 && main < units)
        {
         float g = corr_gr[RCtoFlat(main, sh, units, total_corr, 0)];
         result += IsNaNOrInf(g * feature[RCtoFlat(main, d, units, dimension, 0)], 0.0f);
        }
     }
   result = LocalSum(result, 1, temp);
   if(loc == 0)
      feature_gr[RCtoFlat(id, d, units, dimension, 0)] = result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Element-wise difference between an anchor element and a partner shifted
/// by shifts[sh]: diff[main, d, sh] = f[main, d] - f[main + shifts[sh], d].
/// Partners outside the sequence produce zero.
__kernel void DilatedDifference(__global const float* feature,
                                __global const int* shifts,
                                __global float* differences
                               )
  {
   const size_t main = get_global_id(0);
   const size_t sh = get_global_id(1);
   const size_t d = get_global_id(2);
   const size_t units = get_global_size(0);
   const size_t total_shifts = get_global_size(1);
   const size_t dimension = get_global_size(2);
//--- partner element for this shift channel
   const int slave = main + shifts[sh];
   const int out_pos = RCtoFlat(main, d, units, dimension, sh);
//--- out-of-range partners contribute zero
   if(slave < 0 || slave >= units)
     {
      differences[out_pos] = 0;
      return;
     }
//--- difference between the anchor and the shifted element
   const float anchor = IsNaNOrInf(feature[RCtoFlat(main, d, units, dimension, 0)], 0);
   const float shifted = IsNaNOrInf(feature[RCtoFlat(slave, d, units, dimension, 0)], 0);
   differences[out_pos] = anchor - shifted;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of DilatedDifference.
/// diff[m, d, sh] = f[m, d] - f[m + shifts[sh], d], so
///   f_gr[id, d] = sum_sh ( +diff_gr[id, d, sh]           where id is the anchor,
///                          -diff_gr[id - shifts[sh], d, sh] where id is the
///                           shifted element of the pair anchored at id - offset ).
/// Fix: the subtractive term previously read diff_gr at row id + offset
/// under the anchor's range check; it now reads the anchor row id - offset
/// with its own range check, mirroring DilatedCorrelationGrad.
__kernel void DilatedDifferenceGrad(__global const float* feature,
                                    __global float* feature_gr,
                                    __global const int* shifts,
                                    __global const float* differences_gr,
                                    const int total_shifts
                                   )
  {
   const size_t id = get_global_id(0);
   const size_t loc = get_local_id(1);
   const size_t d = get_global_id(2);
   const size_t units = get_global_size(0);
   const size_t total_loc = get_local_size(1);
   const size_t dimension = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//--- accumulate the contributions of the shift channels owned by this lane
   float result = 0.0f;
   for(int sh = loc; sh < total_shifts; sh += total_loc)
     {
      const int offset = shifts[sh];
      //--- id as the anchor: its partner id + offset must be in range
      int slave = id + offset;
      if(slave >= 0 && slave < units)
         result += IsNaNOrInf(differences_gr[RCtoFlat(id, d, units, dimension, sh)], 0.0f);
      //--- id as the shifted element of the pair anchored at id - offset
      int main = id - offset;
      if(main >= 0 && main < units)
         result -= IsNaNOrInf(differences_gr[RCtoFlat(main, d, units, dimension, sh)], 0.0f);
     }
//--- reduce partial sums across the work-group and store once
   result = LocalSum(result, 1, temp);
   if(loc == 0)
      feature_gr[RCtoFlat(id, d, units, dimension, 0)] = result;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Additive perturbation: out = inputs + perturb * perturb_mult,
/// element by element.
__kernel void PerturbedMatrix(__global const float* inputs,
                              __global const float* perturb,
                              __global float* output,
                              const float perturb_mult)
  {
   const size_t id = get_global_id(0);
   const size_t var = get_global_id(1);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
//--- base value plus scaled perturbation
   const int pos = RCtoFlat(var, id, variables, total, 0);
   const float base = inputs[pos];
   const float noise = perturb[pos];
   output[pos] = IsNaNOrInf(base + noise * perturb_mult, 0.0f);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of PerturbedMatrix:
/// d(out)/d(inputs) = 1, d(out)/d(perturb) = perturb_mult.
__kernel void PerturbedMatrixGrad(__global float* inputs_gr,
                                  __global float* perturb_gr,
                                  __global const float* output_gr,
                                  const float perturb_mult)
  {
   const size_t id = get_global_id(0);
   const size_t var = get_global_id(1);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
//--- pass the gradient straight through, scaled for the perturbation path
   const int pos = RCtoFlat(var, id, variables, total, 0);
   const float g = IsNaNOrInf(output_gr[pos], 0.0f);
   inputs_gr[pos] = g;
   perturb_gr[pos] = IsNaNOrInf(perturb_mult * g, 0.0f);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Linear upsampling: for every low-resolution sample id_ltr the segment
/// between the previous sample (0 before the sequence start) and the
/// current one is filled with dimension_htr interpolated points; the last
/// point lands exactly on the current sample.
__kernel void LinearUpsample(__global const float* data,
                             __global float* upsample)
  {
   const size_t id_ltr = get_global_id(0);
   const size_t var = get_global_id(1);
   const size_t id_htr = get_global_id(2);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
   const size_t dimension_htr = get_global_size(2);
//--- segment end points: previous sample (left) and current sample (right)
   const float right = IsNaNOrInf(data[RCtoFlat(id_ltr, var, total, variables, 0)], 0.0f);
   const float left = (id_ltr > 0 ? IsNaNOrInf(data[RCtoFlat(id_ltr - 1, var, total, variables, 0)], 0.0f) : 0.0f);
//--- interpolate inside the segment; the last point equals the sample
   float value;
   if(id_htr < (dimension_htr - 1))
     {
      const float t = (float)id_htr / (float)(dimension_htr - 1);
      value = t * (right - left) + left;
     }
   else
      value = right;
   upsample[RCtoFlat(id_htr, var, dimension_htr, variables, id_ltr)] = IsNaNOrInf(value, 0.0f);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of LinearUpsample. Sample id_ltr contributed to its own
/// segment with weight t = id_htr / (dimension_htr - 1) (weight 1 at the
/// segment end) and to the NEXT segment's interpolation with weight
/// (1 - t) as that segment's left end point. Both contribution streams are
/// tiled over the local dimension and reduced with LocalSum.
/// NOTE(review): LocalSum is called with second argument 2 here while most
/// callers pass 1 — confirm the helper's parameter semantics.
__kernel void LinearUpsampleGrad(__global float* data_gr,
                                 __global const float* upsample_gr,
                                 const int dimension_htr)
  {
   const size_t id_ltr = get_global_id(0);
   const size_t var = get_global_id(1);
   const size_t id_loc = get_local_id(2);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
   const size_t total_loc = get_local_size(2);
   float grad = 0.0f;
//---
   __local float temp[LOCAL_ARRAY_SIZE];
// --- main ltr: this sample is the right end point of its own segment
   {
      for(int id_htr = id_loc; id_htr < dimension_htr; id_htr += total_loc)
        {
         const float g =
            upsample_gr[RCtoFlat(id_htr, var, dimension_htr, variables, id_ltr)];
         if(id_htr < dimension_htr - 1)
           {
            const float t = (float)id_htr / (float)(dimension_htr - 1);
            grad += g * t;
           }
         else
            grad += g;
        }
   }
// --- prev ltr: this sample is the left end point of the next segment
   if(id_ltr + 1 < total)
      for(int id_htr = id_loc; id_htr < dimension_htr; id_htr += total_loc)
         if(id_htr < dimension_htr - 1)
           {
            const float g =
               upsample_gr[RCtoFlat(id_htr, var, dimension_htr, variables, id_ltr + 1)];
            const float t = (float)id_htr / (float)(dimension_htr - 1);
            grad += g * (1.0f - t);
           }
// --- reduce partial sums and store once per low-resolution sample
   grad = LocalSum(grad, 2, temp);
   if(id_loc == 0)
      data_gr[RCtoFlat(id_ltr, var, total, variables, 0)] = grad;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Turns one expert's raw 4-vector (mu, alpha, sigma, xi) into a scalar
/// prediction: out = mu + SoftPlus(alpha) * SoftPlus(sigma) * tanh(xi).
/// SoftPlus keeps the scale/spread factors non-negative; tanh bounds the
/// direction term to [-1, 1].
__kernel void MixExpertsPredict(__global const float4* __attribute__((aligned(16))) experts,
                                __global float* outputs
                               )
  {
   const size_t id = get_global_id(0);
//---
   const float4 e = experts[id];
   const float mu = IsNaNOrInf(e.s0, 0.0f);
   const float alpha = fActivation(e.s1, ActFunc_SoftPlus);
   const float sigma = fActivation(e.s2, ActFunc_SoftPlus);
   const float txi = fActivation(e.s3, ActFunc_TANH);
//---
   outputs[id] = IsNaNOrInf(mu + alpha * sigma * txi, 0.0f);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of MixExpertsPredict: distributes the output gradient over
/// the four expert components through the chain rule. The mu component
/// passes the gradient unchanged; the others go through the derivatives of
/// their forward activations via Deactivation().
__kernel void MixExpertsPredictGrad(__global const float4* __attribute__((aligned(16))) experts,
                                    __global float4* __attribute__((aligned(16))) experts_gr,
                                    __global const float* outputs_gr
                                   )
  {
   const size_t id = get_global_id(0);
//--- re-activate the forward values needed for the product-rule terms
   const float4 e = experts[id];
   const float alpha = fActivation(e.s1, ActFunc_SoftPlus);
   const float sigma = fActivation(e.s2, ActFunc_SoftPlus);
   const float txi = fActivation(e.s3, ActFunc_TANH);
//--- sanitized upstream gradient
   const float grad = IsNaNOrInf(outputs_gr[id], 0.0f);
//--- chain rule: out = mu + alpha * sigma * txi
   const float mu_gr = grad;
   const float alpha_gr = Deactivation(grad * sigma * txi, alpha, ActFunc_SoftPlus);
   const float sigma_gr = Deactivation(grad * alpha * txi, sigma, ActFunc_SoftPlus);
   const float txi_gr = Deactivation(grad * sigma * alpha, txi, ActFunc_TANH);
//---
   experts_gr[id] = (float4)(mu_gr, alpha_gr, sigma_gr, txi_gr);
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Multi-Head Feature Attention forward pass with a learned per-pair scale.
/// One work-group serves a single (query, head) pair; local dimension 1
/// enumerates keys. `kv` interleaves key rows (even offsets, 2*k_id) and
/// value rows (odd offsets, 2*k_id + 1). Writes the soft-maxed scores and
/// the attention-weighted value vector.
/// NOTE(review): LocalSoftMax/LocalSum are work-group collectives; every
/// work-item reaches them unconditionally — keep the control flow that way.
__kernel void MHFAT(__global const float *q,
__global const float *kv,
__global const float *scale,
__global float *scores,
__global float *out,
const int dimension,
const int mask_future
)
{
//--- work-item coordinates: query index, key index (local), head index
const int q_id = get_global_id(0);
const int k_id = get_local_id(1);
const int h_id = get_global_id(2);
const int total_q = get_global_size(0);
const int total_k = get_local_size(1);
const int total_heads = get_global_size(2);
//--- scratch for work-group reductions
__local float temp[LOCAL_ARRAY_SIZE];
//--- flat offsets; keys at even rows of kv, values at odd rows
const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id);
const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id + 1);
const int shift_s = RCtoFlat(h_id, k_id, total_heads, total_k, q_id);
const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
//--- raw score = scale * dot(q, k); masked or zero-scale pairs get
//--- MIN_VALUE so the softmax drives their weight to zero
float score = 0;
if(mask_future == 0 || q_id <= k_id)
{
float sc = IsNaNOrInf(scale[shift_s], 0.0f);
if(sc != 0)
{
for(int d = 0; d < dimension; d++)
{
float q_ = IsNaNOrInf(q[shift_q + d], 0.0f);
if(q_ == 0)
continue;
float k = IsNaNOrInf(kv[shift_k + d], 0.0f);
if(k == 0)
continue;
score += q_ * k;
}
score *= sc;
}
else
score = MIN_VALUE;
}
else
score = MIN_VALUE;
//--- softmax over the key dimension (work-group collective), store weight
score = LocalSoftMax(score, 1, temp);
scores[shift_s] = score;
//--- output = sum over keys of weight * value, one component per pass
for(int d = 0; d < dimension; d++)
{
float val = 0;
if(score > 0)
{
float v = IsNaNOrInf(kv[shift_v + d], 0);
if(v != 0)
val = v * score;
}
val = LocalSum(val, 1, temp);
if(k_id == 0)
out[shift_out + d] = val;
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of MHFAT. Produces gradients for queries (q_gr), keys and
/// values (kv_gr, interleaved as in the forward pass) and the learned scale
/// (scale_gr). The kernel runs three sections in sequence; the roles of
/// global_id and local_id change per section (see section comments).
/// Fix: the causal-mask test in the query-gradient section now compares the
/// query index against the actual key index k_id (= id + local_id), not
/// against local_id alone — matching the mask test used in every other
/// section of this kernel; the old form was only correct for id == 0.
__kernel void MHFATGrad(__global const float *q,
                        __global float *q_gr,
                        __global const float *kv,
                        __global float *kv_gr,
                        __global const float *scale,
                        __global float *scale_gr,
                        __global const float *scores,
                        __global const float *gradients,
                        const int dimension,
                        const int total_k,
                        const int mask_future
                       )
  {
//--- init
   const int global_id = get_global_id(0);
   const int local_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_global = get_global_size(0);
   const int total_local = get_local_size(1);
   const int total_heads = get_global_size(2);
//--- scratch for work-group reductions (collectives: call uniformly!)
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Value gradient: global_id -> v_id, local_id -> q_id
//--- dV = sum over queries of score * dOut
   for(int d = 0; d < dimension; d++)
     {
      for(int v_id = global_id; v_id < total_k; v_id += total_global)
        {
         float grad = 0;
         //--- value row of kv (odd rows carry values)
         const int shift_v = RCtoFlat(h_id, d, total_heads, dimension, 2 * v_id + 1);
         for(int q_id = 0; q_id < total_global; q_id += total_local)
           {
            const int shift_s = RCtoFlat(h_id, v_id, total_heads, total_k, q_id + local_id);
            const int shift_out = RCtoFlat(h_id, d, total_heads, dimension, q_id + local_id);
            if((q_id + local_id) < total_global)
               if(mask_future == 0 || (q_id + local_id) <= v_id)
                 {
                  float score = IsNaNOrInf(scores[shift_s], 0.0f);
                  if(score > 0)
                     grad += IsNaNOrInf(score * gradients[shift_out], 0.0f);
                 }
           }
         grad = LocalSum(grad, 1, temp);
         if(local_id == 0)
            kv_gr[shift_v] = grad;
        }
     }
//--- Query gradient: global_id -> q_id, local_id -> k_id/v_id
//--- also produces the scale gradient on the way (one per score cell)
   for(int d_q = 0; d_q < dimension; d_q++)
     {
      const int shift_q = RCtoFlat(h_id, d_q, total_heads, dimension, global_id);
      const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
      float grad = 0;
      for(int id = 0; id < total_k; id += total_local)
        {
         int k_id = id + local_id;
         const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id);
         const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id + 1);
         const int shift_s = RCtoFlat(h_id, k_id, total_heads, total_k, global_id);
         //--- 1. raw score gradient dS = dot(V, dOut) for unmasked pairs
         float grad_s = 0;
         float score = 0;
         float sc = 0;
         if(k_id < total_k)
           {
            if(mask_future == 0 || global_id <= k_id)  // fixed: compare to k_id, not local_id
               for(int d = 0; d < dimension; d++)
                 {
                  float val = IsNaNOrInf(kv[shift_v + d], 0);
                  if(val == 0.0f)
                     continue;
                  grad_s += IsNaNOrInf(kv[shift_v + d] * gradients[shift_out + d], 0);
                 }
            score = scores[shift_s];
            sc = IsNaNOrInf(scale[shift_s], 0.0f);
           }
         //--- 2. back through the softmax (work-group collective)
         grad_s = LocalSoftMaxGrad(score, grad_s, 1, temp);
         float grad_sc = LocalSum(score * grad_s, 1, temp);
         if(local_id == 0 && k_id < total_k)
           {
            //--- NOTE(review): log(scores) diverges as score -> 0; confirm
            //--- this scale-gradient formula against the forward definition
            if(sc != 0.0f)
               scale_gr[shift_s] = grad_sc * log(scores[shift_s]) / sc;
            else
               scale_gr[shift_s] = grad_sc;
           }
         grad_s *= sc;
         //--- 3. dQ += K * dS
         if(grad_s != 0.0f)
            if(mask_future == 0 || global_id <= k_id)
              {
               float key = IsNaNOrInf(kv[shift_k + d_q], 0.0f);
               if(key != 0.0f)
                  grad += key * grad_s;
              }
        }
      grad = LocalSum(grad, 1, temp);
      if(local_id == 0)
         q_gr[shift_q] = grad;
     }
//--- Key gradient: global_id -> k_id, local_id -> score_id/v_id/dimension
   for(int k_id = global_id; k_id < total_k; k_id += total_global)
     {
      const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id);
      const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 2 * local_id + 1);
      const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, local_id);
      float grad = 0;
      for(int q_id = 0; q_id < total_global; q_id++)
        {
         //--- 1. raw score gradient, local_id -> score_id/v_id
         float grad_s = 0;
         const int shift_s = RCtoFlat(h_id, local_id, total_heads, total_k, q_id);
         int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
         float score = 0;
         float sc = 0;
         if(local_id < total_k)
           {
            if(mask_future == 0 || q_id <= local_id)
               for(int d = 0; d < dimension; d++)
                 {
                  float val = IsNaNOrInf(kv[shift_v + d], 0);
                  if(val == 0)
                     continue;
                  grad_s += IsNaNOrInf(val * gradients[shift_q + d], 0);
                 }
            score = scores[shift_s];
            sc = IsNaNOrInf(scale[shift_s], 0.0f);
           }
         //--- 2. back through the softmax (work-group collective)
         grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
         grad_s *= sc;
         //--- 3. dK += Q * dS, local_id -> dimension component
         if(local_id < dimension)
           {
            float query = IsNaNOrInf(q[shift_q + local_id], 0);
            if(query != 0)
               grad += IsNaNOrInf(query * grad_s, 0);
           }
        }
      if(local_id < dimension)
         kv_gr[shift_k + local_id] = IsNaNOrInf(grad, 0);
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Gather-and-scale: for every sparse cell, looks up the row of `full`
/// named by sparse_index, multiplies it element-wise by the matching
/// sparse_data weight and stores the result. Indices outside
/// [0, full_rows) yield zero instead of reading out of bounds.
__kernel void SparseConcatenate(__global const float *sparse_index,
                                __global const float *sparse_data,
                                __global const float *full,
                                __global float *result,
                                const int full_rows
                               )
  {
   const size_t sparse_row = get_global_id(0);
   const size_t sparse_col = get_global_id(1);
   const size_t full_col = get_global_id(2);
   const size_t sparse_rows = get_global_size(0);
   const size_t sparse_cols = get_global_size(1);
   const size_t full_cols = get_global_size(2);
//--- locate the sparse cell and the row of `full` it points to
   const int shift_sparse = RCtoFlat(sparse_row, sparse_col, sparse_rows, sparse_cols, 0);
   const int full_row = sparse_index[shift_sparse];
   const int shift_out = RCtoFlat(sparse_col, full_col, sparse_cols, full_cols, sparse_row);
//--- invalid row index -> zero contribution
   float value = 0.0f;
   if(full_row >= 0 && full_row < full_rows)
     {
      const int shift_full = RCtoFlat(full_row, full_col, full_rows, full_cols, 0);
      value = IsNaNOrInf(sparse_data[shift_sparse] * full[shift_full], 0.0f);
     }
   result[shift_out] = value;
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of SparseConcatenate: computes gradients for the sparse
/// weights (sparse_gr) and for the dense matrix (full_gr). local_id spans
/// the work-group used for the column reductions; the shared buffer Temp is
/// used both to broadcast the gathered row index and as LocalSum scratch.
__kernel void SparseConcatenateGrad(__global const float *sparse_index,
__global const float *sparse_data,
__global float *sparse_gr,
__global const float *full,
__global float *full_gr,
__global const float *result_gr,
const int sparse_rows,
const int sparse_cols,
const int full_rows,
const int full_cols
)
{
const size_t row_id = get_global_id(0);
const size_t local_id = get_local_id(1);
const size_t col_id = get_global_id(2);
const size_t total_rows = get_global_size(0);
const size_t total_local = get_local_size(1);
const size_t total_cols = get_global_size(2);
//--- shared scratch: broadcast slot Temp[0], then LocalSum workspace
__local float Temp[LOCAL_ARRAY_SIZE];
//--- Compute sparse gradient: dW = dot(result_gr row, gathered full row)
if(row_id < sparse_rows && col_id < sparse_cols)
{
float grad = 0;
int shift_sparse = 0;
if(local_id == 0)
{
shift_sparse = RCtoFlat(row_id, col_id, sparse_rows, sparse_cols, 0);
Temp[0] = sparse_index[shift_sparse];
}
BarrierLoc
//--- every work-item reads the broadcast row index
//--- NOTE(review): LocalSum below reuses Temp; this is race-free only if
//--- LocalSum issues a barrier before its first write — verify its body
uint full_row = (uint)Temp[0];
if(full_row < (uint)full_rows)
for(int i = local_id; i < full_cols; i += total_local)
{
int shift_out = RCtoFlat(col_id, i, sparse_cols, full_cols, row_id);
int shift_full = RCtoFlat(full_row, i, full_rows, full_cols, 0);
grad += IsNaNOrInf(result_gr[shift_out] * full[shift_full], 0.0f);
}
grad = LocalSum(grad, 1, Temp);
//--- shift_sparse is only valid on local_id == 0, which also does the write
if(local_id == 0)
sparse_gr[shift_sparse] = grad;
}
//--- Compute full gradient: for each sparse row, only the first column
//--- (per work-item stride) whose index matches row_id contributes
if(row_id < full_rows && col_id < full_cols)
{
float grad = 0;
for(int r = 0; r < sparse_rows; r ++)
{
float s = 0;
for(int c = local_id; c < sparse_cols; c += total_local)
{
int shift_sparse = RCtoFlat(r, c, sparse_rows, sparse_cols, 0);
if((uint)sparse_index[shift_sparse] == (uint)row_id)
{
s = sparse_data[shift_sparse];
int shift_out = RCtoFlat(c, col_id, sparse_cols, full_cols, r);
grad += IsNaNOrInf(s * result_gr[shift_out], 0.0f);
break; // NOTE(review): stops at the first match in this work-item's stride
}
}
}
grad = LocalSum(grad, 1, Temp);
if(local_id == 0)
{
int shift_full = RCtoFlat(row_id, col_id, full_rows, full_cols, 0);
full_gr[shift_full] = grad;
}
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Multi-head flash attention forward pass with an online (streaming)
/// softmax: keys are processed in work-group-sized tiles while a running
/// maximum (prev_max) and a rescaled running sum of exponentials (sumexp)
/// are maintained, so scores never need a second pass. The per-query
/// log-sum-exp is stored for the backward kernel.
/// `key_value` packs keys in the first total_heads row band and values in
/// the second; causal masking is applied when mask_future != 0.
/// Fix: the key-tile loop now starts at id = 0 (it previously started at
/// local_id while the body already adds local_id via k_id = id + local_id,
/// which double-offset the keys and gave work-items different trip counts,
/// diverging at the LocalMax/LocalSum barriers). This matches the
/// structurally identical loop in MHFlashSTCA.
__kernel void MHFlashAttention(__global const float *query,
                               __global const float *key_value,
                               __global float *logsumexp,
                               __global float *output,
                               const int dimension,
                               const int total_kv,
                               const int mask_future
                              )
  {
//--- init
   const int q_id = get_global_id(0);
   const int local_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_q = get_global_size(0);
   const int total_loc = get_local_size(1);
   const int total_heads = get_global_size(2);
//--- scratch for work-group collectives
   __local float temp[LOCAL_ARRAY_SIZE];
   __local float4 temp4[LOCAL_ARRAY_SIZE];
//---
   const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
   float prev_max = MIN_VALUE;
   float sumexp = 0;
   float out = 0;
//--- stream over keys in tiles of total_loc; each work-item takes one key
   for(int id = 0; id < total_kv; id += total_loc)
     {
      int k_id = id + local_id;
      const int shift_k = RCtoFlat(h_id, 0, 2 * total_heads, dimension, k_id);
      const int shift_v = RCtoFlat(h_id + total_heads, 0, 2 * total_heads, dimension, k_id);
      //--- raw score = dot(q, k) / sqrt(dim); masked/out-of-range -> MIN_VALUE
      float score = 0;
      if(k_id < total_kv && (mask_future == 0 || q_id <= k_id))
        {
         for(int d = 0; d < dimension; d += 4)
           {
            float4 q = IsNaNOrInf4((float4)(
                                      (d < dimension ? query[shift_q + d] : 0.0f),
                                      ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
                                      ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
                                      ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
                                   ), 0.0f);
            float4 k = IsNaNOrInf4((float4)(
                                      (d < dimension ? key_value[shift_k + d] : 0.0f),
                                      ((d + 1) < dimension ? key_value[shift_k + d + 1] : 0.0f),
                                      ((d + 2) < dimension ? key_value[shift_k + d + 2] : 0.0f),
                                      ((d + 3) < dimension ? key_value[shift_k + d + 3] : 0.0f)
                                   ), 0.0f);
            score += IsNaNOrInf(dot(q, k), 0.0f);
           }
         score /= sqrt((float)dimension);
        }
      else
         score = MIN_VALUE;
      //--- online softmax: fold the tile into the running max/sum
      float max = fmax(prev_max, LocalMax(score, 1, temp));
      if(score > MIN_VALUE)
         score = exp(score - max);
      else
         score = 0.0f;
      if(sumexp == 0.0f)
         sumexp = LocalSum(score, 1, temp);
      else
         sumexp = IsNaNOrInf(exp(prev_max - max) * sumexp + LocalSum(score, 1, temp), 0.0f);
      //--- accumulate weighted values; each work-item keeps one out component
      for(int d = 0; d < dimension; d += 4)
        {
         float4 val = (float4)0.0f;
         if(score > 0.0f && k_id < total_kv)
           {
            float4 v = (float4)(
                          (d < dimension ? key_value[shift_v + d] : 0.0f),
                          ((d + 1) < dimension ? key_value[shift_v + d + 1] : 0.0f),
                          ((d + 2) < dimension ? key_value[shift_v + d + 2] : 0.0f),
                          ((d + 3) < dimension ? key_value[shift_v + d + 3] : 0.0f)
                       );
            val = IsNaNOrInf4(v * score, 0.0f);
           }
         val = LocalSum4(val, 1, temp4);
         int idx = local_id - d;
         if(idx >= 0 && idx < 4)
           {
            if(out != 0.0f)
               out = IsNaNOrInf(exp(prev_max - max) * out + val[idx], 0.0f);
            else
               out = val[idx];
           }
        }
      prev_max = max;
     }
//--- finalize: divide by the softmax normalizer, store log-sum-exp
   if(local_id < dimension)
     {
      if(sumexp > 0.0f)
         output[shift_q + local_id] = IsNaNOrInf(out / sumexp, 0.0f);
      else
         output[shift_q + local_id] = 0.0f;
     }
   if(local_id == 0)
     {
      int shift_logse = RCtoFlat(q_id, h_id, total_q, total_heads, 0);
      if(sumexp > 0.0f)
         logsumexp[shift_logse] = IsNaNOrInf(prev_max + log(sumexp), 0.0f);
      else
         logsumexp[shift_logse] = 0.0f;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of MHFlashAttention. Recomputes per-pair softmax weights
/// p = exp(s - logsumexp) from the stored log-sum-exp instead of storing the
/// full score matrix, then applies the standard attention gradients:
/// dQ = sum_k K * ds, dK = sum_q Q * ds, dV = sum_q p * dOut, with
/// ds = p * (dot(dOut, V) - dot(dOut, Out)). local dimension 1 carries the
/// feature component d_id and is reduced with LocalSum collectives.
/// NOTE(review): the `continue` statements skip later LocalSum calls; this
/// is barrier-safe only if p is uniform across the work-group — s and lse
/// appear work-group-uniform here (both come from collectives / per-query
/// values), but verify against LocalSum's implementation.
__kernel void MHFlashAttentionGrad(__global const float *query,
__global float *query_gr,
__global const float *key_value,
__global float *key_value_gr,
__global const float *logsumexp,
__global const float *output,
__global const float *output_gr,
const int dimension,
const int total_q,
const int total_kv,
const int mask_future
)
{
const int id = get_global_id(0);
const int d_id = get_local_id(1);
const int h_id = get_global_id(2);
const int total_heads = get_global_size(2);
__local float temp[LOCAL_ARRAY_SIZE];
//--- Query gradient: dQ[q, d] = sum over keys of K[k, d] * ds
if(id < total_q)
{
const int q_id = id;
const int shift_q = RCtoFlat(h_id, d_id, total_heads, dimension, q_id);
const int shift_lse = RCtoFlat(q_id, h_id, total_q, total_heads, 0);
const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f);
const float q_d = IsNaNOrInf(query[shift_q], 0.0f);
const float g_d = IsNaNOrInf(output_gr[shift_q], 0.0f);
const float o_d = IsNaNOrInf(output[shift_q], 0.0f);
//--- D = dot(dOut, Out): shared subtrahend of every ds for this query
const float D = LocalSum(IsNaNOrInf(g_d * o_d, 0.0f), 1, temp);
float grad_q = 0.0f;
for(int k_id = 0; k_id < total_kv; k_id++)
{
if(mask_future != 0 && q_id > k_id)
continue;
const int shift_k = RCtoFlat(h_id, d_id, 2 * total_heads, dimension, k_id);
const int shift_v = RCtoFlat(h_id + total_heads, d_id, 2 * total_heads, dimension, k_id);
const float k_d = IsNaNOrInf(key_value[shift_k], 0.0f);
const float v_d = IsNaNOrInf(key_value[shift_v], 0.0f);
//--- recompute score and softmax weight p from the stored log-sum-exp
const float s = LocalSum(IsNaNOrInf(q_d * k_d, 0.0f), 1, temp) / sqrt((float)dimension);
const float p = IsNaNOrInf(exp(clamp(s - lse, -120.0f, 0.0f)), 0.0f);
if(p == 0.0f)
continue;
const float dp = LocalSum(IsNaNOrInf(g_d * v_d, 0.0f), 1, temp);
const float ds = IsNaNOrInf(p * (dp - D), 0.0f);
grad_q += IsNaNOrInf(k_d * ds, 0.0f);
}
query_gr[shift_q] = IsNaNOrInf(grad_q, 0.0f);
}
//--- Key & Value gradients: dK[k, d] = sum_q Q*ds, dV[k, d] = sum_q p*dOut
if(id < total_kv)
{
const int k_id = id;
const int shift_k = RCtoFlat(h_id, d_id, 2 * total_heads, dimension, k_id);
const int shift_v = RCtoFlat(h_id + total_heads, d_id, 2 * total_heads, dimension, k_id);
const float k_d = IsNaNOrInf(key_value[shift_k], 0.0f);
const float v_d = IsNaNOrInf(key_value[shift_v], 0.0f);
float grad_k = 0.0f;
float grad_v = 0.0f;
for(int q_id = 0; q_id < total_q; q_id++)
{
if(mask_future != 0 && q_id > k_id)
continue;
const int shift_q = RCtoFlat(h_id, d_id, total_heads, dimension, q_id);
const int shift_lse = RCtoFlat(q_id, h_id, total_q, total_heads, 0);
const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f);
const float q_d = IsNaNOrInf(query[shift_q], 0.0f);
const float g_d = IsNaNOrInf(output_gr[shift_q], 0.0f);
const float o_d = IsNaNOrInf(output[shift_q], 0.0f);
const float D = LocalSum(IsNaNOrInf(g_d * o_d, 0.0f), 1, temp);
const float s = LocalSum(IsNaNOrInf(q_d * k_d, 0.0f), 1, temp) / sqrt((float)dimension);
const float p = IsNaNOrInf(exp(clamp(s - lse, -120.0f, 0.0f)), 0.0f);
if(p == 0.0f)
continue;
const float dp = LocalSum(IsNaNOrInf(g_d * v_d, 0.0f), 1, temp);
const float ds = IsNaNOrInf(p * (dp - D), 0.0f);
grad_k += IsNaNOrInf(q_d * ds, 0.0f);
grad_v += IsNaNOrInf(p * g_d, 0.0f);
}
key_value_gr[shift_k] = IsNaNOrInf(grad_k, 0.0f);
key_value_gr[shift_v] = IsNaNOrInf(grad_v, 0.0f);
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Multi-head flash self-attention over a shared tensor X that serves as
/// both keys and values (Single-Tensor Cross Attention). Uses the same
/// online softmax as MHFlashAttention: tiles of total_loc elements of X are
/// streamed while a running maximum and a rescaled running sum of
/// exponentials are maintained; the per-query log-sum-exp is stored for the
/// backward kernel. Causal masking applies when mask_future != 0.
/// Cleanup: removed the unused local `add` from the value-accumulation loop.
__kernel void MHFlashSTCA(__global const float *query,
                          __global const float *X,
                          __global float *logsumexp,
                          __global float *output,
                          const int dimension,
                          const int total_X,
                          const int mask_future
                         )
  {
//--- init
   const int q_id = get_global_id(0);
   const int local_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_q = get_global_size(0);
   const int total_loc = get_local_size(1);
   const int total_heads = get_global_size(2);
//--- scratch for work-group collectives
   __local float temp[LOCAL_ARRAY_SIZE];
   __local float4 temp4[LOCAL_ARRAY_SIZE];
//---
   const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
   float prev_max = MIN_VALUE;
   float sumexp = 0;
   float out = 0;
//--- stream over X in tiles of total_loc; each work-item takes one element
   for(int id = 0; id < total_X; id += total_loc)
     {
      int x_id = id + local_id;
      const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0);
      //--- raw score = dot(q, x) / sqrt(dim); masked/out-of-range -> MIN_VALUE
      float score = 0;
      if(x_id < total_X && (mask_future == 0 || q_id <= x_id))
        {
         for(int d = 0; d < dimension; d += 4)
           {
            float4 q = IsNaNOrInf4((float4)(
                                      (d < dimension ? query[shift_q + d] : 0.0f),
                                      ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
                                      ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
                                      ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
                                   ), 0.0f);
            float4 k = IsNaNOrInf4((float4)(
                                      (d < dimension ? X[shift_x + d] : 0.0f),
                                      ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
                                      ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
                                      ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
                                   ), 0.0f);
            score += IsNaNOrInf(dot(q, k), 0.0f);
           }
         score /= sqrt((float)dimension);
        }
      else
         score = MIN_VALUE;
      //--- online softmax: fold the tile into the running max/sum
      float max = fmax(prev_max, LocalMax(score, 1, temp));
      if(score > MIN_VALUE)
         score = exp(score - max);
      else
         score = 0.0f;
      if(sumexp == 0.0f)
         sumexp = LocalSum(score, 1, temp);
      else
         sumexp = IsNaNOrInf(exp(prev_max - max) * sumexp, 0.0f) + LocalSum(score, 1, temp);
      //--- accumulate weighted X rows; each work-item keeps one out component
      for(int d = 0; d < dimension; d += 4)
        {
         float4 val = (float4)0.0f;
         if(score > 0.0f && x_id < total_X)
           {
            float4 v = IsNaNOrInf4((float4)(
                                      (d < dimension ? X[shift_x + d] : 0.0f),
                                      ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
                                      ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
                                      ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
                                   ), 0.0f);
            val = IsNaNOrInf4(v * score, 0.0f);
           }
         val = LocalSum4(val, 1, temp4);
         int idx = local_id - d;
         if(idx >= 0 && idx < 4)
           {
            if(out != 0.0f)
               out = IsNaNOrInf(exp(prev_max - max) * out + val[idx], 0.0f);
            else
               out = val[idx];
           }
        }
      prev_max = max;
     }
//--- finalize: divide by the softmax normalizer, store log-sum-exp
   if(local_id < dimension)
     {
      if(sumexp > 0.0f)
         output[shift_q + local_id] = IsNaNOrInf(out / sumexp, 0.0f);
      else
         output[shift_q + local_id] = 0.0f;
     }
   if(local_id == 0)
     {
      int shift_logse = RCtoFlat(q_id, h_id, total_q, total_heads, 0);
      if(sumexp > 0.0f)
         logsumexp[shift_logse] = IsNaNOrInf(prev_max + log(sumexp), 0.0f);
      else
         logsumexp[shift_logse] = 0.0f;
     }
  }
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
/// Backward pass of MHFlashSTCA. Recomputes softmax weights
/// p = exp(score - logsumexp) from the stored per-query log-sum-exp and
/// applies the attention gradients. Because X serves as both key and value,
/// dX receives two terms per (query, head): the key path q_d * ds and the
/// direct value path g_d * p.
/// Fix: in the X-gradient section the softmax weight was recomputed into a
/// shadowing `const float p` inside the conditional block, so the outer `p`
/// used in the `g_d * p` accumulation term stayed 0 and the value-path
/// gradient was silently dropped. The recomputed weight is now assigned to
/// the outer `p`.
__kernel void MHFlashSTCAGrad(__global const float *query,
                              __global float *query_gr,
                              __global const float *X,
                              __global float *X_gr,
                              __global const float *logsumexp,
                              __global const float *output,
                              __global const float *output_gr,
                              const int dimension,
                              const int total_q,
                              const int total_X,
                              const int mask_future
                             )
  {
   const int id = get_global_id(0);
   const int local_id = get_local_id(1);
   const int h_id = get_global_id(2);
   const int total_loc = get_local_size(1);
   const int total_heads = get_global_size(2);
//--- scratch for work-group collectives
   __local float temp[LOCAL_ARRAY_SIZE];
   __local float4 temp4[LOCAL_ARRAY_SIZE];
//--- Query gradient: dQ[q, d] = sum over X rows of X[x, d] * ds
   if(id < total_q)
     {
      float grad_q = 0.0f;
      const int q_id = id;
      const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
      const int shift_lse = RCtoFlat(q_id, h_id, total_q, total_heads, 0);
      const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f);
      //--- D = dot(dOut, Out): shared subtrahend of every ds for this query
      float D = 0;
      for(int d = 0; d < dimension; d += 4)
        {
         float4 g_d = IsNaNOrInf4((float4)(
                                     (d < dimension ? output_gr[shift_q + d] : 0.0f),
                                     ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f),
                                     ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f),
                                     ((d + 3) < dimension ? output_gr[shift_q + d + 3] : 0.0f)
                                  ), 0.0f);
         float4 o_d = IsNaNOrInf4((float4)(
                                     (d < dimension ? output[shift_q + d] : 0.0f),
                                     ((d + 1) < dimension ? output[shift_q + d + 1] : 0.0f),
                                     ((d + 2) < dimension ? output[shift_q + d + 2] : 0.0f),
                                     ((d + 3) < dimension ? output[shift_q + d + 3] : 0.0f)
                                  ), 0.0f);
         D += IsNaNOrInf(dot(g_d, o_d), 0.0f);
        }
      for(int l_id = 0; l_id < total_X; l_id += total_loc)
        {
         int x_id = l_id + local_id;
         float ds = 0;
         if(x_id < total_X && (mask_future == 0 || q_id <= x_id))
           {
            const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0);
            //--- recompute score and dp = dot(dOut, X row)
            float score = 0;
            float dp = 0;
            for(int d = 0; d < dimension; d += 4)
              {
               float4 q_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? query[shift_q + d] : 0.0f),
                                           ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
                                        ), 0.0f);
               float4 x_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? X[shift_x + d] : 0.0f),
                                           ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
                                        ), 0.0f);
               score += IsNaNOrInf(dot(q_d, x_d), 0.0f);
               float4 g_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? output_gr[shift_q + d] : 0.0f),
                                           ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? output_gr[shift_q + d + 3] : 0.0f)
                                        ), 0.0f);
               dp += IsNaNOrInf(dot(g_d, x_d), 0.0f);
              }
            score /= sqrt((float)dimension);
            const float p = IsNaNOrInf(exp(clamp(score - lse, -120.0f, 0.0f)), 0.0f);
            ds = IsNaNOrInf(p * (dp - D), 0.0f);
           }
         //--- reduce X[x] * ds across the tile; each work-item keeps one component
         for(int d = 0; d < dimension; d += 4)
           {
            float4 x_d = (float4)0;
            if(x_id < total_X && (mask_future == 0 || q_id <= x_id))
              {
               const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0);
               x_d = IsNaNOrInf4((float4)(
                                    (d < dimension ? X[shift_x + d] : 0.0f),
                                    ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
                                    ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
                                    ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
                                 ), 0.0f);
              }
            float4 q_dg = LocalSum4(x_d * ds, 1, temp4);
            int idx = local_id - d;
            if(idx >= 0 && idx < 4)
               grad_q += q_dg[idx];
           }
        }
      if(local_id < dimension)
         query_gr[shift_q + local_id] = IsNaNOrInf(grad_q, 0.0f);
     }
//--- X gradient: dX[x, d] = sum over (query, head) of q_d * ds + g_d * p
   if(id < total_X && h_id == 0)
     {
      float grad_X = 0.0f;
      const int x_id = id;
      const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0);
      for(int l_id = 0; l_id < total_q * total_heads; l_id += total_loc)
        {
         int loc = l_id + local_id;
         int h = loc / total_q;
         int q_id = loc % total_q;
         float ds = 0;
         float p = 0;
         if(h < total_heads && q_id < total_q &&
            (mask_future == 0 || q_id <= x_id))
           {
            const int shift_lse = RCtoFlat(q_id, h, total_q, total_heads, 0);
            const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f);
            const int shift_q = RCtoFlat(h, 0, total_heads, dimension, q_id);
            float score = 0;
            float D = 0;
            float dp = 0;
            for(int d = 0; d < dimension; d += 4)
              {
               float4 q_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? query[shift_q + d] : 0.0f),
                                           ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
                                        ), 0.0f);
               float4 x_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? X[shift_x + d] : 0.0f),
                                           ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
                                        ), 0.0f);
               score += IsNaNOrInf(dot(q_d, x_d), 0.0f);
               float4 g_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? output_gr[shift_q + d] : 0.0f),
                                           ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? output_gr[shift_q + d + 3] : 0.0f)
                                        ), 0.0f);
               dp += IsNaNOrInf(dot(g_d, x_d), 0.0f);
               float4 o_d = IsNaNOrInf4((float4)(
                                           (d < dimension ? output[shift_q + d] : 0.0f),
                                           ((d + 1) < dimension ? output[shift_q + d + 1] : 0.0f),
                                           ((d + 2) < dimension ? output[shift_q + d + 2] : 0.0f),
                                           ((d + 3) < dimension ? output[shift_q + d + 3] : 0.0f)
                                        ), 0.0f);
               D += IsNaNOrInf(dot(g_d, o_d), 0.0f);
              }
            //--- fixed: assign to the outer p (no shadowing) so the value
            //--- path g_d * p below actually contributes
            p = IsNaNOrInf(exp(clamp(score - lse, -120.0f, 0.0f)), 0.0f);
            if(p != 0.0f)
               ds = IsNaNOrInf(p * (dp - D), 0.0f);
           }
         //--- reduce q_d * ds + g_d * p across the tile
         for(int d = 0; d < dimension; d += 4)
           {
            float4 q_d = (float4)0;
            float4 g_d = (float4)0;
            if(h < total_heads && q_id < total_q &&
               (mask_future == 0 || q_id <= x_id))
              {
               const int shift_q = RCtoFlat(h, 0, total_heads, dimension, q_id);
               q_d = IsNaNOrInf4((float4)(
                                    (d < dimension ? query[shift_q + d] : 0.0f),
                                    ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
                                    ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
                                    ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
                                 ), 0.0f);
               g_d = IsNaNOrInf4((float4)(
                                    (d < dimension ? output_gr[shift_q + d] : 0.0f),
                                    ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f),
                                    ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f),
                                    ((d + 3) < dimension ? output_gr[shift_q + d + 3] : 0.0f)
                                 ), 0.0f);
              }
            float4 x_dg = LocalSum4(q_d * ds + g_d * p, 1, temp4);
            int idx = local_id - d;
            if(idx >= 0 && idx < 4)
               grad_X += x_dg[idx];
           }
        }
      if(local_id < dimension)
         X_gr[shift_x + local_id] = IsNaNOrInf(grad_X, 0.0f);
     }
  }
//+------------------------------------------------------------------+