/// \file
/// \brief NeuroNet.cl
/// Library of OpenCL kernels
/// \author DNG
/// \copyright Copyright 2019, DNG
//---
//--- by default some GPUs don't support double precision
//--- the cl_khr_fp64 directive is used to enable work with doubles
// #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define l1 1.0e-4f
#define l2 1.0e-4f
#define MAX_GRAD 1.0e-2f
#define LOCAL_ARRAY_SIZE 64
#define MAX_VALUE 3.4e37f
#define MIN_VALUE -MAX_VALUE
//--- Activation Functions
#define ActFunc_None -1
#define ActFunc_TANH 0
#define ActFunc_SIGMOID 1
#define ActFunc_LReLU 2
#define ActFunc_SoftPlus 3
#define ActFunc_GELU 4
#define ActFunc_MinusSoftPlus 5
#define ActFunc_ELU 6
//---
#define BarrierLoc barrier(CLK_LOCAL_MEM_FENCE);
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
inline float IsNaNOrInf(const float value, const float def_value)
  {
   if(isnan(value) || isinf(value))
      return def_value;
   return value;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
inline float2 IsNaNOrInf2(const float2 value, const float2 def_value)
  {
   if(isnan(value.x) || isinf(value.x) || isnan(value.y) || isinf(value.y))
      return def_value;
   return value;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
inline float4 IsNaNOrInf4(const float4 value, const float def_value)
  {
   float4 result;
   result.s0 = IsNaNOrInf(value.s0, def_value);
   result.s1 = IsNaNOrInf(value.s1, def_value);
   result.s2 = IsNaNOrInf(value.s2, def_value);
   result.s3 = IsNaNOrInf(value.s3, def_value);
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float fActivation(const float value, const int function)
  {
   float result = IsNaNOrInf(value, 0);
   switch(function)
     {
      case ActFunc_TANH:
         result = tanh(clamp(result, -20.0f, 20.0f));
         break;
      case ActFunc_SIGMOID: //Sigmoid
         result = 1 / (1 + exp(clamp(-result, -20.0f, 20.0f)));
         break;
      case ActFunc_LReLU: //LReLU
         if(result < 0)
            result *= 0.01f;
         break;
      case ActFunc_SoftPlus: //SoftPlus
         result = (result >= 20.0f ? result : IsNaNOrInf(log(1 + exp(result)), 0));
         break;
      case ActFunc_GELU: //GELU
         result = result / (1 + exp(clamp(-1.702f * result, -20.0f, 20.0f)));
         break;
      case ActFunc_MinusSoftPlus: // -SoftPlus
         result = -fActivation(result, ActFunc_SoftPlus);
         break;
      case ActFunc_ELU: //ELU
         if(result < 0)
            result = IsNaNOrInf(exp(result), 0) - 1;
         break;
      default:
         break;
     }
//---
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float Deactivation(const float grad, const float inp_value, const int function)
  {
   float result = IsNaNOrInf(grad, 0);
//---
   if(isnan(inp_value) || isinf(inp_value))
      result = 0;
   else
      switch(function)
        {
         case ActFunc_TANH: //TANH
            result = clamp(grad + inp_value, -1.0f, 1.0f) - inp_value;
            result *= 1.0f - inp_value * inp_value;
            break;
         case ActFunc_SIGMOID: //Sigmoid
            result = clamp(grad + inp_value, 0.0f, 1.0f) - inp_value;
            result *= inp_value * (1.0f - inp_value);
            break;
         case ActFunc_LReLU: //LReLU
            if(inp_value < 0)
               result *= 0.01f;
            break;
         case ActFunc_SoftPlus: //SoftPlus
            result *= (1.0f - exp(-inp_value));
            break;
         case ActFunc_GELU: //GELU
            if(inp_value < 0.9f)
               result *= fActivation(5 * inp_value, ActFunc_SIGMOID);
            break;
         case ActFunc_MinusSoftPlus: // -SoftPlus
            result = Deactivation(-result, -inp_value, ActFunc_SoftPlus);
            break;
         case ActFunc_ELU: //ELU
            if(inp_value < 0)
               result *= inp_value + 1;
            break;
         default:
            break;
        }
//---
   return clamp(IsNaNOrInf(result, 0), -MAX_GRAD, MAX_GRAD);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
inline int RCtoFlat(const int row, const int col, const int total_rows, const int total_cols, const int variable)
  {
   return (variable * total_rows + row) * total_cols + col;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float LocalMax(const float value, const int loc, __local float* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
   float val = IsNaNOrInf(value, MIN_VALUE);
//--- Look Max
   if(id < ls)
      Temp[id] = val;
   BarrierLoc
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls) && (Temp[id - d] < val))
         Temp[id - d] = val;
      BarrierLoc
     }
//---
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls && Temp[id] < Temp[id + count])
         Temp[id] = Temp[id + count];
      BarrierLoc
     }
   while(count > 1);
//---
   return Temp[0];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float LocalMin(const float value, const int loc, __local float* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
   float val = IsNaNOrInf(value, MAX_VALUE);
//--- Look Min
   if(id < ls)
      Temp[id] = val;
   BarrierLoc
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls) && (Temp[id - d] > val))
         Temp[id - d] = val;
      BarrierLoc
     }
//---
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls && Temp[id] > Temp[id + count])
         Temp[id] = Temp[id + count];
      BarrierLoc
     }
   while(count > 1);
//---
   return Temp[0];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float
LocalSum(const float value, const int loc, __local float* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   if(total <= 1)
      return IsNaNOrInf(value, 0.0f);
//---
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
//--- Sum
   float result = IsNaNOrInf(value, 0);
   if(id < ls)
      Temp[id] = result;
   BarrierLoc
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls))
         Temp[id - d] = Temp[id - d] + result;
      BarrierLoc
     }
//---
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls)
        {
         Temp[id] += Temp[id + count];
         Temp[id + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
   result = IsNaNOrInf(Temp[0], 0);
//---
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float4 LocalSum4(const float4 value, const int loc, __local float4* Temp)
  {
   const size_t id = get_local_id(loc);
   const size_t total = get_local_size(loc);
//---
   if(total <= 1)
      return IsNaNOrInf4(value, 0.0f);
//---
   const uint ls = min((uint)total, (uint)LOCAL_ARRAY_SIZE);
//---
   float4 result = IsNaNOrInf4(value, 0.0f);
   if(id < ls)
      Temp[id] = result;
   BarrierLoc
   for(int d = ls; d < total; d += ls)
     {
      if(id >= d && id < (d + ls))
         Temp[id - d] = Temp[id - d] + result;
      BarrierLoc
     }
//---
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(id < count && (id + count) < ls)
        {
         Temp[id] += Temp[id + count];
         Temp[id + count] = (float4)0.0f;
        }
      BarrierLoc
     }
   while(count > 1);
//---
   result = IsNaNOrInf4(Temp[0], 0.0f);
//---
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float LocalSoftMax(const float value, const int loc, __local float* Temp)
  {
//--- Look Max
   float max = LocalMax(value, loc, Temp);
   if(max == MIN_VALUE)
      return 0.0f;
//--- SoftMax
   float result = (value == MIN_VALUE ? 0.0f : IsNaNOrInf(exp(value - max), 0.0f));
   const float sum = LocalSum(result, loc, Temp);
   if(sum == 0.0f)
      result = 0;
   else
      result = IsNaNOrInf(result / sum, 0.0f);
//---
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float LocalSoftMaxGrad(const float value, const float grad, const int loc, __local float* Temp)
  {
   const float y = IsNaNOrInf(value, 0.0f);
   const float g = IsNaNOrInf(grad, 0.0f);
   const float s = LocalSum(y * g, loc, Temp);
//--- d_i = y_i * (g_i - sum_j(y_j * g_j))
   return IsNaNOrInf(y * (g - s), 0.0f);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_ff Feed forward process kernel
/// Describes the forward path process for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
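/// As a sketch of the computed mapping (with the bias stored as the last
/// element of each weight row, per the layout above):
///   matrix_o[i] = f( sum_k( matrix_i[k] * matrix_w[i*(inputs+1)+k] ) + matrix_w[i*(inputs+1)+inputs] )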
//+------------------------------------------------------------------+
__kernel void FeedForward(__global const float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer)
                          __global const float *matrix_i, ///<[in] Inputs tensor
                          __global float *matrix_o,       ///<[out] Output tensor
                          const int inputs,               ///< Number of inputs
                          const int activation            ///< Activation type (#ENUM_ACTIVATION)
                         )
  {
   const int i = get_global_id(0);
   const int total_out = get_global_size(0);
   const int loc = get_local_id(1);
   const int total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   float sum = 0;
   float inp;
   int shift = RCtoFlat(i, 0, total_out, (inputs + 1), 0);
   for(int k = loc; k < inputs; k += total_loc)
     {
      inp = IsNaNOrInf(matrix_i[k], 0.0f);
      if(inp == 0.0f)
         continue;
      sum += IsNaNOrInf(inp * matrix_w[shift + k], 0.0f);
     }
   if(loc == 0)
      sum += IsNaNOrInf(matrix_w[shift + inputs], 0.0f);
   if(total_loc > 1)
      sum = LocalSum(sum, 1, Temp);
//---
   if(loc == 0)
      matrix_o[i] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr Neuron Base Output Gradients Calculation kernel
/// Describes the process of output gradients calculation for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void CalcOutputGradient(__global float *matrix_t,  ///<[in] Target tensor
                                 __global float *matrix_o,  ///<[in] Output tensor
                                 __global float *matrix_ig, ///<[out] Tensor of gradients
                                 int activation,            ///< Activation type (#ENUM_ACTIVATION)
                                 float error)
  {
   int i = get_global_id(0);
   float out = matrix_o[i];
   float temp = 0;
   if(!isnan(out) && !isinf(out))
      temp = Deactivation(matrix_t[i] - out, out, activation);
   matrix_ig[i] = temp;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr Neuron Base Hidden Gradients Calculation kernel
/// Describes the process of hidden gradients calculation for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on the link.
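/// In short, the kernel folds the current layer's gradients back through the
/// transposed weight matrix and then applies the activation derivative:
///   matrix_ig[i] = Deactivation( sum_k( matrix_g[k] * matrix_w[k*(inputs+1)+i] ), matrix_o[i], activation )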
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradient(__global float *matrix_w,  ///<[in] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                 __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                 __global float *matrix_o,  ///<[in] Previous layer Output tensor
                                 __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                 int outputs,               ///< Number of outputs
                                 int activation             ///< Activation type (#ENUM_ACTIVATION)
                                )
  {
   const int i = get_global_id(0);
   const int inputs = get_global_size(0);
   const int loc = get_local_id(1);
   const int total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   float sum = 0;
   float out = matrix_o[i];
   float4 grad, weight;
   for(int k = 4 * loc; k < outputs; k += 4 * total_loc)
     {
      switch(outputs - k)
        {
         case 1:
            weight = (float4)(matrix_w[k * (inputs + 1) + i], 0, 0, 0);
            grad = (float4)(matrix_g[k], 0, 0, 0);
            break;
         case 2:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], 0, 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], 0, 0);
            break;
         case 3:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], matrix_w[(k + 2) * (inputs + 1) + i], 0);
            break;
         default:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], matrix_g[k + 3]);
            weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], matrix_w[(k + 2) * (inputs + 1) + i], matrix_w[(k + 3) * (inputs + 1) + i]);
            break;
        }
      //---
      weight = IsNaNOrInf4(weight, 0);
      grad = IsNaNOrInf4(grad, 0);
      //---
      sum += dot(grad, weight);
     }
   if(total_loc > 1)
      sum = LocalSum(sum, 1, Temp);
//---
   matrix_ig[i] = Deactivation(sum, out, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base SGD Updating Weights Calculation kernel
/// Describes the process of SGD weight optimization for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateWeightsMomentum(__global float *matrix_w,  ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                    __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                    __global float *matrix_i,  ///<[in] Inputs tensor
                                    __global float *matrix_dw, ///<[in,out] Matrix of delta weights from the last correction
                                    int inputs,                ///< Number of inputs
                                    float learning_rates,      ///< Learning rates
                                    float momentum             ///< Momentum multiplier
                                   )
  {
   int i = get_global_id(0);
   int j = get_global_id(1);
   int wi = i * (inputs + 1) + j;
   float grad = clamp(matrix_g[i], -MAX_GRAD, MAX_GRAD);
   float delta = IsNaNOrInf(learning_rates * grad * (j < inputs ? matrix_i[j] : 1), 0) + IsNaNOrInf(momentum * matrix_dw[wi], 0);
   matrix_dw[wi] = delta;
   if(fabs(delta) > 0)
      matrix_w[wi] = IsNaNOrInf(matrix_w[wi] + delta, 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base Adam Updating Weights Calculation kernel
/// Describes the process of Adam weight optimization for the Neuron Base
/// (#CNeuronBaseOCL).
///\details Detailed description on the link.
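/// The update implemented below follows the classic Adam scheme with L1/L2
/// regularization folded into the step (note: the usual bias-correction
/// terms m_t/(1-b1^t) and v_t/(1-b2^t) are omitted):
///   m_t = b1*m + (1-b1)*g
///   v_t = b2*v + (1-b2)*g^2
///   w  += l * ( m_t / (sqrt(v_t) + eps) - (l1*sign(w) + l2*w) ),  eps = 1.0e-37f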
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdam(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                __global const float *matrix_i, ///<[in] Inputs tensor
                                __global float *matrix_m,       ///<[in,out] Matrix of first momentum
                                __global float *matrix_v,       ///<[in,out] Matrix of second momentum
                                const int inputs,               ///< Number of inputs
                                const float l,                  ///< Learning rates
                                const float b1,                 ///< First momentum multiplier
                                const float b2                  ///< Second momentum multiplier
                               )
  {
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int wi = i * (inputs + 1) + j;
   float m, v, weight, inp;
   inp = IsNaNOrInf((j == inputs ? 1.0f : matrix_i[j]), 0);
   weight = IsNaNOrInf(matrix_w[wi], 0);
   m = IsNaNOrInf(matrix_m[wi], 0);
   v = IsNaNOrInf(matrix_v[wi], 0);
//---
   float g = clamp(IsNaNOrInf(matrix_g[i] * inp, 0), -MAX_GRAD, MAX_GRAD);
   float mt = IsNaNOrInf(b1 * m + (1 - b1) * g, 0);
   float vt = IsNaNOrInf(b2 * v + (1 - b2) * (g * g), 0);
   float delta = IsNaNOrInf(l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight)), 0);
   if(fabs(delta) > 0)
      matrix_w[wi] = IsNaNOrInf(matrix_w[wi] + delta, 0);
   matrix_m[wi] = mt;
   matrix_v[wi] = vt;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base Least Squares Updating Weights Calculation kernel
/// Describes the process of Least Squares weight optimization for the Neuron
/// Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateWeightsLS(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                              __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                              __global const float *matrix_i, ///<[in] Inputs tensor
                              __global float *matrix_xg,      ///<[in,out] Matrix of accumulated sums x*g
                              __global float *matrix_xx,      ///<[in,out] Matrix of accumulated sums x*x
                              const int inputs,               ///< Number of inputs
                              const float l,                  ///< Learning rates
                              const int update                ///< Update flag
                             )
  {
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int wi = i * (inputs + 1) + j * 4;
   float4 xg, xx, weight, inp;
   switch(inputs + 1 - j * 4)
     {
      case 0:
         inp = (float4)(1, 0, 0, 0);
         weight = (float4)(matrix_w[wi], 0, 0, 0);
         break;
      case 1:
         inp = (float4)(matrix_i[j * 4], 1, 0, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
         break;
      case 2:
         inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], 1, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
         break;
      case 3:
         inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], 1);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
         break;
      default:
         inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], matrix_i[j * 4 + 3]);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
         break;
     }
   xg = (float4)(matrix_g[i]) * inp;
   xx = inp * inp;
   switch(min(inputs + 1 - j * 4, 3))
     {
      case 3:
         if(update)
           {
            matrix_w[wi + 3] = matrix_w[wi + 3] + l * (matrix_xg[wi + 3] + xg.s3) / (matrix_xx[wi + 3] + xx.s3 + 1.0e-37f);
            matrix_xg[wi + 3] = 0;
            matrix_xx[wi + 3] = 0;
           }
         else
           {
            matrix_xg[wi + 3] += xg.s3;
            matrix_xx[wi + 3] += xx.s3;
           }
      // fall through
      case 2:
         if(update)
           {
            matrix_w[wi + 2] = matrix_w[wi + 2] + l * (matrix_xg[wi + 2] + xg.s2) / (matrix_xx[wi + 2] + xx.s2 + 1.0e-37f);
            matrix_xg[wi + 2] = 0;
            matrix_xx[wi + 2] = 0;
           }
         else
           {
            matrix_xg[wi + 2] += xg.s2;
            matrix_xx[wi + 2] += xx.s2;
           }
      // fall through
      case 1:
         if(update)
           {
            matrix_w[wi + 1] = matrix_w[wi + 1] + l * (matrix_xg[wi + 1] + xg.s1) / (matrix_xx[wi + 1] + xx.s1 + 1.0e-37f);
            matrix_xg[wi + 1] = 0;
            matrix_xx[wi + 1] = 0;
           }
         else
           {
            matrix_xg[wi + 1] += xg.s1;
            matrix_xx[wi + 1] += xx.s1;
           }
      // fall through
      case 0:
         if(update)
           {
            matrix_w[wi] = matrix_w[wi] + l * (matrix_xg[wi] + xg.s0) / (matrix_xx[wi] + xx.s0 + 1.0e-37f);
            matrix_xg[wi] = 0;
            matrix_xx[wi] = 0;
           }
         else
           {
            matrix_xg[wi] += xg.s0;
            matrix_xx[wi] += xx.s0;
           }
         break;
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_proof_ff
/// Kernel of the Pooling neuron for Feed forward process (#CNeuronProofOCL)
//+------------------------------------------------------------------+
__kernel void FeedForwardProof(__global float *matrix_i, ///<[in] Inputs tensor
                               __global float *matrix_o, ///<[out] Output tensor
                               int inputs,               ///< Number of inputs
                               int window,               ///< Size of input window
                               int step                  ///< Step size
                              )
  {
   int i = get_global_id(0);
   int pos = i * step;
   float result = matrix_i[pos];
//---
   for(int k = 1; k < window; k++)
     {
      int shift = k + pos;
      if(shift >= inputs)
         break;
      result = max(result, matrix_i[shift]);
     }
   matrix_o[i] = result;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_proof_gr
/// Kernel of the Pooling neuron to transfer gradient to previous layer
/// (#CNeuronProofOCL)
//+------------------------------------------------------------------+
__kernel void CalcInputGradientProof(__global float *matrix_i,  ///<[in] Inputs tensor
                                     __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                     __global float *matrix_o,  ///<[in] Output tensor
                                     __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                     int outputs,               ///< Number of outputs
                                     int window,                ///< Size of input window
                                     int step                   ///< Step size
                                    )
  {
   int i = get_global_id(0);
   float prev_gradient = 0;
   float value = matrix_i[i];
   int start = i - window + step;
   start = (start - start % step) / step;
   int stop = (i - i % step) / step + 1;
   for(int out = max(0, start); out < min(outputs, stop); out++)
     {
      if(value == matrix_o[out])
         prev_gradient += matrix_g[out];
     }
   matrix_ig[i] = prev_gradient;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_ff
/// Kernel of the Convolution neuron for Feed forward process (#CNeuronConvOCL)
//+------------------------------------------------------------------+
__kernel void FeedForwardConv(__global const float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
                              __global const float *matrix_i, ///<[in] Inputs tensor
                              __global float *matrix_o,       ///<[out] Output tensor
                              const int inputs,               ///< Number of inputs
                              const int step,                 ///< Step size
                              const int window_in,            ///< Size of input window
                              const int window_out,           ///< Size of output window
                              const int activation            ///< Activation type (#ENUM_ACTIVATION)
                             )
  {
   const size_t i = get_global_id(0);
   const int out = get_global_id(1);
   const size_t v = get_global_id(2);
   const size_t outputs = get_global_size(0);
//---
   const int shift_out = window_out * i;
   const int shift_in = step * i;
//---
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * window_out * outputs;
   const int shift_var_w = v * window_out * (window_in + 1);
//---
   float sum = 0;
   float inp;
//---
   int
shift = (window_in + 1) * out;
   int stop = (window_in <= (inputs - shift_in) ? window_in : (inputs - shift_in));
   for(int k = 0; k < stop; k++)
     {
      inp = IsNaNOrInf(matrix_i[shift_var_in + shift_in + k], 0.0f);
      if(inp == 0.0f)
         continue;
      sum += IsNaNOrInf(inp * matrix_w[shift_var_w + shift + k], 0.0f);
     }
   sum += IsNaNOrInf(matrix_w[shift_var_w + shift + window_in], 0.0f);
//---
   matrix_o[shift_var_out + out + shift_out] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_gr
/// Kernel of the Convolution neuron to transfer gradient
/// to previous layer (#CNeuronConvOCL)
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientConv(__global float *matrix_w,  ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
                                     __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                     __global float *matrix_o,  ///<[in] Output tensor
                                     __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                     const int outputs,         ///< Number of outputs
                                     const int step,            ///< Step size
                                     const int window_in,       ///< Size of input window
                                     const int window_out,      ///< Size of output window
                                     const int activation,      ///< Activation type (#ENUM_ACTIVATION)
                                     const int shift_out        ///< Shift in output and gradient buffer
                                    )
  {
   const size_t i = get_global_id(0);
   const size_t inputs = get_global_size(0);
   const size_t v = get_global_id(1);
//---
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * outputs;
   const int shift_var_w = v * window_out * (window_in + 1);
//---
   float sum = 0;
   float out = matrix_o[shift_var_in + i];
   const int w_start = i % step;
   const int start = max((int)((i - window_in + step) / step), 0);
   int stop = (w_start + step - 1) / step;
   stop = min((int)((i + step - 1) / step + 1), stop) + start;
   if(stop > (outputs / window_out))
      stop = outputs / window_out;
   for(int h = 0; h < window_out; h++)
     {
      for(int k = start; k < stop; k++)
        {
         int shift_g = k * window_out + h;
         int shift_w = (stop - k - 1) * step + i % step + h * (window_in + 1);
         if(shift_g >= outputs || shift_w >= (window_in + 1) * window_out)
            break;
         float grad = IsNaNOrInf(matrix_g[shift_out + shift_g + shift_var_out], 0.0f);
         if(fabs(grad) > 0.0f)
            sum += IsNaNOrInf(grad * matrix_w[shift_w + shift_var_w], 0.0f);
        }
     }
//---
   matrix_ig[shift_var_in + i] = Deactivation(sum, out, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron SGD optimization Updating Weights Calculation kernel
/// Describes the process of SGD weight optimization for the Convolution Neuron
/// (#CNeuronConvOCL).
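/// The gradient of each weight is first accumulated over every position of
/// the convolution window, then the classic momentum step is applied:
///   dw_t = learning_rates * g + momentum * dw_(t-1);  w += dw_t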
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvMomentum(__global float *matrix_w,  ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                        __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                        __global float *matrix_i,  ///<[in] Inputs tensor
                                        __global float *matrix_dw, ///<[in,out] Matrix of delta weights from the last correction
                                        int inputs,                ///< Number of inputs
                                        float learning_rates,      ///< Learning rates
                                        float momentum,            ///< Momentum multiplier
                                        int window_in,             ///< Size of input window
                                        int window_out,            ///< Size of output window
                                        int step                   ///< Step size
                                       )
  {
   const size_t i = get_global_id(0);
//---
   const int v = i / ((window_in + 1) * window_out);
   const int shift = i % (window_in + 1);
   const int shift_out = i / (window_in + 1) - v * window_out;
   const int total = (inputs - window_in + step - 1) / step;
//---
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * total * window_out;
//---
   float grad = 0;
//---
   for(int t = 0; t < total; t++)
     {
      if(shift != window_in && (shift + t * window_in) >= inputs)
         break;
      grad += IsNaNOrInf(matrix_g[t * window_out + shift_out + shift_var_out] * (shift == window_in ? 1 : matrix_i[shift + t * step + shift_var_in]), 0.0f);
     }
   float delta = IsNaNOrInf(learning_rates * grad, 0) + momentum * matrix_dw[i];
   if(!isnan(delta))
     {
      matrix_dw[i] = delta;
      if(fabs(delta) > 0)
         matrix_w[i] = IsNaNOrInf(matrix_w[i] + delta, 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating Weights Calculation kernel
/// Describes the process of Adam weight optimization for the Convolution
/// Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvAdam(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                    __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                    __global const float *matrix_i, ///<[in] Inputs tensor
                                    __global float *matrix_m,       ///<[in] Matrix of first momentum
                                    __global float *matrix_v,       ///<[in] Matrix of second momentum
                                    const int inputs,               ///< Number of inputs
                                    const float l,                  ///< Learning rates
                                    const float b1,                 ///< First momentum multiplier
                                    const float b2,                 ///< Second momentum multiplier
                                    int window_in,                  ///< Size of input window
                                    int window_out,                 ///< Size of output window
                                    int step                        ///< Step size
                                   )
  {
   const size_t i = get_global_id(0);
//---
   const int v = i / ((window_in + 1) * window_out);
   const int shift = i % (window_in + 1);
   const int shift_out = i / (window_in + 1) - v * window_out;
   const int total = (inputs - (window_in - step) + (step - 1)) / step;
//---
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * total * window_out;
//---
   float grad = 0;
//---
   for(int t = 0; t < total; t++)
     {
      if(shift != window_in && (shift + t * window_in) >= inputs)
         break;
      grad += IsNaNOrInf(matrix_g[t * window_out + shift_out + shift_var_out] * (shift == window_in ?
                        1 : matrix_i[shift + t * step + shift_var_in]), 0);
     }
   grad = clamp(IsNaNOrInf(grad, 0), -MAX_GRAD, MAX_GRAD);
   float mt = IsNaNOrInf(b1 * matrix_m[i] + (1 - b1) * grad, 0);
   float vt = IsNaNOrInf(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0f);
   float weight = IsNaNOrInf(matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0), 0);
   matrix_w[i] = weight;
   matrix_m[i] = mt;
   matrix_v[i] = vt;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Least Squares optimization Updating Weights Calculation kernel
/// Describes the process of Least Squares weight optimization for the
/// Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvLS(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                  __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                  __global const float *matrix_i, ///<[in] Inputs tensor
                                  __global float *matrix_xg,      ///<[in] Matrix of accumulated sums x*g
                                  __global float *matrix_xx,      ///<[in] Matrix of accumulated sums x*x
                                  const int inputs,               ///< Number of inputs
                                  const float l,                  ///< Learning rates
                                  const int update,               ///< Update flag
                                  int window_in,                  ///< Size of input window
                                  int window_out,                 ///< Size of output window
                                  int step                        ///< Step size
                                 )
  {
   const int i = get_global_id(0);
   if(i > window_in)
      return;
//---
   int total = (inputs - (window_in - step)) % step;
   total = (inputs - (window_in - step) - total) / step + (total > 0 ? 1 : 0);
//---
   for(int out = 0; out < window_out; out++)
     {
      if((window_out - out) > 4)
        {
         float4 xg = {0, 0, 0, 0};
         float x2 = 0;
         int shift_w = i + out * (window_in + 1);
         for(int t = 0; t < total; t++)
           {
            if(i != window_in && (i + t * window_in) >= inputs)
               break;
            xg += (float4)(matrix_g[t * window_out + out], matrix_g[t * window_out + out + 1], matrix_g[t * window_out + out + 2], matrix_g[t * window_out + out + 3]) * (i == window_in ? 1 : matrix_i[i + t * step]);
            x2 += (i == window_in ?
                   1 : matrix_i[i + t * step] * matrix_i[i + t * step]);
           }
         if(update)
           {
            xg = (float4)(matrix_xg[shift_w], matrix_xg[shift_w + window_in + 1], matrix_xg[shift_w + 2 * (window_in + 1)], matrix_xg[shift_w + 3 * (window_in + 1)]) + xg;
            float4 xx = (float4)(matrix_xx[shift_w], matrix_xx[shift_w + window_in + 1], matrix_xx[shift_w + 2 * (window_in + 1)], matrix_xx[shift_w + 3 * (window_in + 1)]) + x2;
            float4 delta = l * xg / (xx + 1.0e-37f);
            float4 weight = (float4)(matrix_w[shift_w], matrix_w[shift_w + (window_in + 1)], matrix_w[shift_w + 2 * (window_in + 1)], matrix_w[shift_w + 3 * (window_in + 1)]) + delta;
            matrix_w[shift_w] = weight.s0;
            matrix_w[shift_w + (window_in + 1)] = weight.s1;
            matrix_w[shift_w + 2 * (window_in + 1)] = weight.s2;
            matrix_w[shift_w + 3 * (window_in + 1)] = weight.s3;
            matrix_xg[shift_w] = 0;
            matrix_xg[shift_w + (window_in + 1)] = 0;
            matrix_xg[shift_w + 2 * (window_in + 1)] = 0;
            matrix_xg[shift_w + 3 * (window_in + 1)] = 0;
            matrix_xx[shift_w] = 0;
            matrix_xx[shift_w + (window_in + 1)] = 0;
            matrix_xx[shift_w + 2 * (window_in + 1)] = 0;
            matrix_xx[shift_w + 3 * (window_in + 1)] = 0;
           }
         else
           {
            matrix_xg[shift_w] += xg.s0;
            matrix_xg[shift_w + (window_in + 1)] += xg.s1;
            matrix_xg[shift_w + 2 * (window_in + 1)] += xg.s2;
            matrix_xg[shift_w + 3 * (window_in + 1)] += xg.s3;
            matrix_xx[shift_w] += x2;
            matrix_xx[shift_w + (window_in + 1)] += x2;
            matrix_xx[shift_w + 2 * (window_in + 1)] += x2;
            matrix_xx[shift_w + 3 * (window_in + 1)] += x2;
           }
         out += 3;
        }
      else
        {
         float xg = 0;
         float xx = 0;
         int shift_w = i + out * (window_in + 1);
         for(int t = 0; t < total; t++)
           {
            if(i != window_in && (i + t * window_in) >= inputs)
               break;
            xg += matrix_g[t * window_out + out] * (i == window_in ? 1 : matrix_i[i + t * step]);
            xx += (i == window_in ? 1 : matrix_i[i + t * step] * matrix_i[i + t * step]);
           }
         if(update)
           {
            xg = matrix_xg[shift_w] + xg;
            xx = matrix_xx[shift_w] + xx;
            float delta = l * xg / (xx + 1.0e-37f);
            matrix_w[shift_w] = matrix_w[shift_w] + delta;
            matrix_xg[shift_w] = 0;
            matrix_xx[shift_w] = 0;
           }
         else
           {
            matrix_xg[shift_w] += xg;
            matrix_xx[shift_w] += xx;
           }
        }
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Score calculation kernel
/// Describes the Score calculation process for the Neuron of attention layer
/// (#CNeuronAttentionOCL).
///\details Detailed description on the link.
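/// Computes one row of score = SoftMax( Q * K^T / sqrt(dimension) ) per
/// work-item; with mask == 1, positions k > q are zeroed (causal masking).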
//+------------------------------------------------------------------+
__kernel void AttentionScore(__global float *querys, ///<[in] Matrix of Querys
                             __global float *keys,   ///<[in] Matrix of Keys
                             __global float *score,  ///<[out] Matrix of Scores
                             int dimension,          ///< Dimension of Key
                             int mask                ///< 1 - calc only previous units, 0 - calc all
                            )
  {
   int q = get_global_id(0);
   int shift_q = q * dimension;
   int units = get_global_size(0);
   int shift_s = q * units;
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   float sum = 0;
//---
   for(int k = 0; k < units; k++)
     {
      if(mask > 0 && k > q)
        {
         score[shift_s + k] = 0;
         continue;
        }
      float result = 0;
      int shift_k = k * dimension;
      for(int i = 0; i < dimension; i++)
         result += (querys[shift_q + i] * keys[shift_k + i]);
      result = IsNaNOrInf(exp(result / koef), 0);
      score[shift_s + k] = result;
      sum += result;
     }
//---
   for(int k = 0; (k < units && sum > 0); k++)
      score[shift_s + k] /= sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Out calculation kernel
/// Describes the Attention out calculation process for the Neuron of attention
/// layer (#CNeuronAttentionOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void AttentionOut(__global float *scores, ///<[in] Matrix of Scores
                           __global float *values, ///<[in] Matrix of Values
                           __global float *inputs, ///<[in] Inputs tensor
                           __global float *out     ///<[out] Output tensor
                          )
  {
   int units = get_global_size(0);
   int u = get_global_id(0);
   int d = get_global_id(1);
   int dimension = get_global_size(1);
   int shift = u * dimension + d;
   float result = 0;
//---
   for(int i = 0; i < units; i++)
      result += IsNaNOrInf(scores[u * units + i], 0) * IsNaNOrInf(values[i * dimension + d], 0);
   out[shift] = IsNaNOrInf(result, 0) + inputs[shift];
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculating the sum of two matrices with a multiplier.
/// Describes the calculation of the sum of two matrices.
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void SumMatrix(__global float *matrix1,    ///<[in] First matrix
                        __global float *matrix2,    ///<[in] Second matrix
                        __global float *matrix_out, ///<[out] Output matrix
                        int dimension,              ///< Dimension of matrix
                        float multiplyer,           ///< Multiplier for output
                        int shift_in1,              ///< Shift for input 1
                        int shift_in2,              ///< Shift for input 2
                        int shift_out               ///< Shift for output
                       )
  {
   const int i = get_global_id(0);
   const int step = get_global_size(0);
//---
   for(int k = 0; k < dimension; k++)
     {
      int index = i * dimension + k;
      matrix_out[i * shift_out + index] = IsNaNOrInf((matrix1[i * shift_in1 + index] + matrix2[i * shift_in2 + index]) * multiplyer, 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculating the sum of five matrices with a multiplier.
/// Describes the calculation of the sum of five matrices.
///\details Detailed description on the link.
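/// Element-wise: matrix_out[i] = (matrix1[i] + matrix2[i] + matrix3[i] + matrix4[i] + matrix5[i]) * multiplyer.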
//+------------------------------------------------------------------+
__kernel void Sum5Matrix(__global float *matrix1,    ///<[in] First matrix
                         __global float *matrix2,    ///<[in] Second matrix
                         __global float *matrix3,    ///<[in] Third matrix
                         __global float *matrix4,    ///<[in] Fourth matrix
                         __global float *matrix5,    ///<[in] Fifth matrix
                         __global float *matrix_out, ///<[out] Output matrix
                         int dimension,              ///< Dimension of matrix
                         float multiplyer            ///< Multiplier for output
                        )
  {
   const int i = get_global_id(0) * dimension;
//---
   for(int k = 0; k < dimension; k++)
      matrix_out[i + k] = (matrix1[i + k] + matrix2[i + k] + matrix3[i + k] + matrix4[i + k] + matrix5[i + k]) * multiplyer;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation kernel
/// Describes the gradients calculation process for the Neuron of attention
/// layer (#CNeuronAttentionOCL).
///\details Detailed description on the link.
/// @param[in] querys Matrix of Querys
/// @param[out] querys_g Matrix of Querys' Gradients
/// @param[in] keys Matrix of Keys
/// @param[out] keys_g Matrix of Keys' Gradients
/// @param[in] values Matrix of Values
/// @param[out] values_g Matrix of Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] gradient Matrix of Gradients from previous iteration
//+------------------------------------------------------------------+
__kernel void AttentionInsideGradients(__global float *querys, __global float *querys_g,
                                       __global float *keys, __global float *keys_g,
                                       __global float *values, __global float *values_g,
                                       __global float *scores, __global float *gradient)
  {
   int u = get_global_id(0);
   int d = get_global_id(1);
   int units = get_global_size(0);
   int dimension = get_global_size(1);
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   float vg = 0;
   float qg = 0;
   float kg = 0;
//---
   for(int iu = 0; iu < units; iu++)
     {
      float g = gradient[iu * dimension + d];
      float sc = scores[iu * units + u];
      vg += sc * g;
      //---
      float sqg = 0;
      float skg = 0;
      for(int id = 0; id < dimension; id++)
        {
         sqg += values[iu * dimension + id] * gradient[u * dimension + id];
         skg += values[u * dimension + id] * gradient[iu * dimension + id];
        }
      qg += (scores[u * units + iu] == 0 || scores[u * units + iu] == 1 ? 0.0001f : scores[u * units + iu] * (1 - scores[u * units + iu])) * sqg * keys[iu * dimension + d] / koef;
      //---
      kg += (scores[iu * units + u] == 0 || scores[iu * units + u] == 1 ? 0.0001f : scores[iu * units + u] * (1 - scores[iu * units + u])) * skg * querys[iu * dimension + d] / koef;
     }
   int shift = u * dimension + d;
   values_g[shift] = clamp(IsNaNOrInf(vg, 0.0f), -1.0f, 1.0f);
   querys_g[shift] = clamp(IsNaNOrInf(qg, 0.0f), -1.0f, 1.0f);
   keys_g[shift] = clamp(IsNaNOrInf(kg, 0.0f), -1.0f, 1.0f);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of matrix normalization process
/// Describes the process of matrix normalization.
///\details Detailed description on the link.
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void Normalize(__global float *buffer, int dimension)
  {
   int n = get_global_id(0);
   int shift = n * dimension;
   if(dimension < 1)
      return;
//---
   float mean = 0;
   float M2 = 0;
   float variance = 0;
//---
   for(int i = 0; i < dimension; i++)
     {
      float val = IsNaNOrInf(buffer[shift + i], 0);
      float delta = val - mean;
      mean += delta / (i + 1);
      M2 += delta * (val - mean);
     }
   variance = M2 / (dimension - 1);
//---
   for(int i = 0; i < dimension; i++)
      if(variance > 1)
         buffer[shift + i] = IsNaNOrInf((buffer[shift + i] - mean) / variance, 0);
      else
         buffer[shift + i] = IsNaNOrInf(buffer[shift + i] - mean, 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of weights matrix normalization process
/// Describes the process of weights matrix normalization.
///\details Detailed description on the link.
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void NormalizeWeights(__global float *buffer, int dimension)
  {
   int n = get_global_id(0);
   int shift = n * dimension;
   float sum = 0;
   float k = 1;
//---
   do
     {
      sum = 0;
      for(int i = 0; (i < dimension && !(isnan(sum) || isinf(sum))); i++)
        {
         float normalized = IsNaNOrInf(buffer[shift + i], 0) / k;
         sum += normalized * normalized / dimension;
        }
      if(isnan(sum) || isinf(sum))
         k *= 10;
     }
   while(isnan(sum) || isinf(sum));
   sum = sqrt(sum);
   if(k * sum > 1)
      for(int i = 0; i < dimension; i++)
         buffer[shift + i] = IsNaNOrInf(buffer[shift + i], 0) / (k * sum);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff
/// Describes the process of concatenating 4 matrices.
///\details Detailed description on the link.
/// @param[in] input1, input2, input3, input4 Input buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[out] output Output buffer
//+------------------------------------------------------------------+
__kernel void ConcatenateBuffers(__global float *input1, int window1,
                                 __global float *input2, int window2,
                                 __global float *input3, int window3,
                                 __global float *input4, int window4,
                                 __global float *output)
  {
   int n = get_global_id(0);
   int shift = n * (window1 + window2 + window3 + window4);
   int shift_in = n * window1;
   for(int i = 0; i < window1; i++)
      output[shift + i] = IsNaNOrInf(input1[shift_in + i], 0);
//---
   shift += window1;
   shift_in = n * window2;
//---
   for(int i = 0; i < window2; i++)
      output[shift + i] = IsNaNOrInf(input2[shift_in + i], 0);
//---
   shift += window2;
   shift_in = n * window3;
//---
   for(int i = 0; i < window3; i++)
      output[shift + i] = IsNaNOrInf(input3[shift_in + i], 0);
//---
   shift += window3;
   shift_in = n * window4;
//---
   for(int i = 0; i < window4; i++)
      output[shift + i] = IsNaNOrInf(input4[shift_in + i], 0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr
/// Describes the process of deconcatenating a matrix.
///\details Detailed description on the link.
/// @param[out] output1, output2, output3, output4 Output buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[in] inputs Input buffer
//+------------------------------------------------------------------+
__kernel void DeconcatenateBuffers(__global float *output1, int window1,
                                   __global float *output2, int window2,
                                   __global float *output3, int window3,
                                   __global float *output4, int window4,
                                   __global float *inputs)
  {
   int n = get_global_id(0);
//--- Head 1
   int shift = n * (window1 + window2 + window3 + window4);
   int shift_out = n * window1;
//---
   for(int i = 0; i < window1; i++)
      output1[shift_out + i] = IsNaNOrInf(inputs[shift + i], 0);
//--- Head 2
   shift += window1;
   shift_out = n * window2;
//---
   for(int i = 0; i < window2; i++)
      output2[shift_out + i] = IsNaNOrInf(inputs[shift + i], 0);
//--- Head 3
   shift += window2;
   if(window3 > 0)
     {
      shift_out = n * window3;
      //---
      for(int i = 0; i < window3; i++)
         output3[shift_out + i] = IsNaNOrInf(inputs[shift + i], 0);
     }
//--- Head 4
   shift += window3;
   if(window4 > 0)
     {
      shift_out = n * window4;
      //---
      for(int i = 0; i < window4; i++)
         output4[shift_out + i] = IsNaNOrInf(inputs[shift + i], 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Multi-Heads Attention Neuron Score calculation kernel
/// Describes the Score calculation process for the Neuron of multi-heads
/// attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void MHAttentionScore(__global float *qkv,   ///<[in] Matrix of Querys, Keys, Values
                               __global float *score, ///<[out] Matrix of Scores
                               int dimension,         ///< Dimension of Key
                               int mask               ///< 1 - calc only previous units, 0 - calc all
                              )
  {
   int q = get_global_id(0);
   int h = get_global_id(1);
   int units = get_global_size(0);
   int heads = get_global_size(1);
//---
   int shift_q = dimension * (h + 3 * q * heads);
   int shift_s = units * (h + q * heads);
//---
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   float sum = 0;
//---
   for(int k = 0; k < units; k++)
     {
      if(mask > 0 && k > q)
        {
         score[shift_s + k] = 0;
         continue;
        }
      float result = 0;
      int shift_k = dimension * (h + heads * (3 * k + 1));
      for(int i = 0; i < dimension; i++)
        {
         if((dimension - i) > 4)
           {
            result += dot(IsNaNOrInf4((float4)(qkv[shift_q + i], qkv[shift_q + i + 1], qkv[shift_q + i + 2], qkv[shift_q + i + 3]), 0),
                          IsNaNOrInf4((float4)(qkv[shift_k + i], qkv[shift_k + i + 1], qkv[shift_k + i + 2], qkv[shift_k + i + 3]), 0));
            i += 3;
           }
         else
            result += IsNaNOrInf(qkv[shift_q + i] * qkv[shift_k + i], 0);
        }
      result = exp(clamp(result / koef, -100.0f, 100.0f));
      if(isnan(result))
         result = 0;
      score[shift_s + k] = result;
      sum += result;
     }
//---
   for(int k = 0; (k < units && sum > 0); k++)
      score[shift_s + k] /= sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Multi-heads Attention Neuron Out calculation kernel
/// Describes the Multi-heads Attention out calculation process for the Neuron
/// of multi-heads attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on the link.
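/// For every head h and unit u the kernel computes
///   out[u,h,d] = sum_v( scores[u,v] * V[v,h,d] ),
/// where V occupies the third block of the combined QKV tensor.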
//+------------------------------------------------------------------+
__kernel void MHAttentionOut(__global float *scores, ///<[in] Matrix of Scores
                             __global float *qkv,    ///<[in] Matrix of Values
                             __global float *out,    ///<[out] Output tensor
                             int dimension           ///< Dimension of Value
                            )
  {
   int u = get_global_id(0);
   int units = get_global_size(0);
   int h = get_global_id(1);
   int heads = get_global_size(1);
//---
   int shift_s = units * (h + heads * u);
   int shift_out = dimension * (h + heads * u);
//---
   for(int d = 0; d < dimension; d++)
     {
      float result = 0;
      for(int v = 0; v < units; v++)
        {
         int shift_v = dimension * (h + heads * (3 * v + 2)) + d;
         result += scores[shift_s + v] * qkv[shift_v];
        }
      out[shift_out + d] = result;
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation kernel
/// Describes the gradients calculation process for the Neuron of attention
/// layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on the link.
/// @param[in] qkv Matrix of Querys, Keys and Values
/// @param[out] qkv_g Matrix of Querys', Keys' and Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] gradient Matrix of Gradients from previous iteration
//+------------------------------------------------------------------+
__kernel void MHAttentionInsideGradients(__global float *qkv, __global float *qkv_g,
                                         __global float *scores, __global float *gradient)
  {
   size_t u = get_global_id(0);
   size_t h = get_global_id(1);
   size_t d = get_global_id(2);
   size_t units = get_global_size(0);
   size_t heads = get_global_size(1);
   size_t dimension = get_global_size(2);
//---
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- init
   const int shift_q = dimension * (heads * 3 * u + h);
   const int shift_k = dimension * (heads * (3 * u + 1) + h);
   const int shift_v = dimension * (heads * (3 * u + 2) + h);
   const int shift_g = dimension * (heads * u + h);
   int shift_score = h * units;
   int step_score = units * heads;
//--- Calculating Value's gradients
   float sum = 0;
//---
   for(int i = 0; i < units; i++)
      sum += gradient[(h + i * heads) * dimension + d] * scores[shift_score + u + i * step_score];
   qkv_g[shift_v + d] = sum;
//--- Calculating Query's gradients
   shift_score = h * units + u * step_score;
   float grad = 0;
   float grad_out = gradient[shift_g + d];
//---
   for(int k = 0; k < units; k++)
     {
      float sc_g = 0;
      float sc = scores[shift_score + k];
      for(int v = 0; v < units; v++)
         sc_g += scores[shift_score + v] * qkv[dimension * (heads * (3 * v + 2) + h)] * grad_out * ((k == v) - sc);
      grad += sc_g / koef * qkv[dimension * (heads * (3 * k + 1) + h) + d];
     }
   qkv_g[shift_q + d] = grad;
//--- Calculating Key's gradients
   grad = 0;
//---
   for(int q = 0; q < units; q++)
     {
      shift_score = h * units + q * step_score;
      float sc_g = 0;
      float sc = scores[shift_score + u];
      float grad_out = gradient[dimension * (heads * q + h) + d];
      for(int v = 0; v < units; v++)
         sc_g += scores[shift_score + v] * qkv[dimension * (heads * (3 * v + 2) + h)] * grad_out * ((u == v) - sc);
      grad += sc_g / koef * qkv[dimension * (heads * 3 * q + h) + d];
     }
   qkv_g[shift_k + d] = grad;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_dropout Kernel for Dropout.
/// Describes the dropout method.
///\details Detailed description on the link.
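/// The kernel reduces to the element-wise product out = inputs * map, where
/// map holds 0 for dropped elements and the keep coefficient (presumably the
/// inverted-dropout scale 1/(1-p)) for the rest.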
//+------------------------------------------------------------------+
__kernel void Dropout(__global const float *inputs, ///<[in] Input matrix
                      __global const float *map,    ///<[in] Dropout map matrix
                      __global float *out,          ///<[out] Output matrix
                      const int dimension           ///< Dimension of matrix
                     )
  {
   const int i = get_global_id(0) * 4;
   float m = 0, inp = 0;
//---
   for(int k = i; k < min(dimension, i + 4); k++)
     {
      m = IsNaNOrInf(map[k], 0.0f);
      if(m == 0)
        {
         out[k] = 0;
         continue;
        }
      inp = IsNaNOrInf(inputs[k], 0.0f);
      if(inp == 0)
        {
         out[k] = 0;
         continue;
        }
      out[k] = IsNaNOrInf(inp * m, 0);
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of Batch normalization process
/// Describes the process of Batch normalization. (#CNeuronBatchNormOCL)
///\details Detailed description on the link.
/// @param[in] inputs Input data tensor
/// @param[in,out] options Tensor of variables
/// @param[out] output Tensor of output data
/// @param[in] batch Batch size
/// @param[in] optimization Optimization type
/// @param[in] activation Activation type
//+------------------------------------------------------------------+
__kernel void BatchFeedForward(__global float *inputs, __global float *options, __global float *output,
                               int batch, int optimization, int activation)
  {
   if(batch <= 1)
      return;
   int n = get_global_id(0);
   int shift = n * (optimization == 0 ? 7 : 9);
//---
   float inp = IsNaNOrInf(inputs[n], 0);
   float mean = (IsNaNOrInf(options[shift], 0) * max((float)batch - 1.0f, 0.0f) + inp) / max((float)batch, 1.0f);
   float delt = inp - mean;
   float variance = (IsNaNOrInf(options[shift + 1] * max((float)batch - 1.0f, 0.0f), 0) + delt * delt) / max((float)batch, 1.0f);
   if(variance <= 0)
      variance = 1;
   float nx = delt / sqrt(variance);
//---
   float gamma = IsNaNOrInf(options[shift + 3], 1);
   if(gamma == 0)
     {
      options[shift + 3] = 1;
      gamma = 1;
     }
   float betta = IsNaNOrInf(options[shift + 4], 0);
//---
   options[shift] = mean;
   options[shift + 1] = variance;
   options[shift + 2] = nx;
   output[n] = fActivation(gamma * nx + betta, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_gr
/// Kernel of the Batch neuron to transfer gradient to previous layer
/// (#CNeuronBatchNormOCL)
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientBatch(__global float *options,   ///<[in] Options matrix m*(7 or 9), where m - Number of neurons in previous layer
                                      __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                      __global float *matrix_i,  ///<[in] Tensor of previous layer output
                                      __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                      int activation,            ///< Activation type (#ENUM_ACTIVATION)
                                      int batch,                 ///< Batch size
                                      int optimization           ///< Optimization type
                                     )
  {
   if(batch <= 1)
      return;
//---
   int n = get_global_id(0);
   int shift = n * (optimization == 0 ? 7 : 9);
//---
   float variance = IsNaNOrInf(options[shift + 1], 1);
//---
   float inp = IsNaNOrInf(matrix_i[n], 0);
   float gnx = IsNaNOrInf(matrix_g[n], 0) * IsNaNOrInf(options[shift + 3], 1);
   float temp = (variance > 0 ? 1.0f / sqrt(variance) : 0);
   float gmu = (-temp) * gnx;
   float gvar = (variance > 0 ?
                 -(inp - IsNaNOrInf(options[shift], 0)) / (2 * pow(variance, 3.0f / 2.0f)) * gnx : 0);
   float batch_ratio = max((float)(batch - 1), 0.0f) / max((float)batch, 1.0f);
   float gx = temp * gnx + gmu / max(batch, 1) + gvar * 2 * (inp - IsNaNOrInf(options[shift], 0)) / max(batch, 1) * batch_ratio * batch_ratio;
//---
   matrix_ig[n] = Deactivation(gx, inp, activation);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_opt Batch normalization Neuron SGD optimization Updating options kernel
/// Describes the process of SGD optimization of options for the Batch
/// normalization Neuron (#CNeuronBatchNormOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateBatchOptionsMomentum(__global float *options,  ///<[in,out] Options matrix m*7, where m - Number of neurons in previous layer
                                         __global float *matrix_g, ///<[in] Tensor of gradients at current layer
                                         float learning_rates,     ///< Learning rates
                                         float momentum            ///< Momentum multiplier
                                        )
  {
   const int n = get_global_id(0);
   const int inputs = get_global_size(0);
   const int shift = n * 7;
   float grad = clamp(IsNaNOrInf(matrix_g[n], 0), -MAX_GRAD, MAX_GRAD);
//---
   float2 delta = learning_rates * grad * (float2)(IsNaNOrInf(options[shift + 2], 0), 1) + momentum * (float2)(IsNaNOrInf(options[shift + 5], 0), IsNaNOrInf(options[shift + 6], 0));
//---
   delta.s0 = IsNaNOrInf(delta.s0, 0);
   delta.s1 = IsNaNOrInf(delta.s1, 0);
   options[shift + 5] = delta.s0;
   float value = IsNaNOrInf(options[shift + 3], 1);
   options[shift + 3] = value + delta.s0 - learning_rates * (l1 * sign(value) + l2 * value / inputs);
//---
   options[shift + 6] = delta.s1;
   value = IsNaNOrInf(options[shift + 4], 0);
   options[shift + 4] = value + delta.s1 - learning_rates * (l1 * sign(value) + l2 * value / inputs);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_opt Batch normalization Neuron Adam optimization Updating options kernel
/// Describes the process of Adam optimization of options for the Batch
/// normalization Neuron (#CNeuronBatchNormOCL).
///\details Detailed description on the link.
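/// Options layout per neuron (9 floats in Adam mode, as read below):
/// [0] mean, [1] variance, [2] normalized value nx, [3] gamma, [4] beta,
/// [5],[6] first moments of gamma/beta, [7],[8] second moments of gamma/beta.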
//+------------------------------------------------------------------+
__kernel void UpdateBatchOptionsAdam(__global float *options,  ///<[in,out] Options matrix m*9, where m - Number of neurons in previous layer
                                     __global float *matrix_g, ///<[in] Tensor of gradients at current layer
                                     const float l,            ///< Learning rates
                                     const float b1,           ///< First momentum multiplier
                                     const float b2            ///< Second momentum multiplier
                                    )
  {
   const int n = get_global_id(0);
   int inputs = get_global_size(0);
   const int shift = n * 9;
   float grad = clamp(IsNaNOrInf(matrix_g[n], 0), -MAX_GRAD, MAX_GRAD);
//---
   float nx = IsNaNOrInf(options[shift + 2], 0);
   float gamma = IsNaNOrInf(options[shift + 3], 1);
   if(gamma == 0)
      gamma = 1;
   float betta = IsNaNOrInf(options[shift + 4], 0);
//---
   float gamma_m1 = IsNaNOrInf(options[shift + 5], 0);
   float betta_m1 = IsNaNOrInf(options[shift + 6], 0);
   float gamma_m2 = IsNaNOrInf(options[shift + 7], 0);
   float betta_m2 = IsNaNOrInf(options[shift + 8], 0);
//---
   float2 mt = b1 * (float2)(gamma_m1, betta_m1) + (1 - b1) * (float2)(grad * nx, grad);
   float2 grad2 = (float2)(grad * nx, grad);
   float2 vt = b2 * (float2)(gamma_m2, betta_m2) + (1 - b2) * (grad2 * grad2);
   vt.s0 = IsNaNOrInf(vt.s0, 1);
   vt.s1 = IsNaNOrInf(vt.s1, 1);
   float2 delta = l * mt / sqrt(vt);
   delta.s0 = IsNaNOrInf(delta.s0, 0);
   delta.s1 = IsNaNOrInf(delta.s1, 0);
   float2 weight = delta - (l1 * sign((float2)(gamma, betta)) + l2 * (float2)(gamma, betta) / inputs);
//---
   options[shift + 3] = IsNaNOrInf(gamma + weight.s0, 1);
   options[shift + 4] = IsNaNOrInf(betta + weight.s1, 0);
   options[shift + 5] = IsNaNOrInf(mt.s0, 0);
   options[shift + 6] = IsNaNOrInf(mt.s1, 0);
   options[shift + 7] = vt.s0;
   options[shift + 8] = vt.s1;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void VAE_FeedForward(__global float *inputs, __global float *random, __global float *outputs)
  {
   uint i = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   outputs[i] = IsNaNOrInf(inputs[i], 0) + IsNaNOrInf(exp(0.5f * inputs[i + total]), 0) * random[i];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void VAE_CalcHiddenGradient(__global float *inputs, __global float *inp_grad,
                                     __global float *random, __global float *gradient,
                                     const float kld_mult)
  {
   uint i = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   float log_var = IsNaNOrInf(inputs[i + total], 0);
   float mean = IsNaNOrInf(inputs[i], 0);
   float kld = kld_mult * 0.5f * (log_var - exp(log_var) - mean * mean + 1);
   float grad = clamp(IsNaNOrInf(gradient[i], 0), -MAX_GRAD, MAX_GRAD);
   inp_grad[i] = IsNaNOrInf(grad / exp(0.5f * log_var) + kld * mean, 0);
   inp_grad[i + total] = IsNaNOrInf(0.5f * (grad * random[i] - kld * (1 - exp(log_var))), 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void LSTM_FeedForward(__global const float *inputs, int inputs_size,
                               __global const float *weights,
                               __global float *concatenated,
                               __global float *memory, __global float *output)
  {
   uint id = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   uint id2 = (uint)get_local_id(1);
   uint idv = (uint)get_global_id(2);
   uint total_v = (uint)get_global_size(2);
//---
   __local float Temp[4];
//---
   float sum = 0;
   uint shift_in = idv * inputs_size;
   uint shift_out = idv * total;
   uint shift =
(inputs_size + total + 1) * (id2 * total + id);
//---
   for(uint i = 0; i < total; i += 4)
     {
      if(total - i > 4)
         sum += IsNaNOrInf(dot((float4)(output[shift_out + i], output[shift_out + i + 1], output[shift_out + i + 2], output[shift_out + i + 3]),
                               (float4)(weights[shift + i], weights[shift + i + 1], weights[shift + i + 2], weights[shift + i + 3])), 0);
      else
         for(uint k = i; k < total; k++)
            sum += IsNaNOrInf(output[shift_out + k] * weights[shift + k], 0);
     }
//---
   shift += total;
//---
   for(uint i = 0; i < inputs_size; i += 4)
     {
      if(inputs_size - i > 4)
         sum += IsNaNOrInf(dot((float4)(inputs[shift_in + i], inputs[shift_in + i + 1], inputs[shift_in + i + 2], inputs[shift_in + i + 3]),
                               (float4)(weights[shift + i], weights[shift + i + 1], weights[shift + i + 2], weights[shift + i + 3])), 0);
      else
         for(uint k = i; k < inputs_size; k++)
            sum += IsNaNOrInf(inputs[shift_in + k] * weights[shift + k], 0);
     }
   sum += IsNaNOrInf(weights[shift + inputs_size], 0);
   if(id2 < 3)
      sum = fActivation(sum, ActFunc_SIGMOID);
   else
      sum = fActivation(sum, ActFunc_TANH);
   Temp[id2] = sum;
   concatenated[4 * shift_out + id2 * total + id] = sum;
//---
   BarrierLoc
   if(id2 == 0)
     {
      float mem = memory[shift_out + id + total_v * total] = memory[shift_out + id];
      float fg = Temp[0];
      float ig = Temp[1];
      float og = Temp[2];
      float nc = Temp[3];
      //---
      memory[shift_out + id] = mem = IsNaNOrInf(mem * fg + ig * nc, 0);
      output[shift_out + id] = IsNaNOrInf(og * fActivation(mem, ActFunc_TANH), 0);
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void LSTM_ConcatenatedGradient(__global float *gradient,
                                        __global float *concatenated_gradient,
                                        __global float *memory,
                                        __global float *concatenated)
  {
   uint id = (uint)get_global_id(0);
   uint total = (uint)get_global_size(0);
   uint idv = (uint)get_global_id(1);
   uint total_v = (uint)get_global_size(1);
//---
   uint shift_out = idv * total;
   float t = tanh(memory[shift_out + id]);
//---
   concatenated_gradient[4 * shift_out + id + 2 * total] = gradient[shift_out + id] * t; // output gate
//---
   float memory_gradient = gradient[shift_out + id] * concatenated[4 * shift_out + id + 2 * total];
   memory_gradient *= 1 - t * t;
//---
   concatenated_gradient[4 * shift_out + id + 3 * total] = memory_gradient * concatenated[4 * shift_out + id + total]; // new content
//---
   concatenated_gradient[4 * shift_out + id + total] = memory_gradient * concatenated[4 * shift_out + id + 3 * total]; // input gate
//---
   concatenated_gradient[4 * shift_out + id] = memory_gradient * memory[shift_out + id + total_v * total]; // forget gate
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void LSTM_HiddenGradient(__global float *concatenated_gradient, __global float *inputs_gradient,
                                  __global float *weights_gradient, __global float *hidden_state,
                                  __global float *inputs, __global float *weights, __global float *output,
                                  const int hidden_size, const int inputs_size)
  {
   uint id = get_global_id(0);
   uint total = get_global_size(0);
   uint idv = (uint)get_global_id(1);
   uint total_v = (uint)get_global_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   uint ls = min(total_v, (uint)LOCAL_ARRAY_SIZE);
//---
   uint shift_in = idv * inputs_size;
   uint shift_out = idv * total;
   uint weights_step = hidden_size + inputs_size + 1;
//---
   for(int i = id; i < (hidden_size + inputs_size); i += total)
     {
      float inp = 0;
      if(i < hidden_size)
        {
         inp = hidden_state[shift_out + i];
         hidden_state[shift_out + i] =
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void LSTM_HiddenGradient(__global float *concatenated_gradient,
                                  __global float *inputs_gradient,
                                  __global float *weights_gradient,
                                  __global float *hidden_state,
                                  __global float *inputs,
                                  __global float *weights,
                                  __global float *output,
                                  const int hidden_size,
                                  const int inputs_size)
  {
   uint id = get_global_id(0);
   uint total = get_global_size(0);
   uint idv = (uint)get_global_id(1);
   uint total_v = (uint)get_global_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   uint ls = min(total_v, (uint)LOCAL_ARRAY_SIZE);
//---
   uint shift_in = idv * inputs_size;
   uint shift_out = idv * total;
   uint weights_step = hidden_size + inputs_size + 1;
//---
   for(int i = id; i < (hidden_size + inputs_size); i += total)
     {
      float inp = 0;
      if(i < hidden_size)
        {
         inp = hidden_state[shift_out + i];
         hidden_state[shift_out + i] = output[shift_out + i];
        }
      else
        {
         inp = inputs[shift_in + i - hidden_size];
         float grad = 0;
         for(uint g = 0; g < 3 * hidden_size; g++)
           {
            float temp = concatenated_gradient[4 * shift_out + g];
            grad += temp * (1 - temp) * weights[i + g * weights_step];
           }
         for(uint g = 3 * hidden_size; g < 4 * hidden_size; g++)
           {
            float temp = concatenated_gradient[4 * shift_out + g];
            grad += temp * (1 - temp * temp) * weights[i + g * weights_step];
           }
         inputs_gradient[shift_in + i - hidden_size] = grad;
        }
      //--- weight gradients, reduced over the variable axis
      for(uint g = 0; g < 3 * hidden_size; g++)
        {
         float temp = concatenated_gradient[4 * shift_out + g];
         if(idv < ls)
            Temp[idv % ls] = 0;
         BarrierLoc
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp) * inp;
            BarrierLoc
           }
         if(idv == 0)
           {
            temp = Temp[0];
            for(int v = 1; v < ls; v++)
               temp += Temp[v];
            weights_gradient[i + g * weights_step] = temp;
           }
         BarrierLoc
        }
      for(uint g = 3 * hidden_size; g < 4 * hidden_size; g++)
        {
         float temp = concatenated_gradient[4 * shift_out + g];
         if(idv < ls)
            Temp[idv % ls] = 0;
         BarrierLoc
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp * temp) * inp;
            BarrierLoc
           }
         if(idv == 0)
           {
            temp = Temp[0];
            for(int v = 1; v < ls; v++)
               temp += Temp[v];
            weights_gradient[i + g * weights_step] = temp;
           }
         BarrierLoc
        }
     }
//--- bias gradients (last column of every weight row)
   for(int i = id; i < 4 * hidden_size; i += total)
     {
      if(idv < ls)
         Temp[idv % ls] = 0;
      BarrierLoc
      float temp = concatenated_gradient[4 * shift_out + i];
      if(i < 3 * hidden_size)
        {
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp);
            BarrierLoc
           }
        }
      else
        {
         for(uint v = 0; v < total_v; v += ls)
           {
            if(idv >= v && idv < v + ls)
               Temp[idv % ls] += temp * (1 - temp * temp);
            BarrierLoc
           }
        }
      if(idv == 0)
        {
         temp = Temp[0];
         for(int v = 1; v < ls; v++)
            temp += Temp[v];
         weights_gradient[(i + 1) * weights_step - 1] = temp;
        }
      BarrierLoc
     }
  }
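//--- Note: all *_UpdateWeightsAdam kernels in this file apply the same rule:
//---   m_t = b1 * m + (1 - b1) * g
//---   v_t = b2 * v + (1 - b2) * g^2
//---   w  += l * (m_t / (sqrt(v_t) + 1.0e-37f) - (l1 * sign(w) + l2 * w))
//--- i.e. Adam without the bias-correction factors (1 - b1^t) and (1 - b2^t),
//--- combined with elastic-net style l1/l2 weight decay.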
///\ingroup LSTM_opt LSTM Adam weights-update kernel
/// Describes the process of Adam-based weight optimization for the LSTM neuron
/// (#CNeuronLSTMOCL).
//+------------------------------------------------------------------+
__kernel void LSTM_UpdateWeightsAdam(__global float *weights,          ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                     __global float *weights_gradient, ///<[in] Tensor of gradients at current layer
                                     __global float *matrix_m,         ///<[in,out] Matrix of first momentum
                                     __global float *matrix_v,         ///<[in,out] Matrix of second momentum
                                     const float l,                    ///< Learning rate
                                     const float b1,                   ///< First momentum multiplier
                                     const float b2                    ///< Second momentum multiplier
                                    )
  {
   const uint id = get_global_id(0);
   const uint total = get_global_size(0);
   const uint id1 = get_global_id(1);
   const uint wi = id1 * total + id;
   float g = clamp(IsNaNOrInf(weights_gradient[wi], 0), -MAX_GRAD, MAX_GRAD);
   float mt = b1 * IsNaNOrInf(matrix_m[wi], 0) + (1 - b1) * g;
   float vt = b2 * IsNaNOrInf(matrix_v[wi], 1) + (1 - b2) * (g * g);
   float weight = IsNaNOrInf(weights[wi], 0);
   float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight / total));
   weights[wi] = IsNaNOrInf(weight + delta, 0);
   matrix_m[wi] = mt;
   matrix_v[wi] = vt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SoftMax_FeedForward(__global float *inputs,
                                  __global float *outputs)
  {
   const uint total = (uint)get_local_size(0);
   const uint l = (uint)get_local_id(0);
   const uint h = (uint)get_global_id(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   uint shift_head = h * total;
//---
   outputs[shift_head + l] = LocalSoftMax(IsNaNOrInf(inputs[shift_head + l], MIN_VALUE), 0, Temp);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SoftMax_HiddenGradient(__global float *outputs,
                                     __global float *output_gr,
                                     __global float *input_gr)
  {
   size_t i = get_local_id(0);
   size_t outputs_total = get_local_size(0);
   size_t h = get_global_id(1);
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   uint shift = h * outputs_total;
   float output = IsNaNOrInf(outputs[shift + i], 0);
   float grad = IsNaNOrInf(output_gr[shift + i], 0);
   input_gr[shift + i] = LocalSoftMaxGrad(output, grad, 0, Temp);
  }
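//--- Note: the kernel below assumes a cross-entropy loss over the softmax
//--- outputs, so the raw output gradient is targets[i] / outputs[i], with a
//--- guard against division by zero.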
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SoftMax_OutputGradient(__global float *outputs,
                                     __global float *targets,
                                     __global float *output_gr)
  {
   size_t i = get_global_id(0);
   output_gr[i] = (outputs[i] == 0 ? 0 : targets[i] / outputs[i]);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FQF_Cosine(__global float *softmax,
                         __global float *output)
  {
   size_t i = get_global_id(0);
   size_t total = get_global_size(0);
   size_t action = get_global_id(1);
   int shift = action * total;
//---
   float result = 0;
//---
   for(int it = 0; it < i; it++)
      result += softmax[shift + it];
   result += softmax[shift + i] / 2.0f;
   output[shift + i] = cos(i * M_PI_F * result);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FQF_Output(__global float *quantiles,
                         __global float *delta_taus,
                         __global float *output,
                         int total)
  {
   size_t action = get_global_id(0);
   int shift = action * total;
//---
   float result = 0;
//---
   for(int i = 0; i < total; i++)
      result += quantiles[shift + i] * delta_taus[shift + i];
   output[action] = result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FQF_OutputGradient(__global float *quantiles,
                                 __global float *delta_taus,
                                 __global float *output_gr,
                                 __global float *quantiles_gr,
                                 __global float *taus_gr)
  {
   size_t i = get_global_id(0);
   size_t total = get_global_size(0);
   size_t action = get_global_id(1);
   int shift = action * total;
//---
   float gradient = output_gr[action];
   quantiles_gr[shift + i] = gradient * delta_taus[shift + i];
   taus_gr[shift + i] = gradient * quantiles[shift + i];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FQF_QuantileGradient(__global float *state_embeding,
                                   __global float *taus_embeding,
                                   __global float *quantiles_gr,
                                   __global float *state_gr,
                                   __global float *taus_gr)
  {
   size_t i = get_global_id(0);
   size_t total = get_global_size(0);
   size_t action = get_global_id(1);
   int shift = action * total;
//---
   float gradient = quantiles_gr[shift + i];
   state_gr[shift + i] = gradient * taus_embeding[shift + i];
   taus_gr[shift + i] = gradient * state_embeding[shift + i];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FQF_CosineGradient(__global float *softmax,
                                 __global float *output_gr,
                                 __global float *softmax_gr)
  {
   size_t i = get_global_id(0);
   size_t total = get_global_size(0);
   size_t action = get_global_id(1);
   int shift = action * total;
//---
   float cumul = 0;
//---
   for(int it = 0; it < i; it++)
      cumul += softmax[shift + it];
   float result = -M_PI_F * i * sin(M_PI_F * i * (cumul + softmax[shift + i] / 2)) * output_gr[shift + i];
//---
   for(int it = i + 1; it < total; it++)
     {
      cumul += softmax[shift + it - 1];
      float temp = cumul + softmax[shift + it] / 2;
      result += -M_PI_F * it * sin(M_PI_F * it * temp) * output_gr[shift + it] * softmax[shift + it] / temp;
     }
   softmax_gr[shift + i] += result;
  }
//+------------------------------------------------------------------+
//| Sparse Attention                                                 |
//+------------------------------------------------------------------+
__kernel void MHSparseAttentionScore(__global float *qkv,   ///<[in] Matrix of Queries, Keys, Values
                                     __global float *score, ///<[out] Matrix of Scores
                                     int dimension,         ///< Dimension of Key
                                     float sparse           ///< Sparsity coefficient (0 < sparse < 1)
                                    )
  {
   int q = get_global_id(0);
   int h =
get_global_id(1); int units = get_global_size(0); int heads = get_global_size(1); //--- int shift_q = dimension * (h + 3 * q * heads); int shift_s = units * (h + q * heads); int active_units = (int)max((float)(units * sparse), min((float)units, 3.0f)); //--- float koef = sqrt((float)dimension); if(koef < 1) koef = 1; float sum = 0.0f; float min_s = 0.0f; float max_s = 0.0f; //--- for(int k = 0; k < units; k++) { float result = 0; int shift_k = dimension * (h + heads * (3 * k + 1)); for(int i = 0; i < dimension; i++) { if((dimension - i) > 4) { result += dot((float4)(qkv[shift_q + i], qkv[shift_q + i + 1], qkv[shift_q + i + 2], qkv[shift_q + i + 3]), (float4)(qkv[shift_k + i], qkv[shift_k + i + 1], qkv[shift_k + i + 2], qkv[shift_k + i + 3])); i += 3; } else result += (qkv[shift_q + i] * qkv[shift_k + i]); } score[shift_s + k] = result; if(k == 0) min_s = max_s = result; else { max_s = max(max_s, result); min_s = min(min_s, result); } } //--- int count = units; //--- while(count > active_units && min_s < max_s) { count = 0; float temp = max_s; for(int k = 0; k < units; k++) { float value = score[shift_s + k]; if(value < min_s) continue; count++; if(value < temp && value > min_s) temp = value; } if(count > active_units) min_s = temp; } //--- if(max_s == 0.0f) max_s = 1.0f; //--- for(int k = 0; k < units; k++) { float value = score[shift_s + k]; if(value < min_s) { score[shift_s + k] = 0.0f; continue; } value = exp(value / max_s / koef); score[shift_s + k] = value; sum += value; } //--- for(int k = 0; (k < units && sum > 1); k++) { float temp = score[shift_s + k]; if(temp == 0.0f) continue; score[shift_s + k] = temp / sum; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHSparseAttentionOut(__global float *scores, ///<[in] Matrix of Scores __global float *qkv, ///<[in] Matrix of Values __global float *out, ///<[out] Output tensor int dimension ///< Dimension of Value ) { int u = get_global_id(0); int units = get_global_size(0); int h = get_global_id(1); int heads = get_global_size(1); //--- int shift_s = units * (h + heads * u); int shift_out = dimension * (h + heads * u); //--- for(int d = 0; d < dimension; d++) { float result = 0; for(int v = 0; v < units; v++) { float cur_score = scores[shift_s + v]; if(cur_score == 0) continue; int shift_v = dimension * (h + heads * (3 * v + 2)) + d; result += cur_score * qkv[shift_v]; } out[shift_out + d] = result; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardMultiModels(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer) __global float *matrix_i, ///<[in] Inputs tensor __global float *matrix_o, ///<[out] Output tensor int inputs, ///< Number of inputs int activation ///< Activation type (#ENUM_ACTIVATION) ) { int i = get_global_id(0); int outputs = get_global_size(0); int m = get_global_id(1); int models = get_global_size(1); //--- float sum = 0; float4 inp, weight; int shift = (inputs + 1) * (i + outputs * m); int shift_in = inputs * m; int shift_out = outputs * m; //--- for(int k = 0; k <= inputs; k = k + 4) { switch(inputs - k) { case 0: inp = (float4)(1, 0, 0, 0); weight = (float4)(matrix_w[shift + k], 0, 0, 0); break; case 1: inp = (float4)(matrix_i[shift_in + k], 1, 0, 0); weight = (float4)(matrix_w[shift + 
k], matrix_w[shift + k + 1], 0, 0); break; case 2: inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], 1, 0); weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], 0); break; case 3: inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], matrix_i[shift_in + k + 2], 1); weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]); break; default: inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], matrix_i[shift_in + k + 2], matrix_i[shift_in + k + 3]); weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]); break; } float d = dot(inp, weight); if(isnan(sum + d)) continue; sum += d; } if(isnan(sum)) sum = 0; //--- matrix_o[shift_out + i] = fActivation(sum, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcHiddenGradientMultiModels(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - number ///< of neurons in previous layer and n - number ///< of neurons in current layer __global float *matrix_g, ///<[in] Tensor of gradients at current layer __global float *matrix_o, ///<[in] Previous layer Output tensor __global float *matrix_ig, ///<[out] Tensor of gradients at previous layer int outputs, ///< Number of outputs int activation, ///< Activation type (#ENUM_ACTIVATION), int model) { int i = get_global_id(0); int inputs = get_global_size(0); int m = get_global_id(1); int models = get_global_size(1); //--- int shift_in = inputs * m; if(model >= 0 && model != m) { matrix_ig[shift_in + i] = 0; return; } //--- int shift_out = outputs * m; int shift_w = (inputs + 1) * outputs * m; float sum = 0; float out = matrix_o[shift_in + i]; float4 grad, weight; //--- for(int k = 0; k < outputs; k += 4) { switch(outputs - k) { case 1: weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], 0, 0, 0); grad = (float4)(matrix_g[shift_out + k], 0, 0, 0); break; case 2: grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], 0, 0); weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], matrix_w[shift_w + (k + 1) * (inputs + 1) + i], 0, 0); break; case 3: grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], matrix_g[shift_out + k + 2], 0); weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], matrix_w[shift_w + (k + 1) * (inputs + 1) + i], matrix_w[shift_w + (k + 2) * (inputs + 1) + i], 0); break; default: grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], matrix_g[shift_out + k + 2], matrix_g[shift_out + k + 3]); weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], matrix_w[shift_w + (k + 1) * (inputs + 1) + i], matrix_w[shift_w + (k + 2) * (inputs + 1) + i], matrix_w[shift_w + (k + 3) * (inputs + 1) + i]); break; } sum += dot(grad, weight); } //--- matrix_ig[shift_in + i] = Deactivation(sum, out, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void UpdateWeightsAdamMultiModels( __global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - ///< number of neurons in previous layer and n - ///< number of neurons in current layer __global const float *matrix_g, ///<[in] Tensor of gradients at current layer __global const float *matrix_i, ///<[in] Inputs tensor __global float *matrix_m, 
///<[in,out] Matrix of first momentum
                                           __global float *matrix_v, ///<[in,out] Matrix of second momentum
                                           const int inputs,         ///< Number of inputs
                                           const float l,            ///< Learning rate
                                           const float b1,           ///< First momentum multiplier
                                           const float b2,           ///< Second momentum multiplier
                                           const int model)
  {
   const int outputs = get_global_size(0);
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   const int wi = (i + outputs * model) * (inputs + 1) + j * 4;
   float4 m, v, weight, inp;
   int shift_in = j * 4 + inputs * model;
   if((inputs + 1 - j * 4) < 0)
      return;
   switch(inputs + 1 - j * 4)
     {
      case 0:
         inp = (float4)(1, 0, 0, 0);
         weight = (float4)(matrix_w[wi], 0, 0, 0);
         m = (float4)(matrix_m[wi], 0, 0, 0);
         v = (float4)(matrix_v[wi], 0, 0, 0);
         break;
      case 1:
         inp = (float4)(matrix_i[shift_in], 1, 0, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], 0, 0);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], 0, 0);
         break;
      case 2:
         inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], 1, 0);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], 0);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], 0);
         break;
      case 3:
         inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], matrix_i[shift_in + 2], 1);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], matrix_m[wi + 3]);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], matrix_v[wi + 3]);
         break;
      default:
         inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], matrix_i[shift_in + 2], matrix_i[shift_in + 3]);
         weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
         m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], matrix_m[wi + 3]);
         v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], matrix_v[wi + 3]);
         break;
     }
   float4 g = (float4)(matrix_g[outputs * model + i]) * inp;
   float4 mt = b1 * m + (1 - b1) * g;
   float4 vt = b2 * v + (1 - b2) * (g * g);
   float4 delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   switch(min(inputs + 1 - j * 4, 3))
     {
      // deliberate fall-through: update all remaining components
      case 3:
         if(fabs(delta.s3) > 0)
            matrix_w[wi + 3] = matrix_w[wi + 3] + delta.s3;
         matrix_m[wi + 3] = mt.s3;
         matrix_v[wi + 3] = vt.s3;
      case 2:
         if(fabs(delta.s2) > 0)
            matrix_w[wi + 2] = matrix_w[wi + 2] + delta.s2;
         matrix_m[wi + 2] = mt.s2;
         matrix_v[wi + 2] = vt.s2;
      case 1:
         if(fabs(delta.s1) > 0)
            matrix_w[wi + 1] = matrix_w[wi + 1] + delta.s1;
         matrix_m[wi + 1] = mt.s1;
         matrix_v[wi + 1] = vt.s1;
      case 0:
         if(fabs(delta.s0) > 0)
            matrix_w[wi] = matrix_w[wi] + delta.s0;
         matrix_m[wi] = mt.s0;
         matrix_v[wi] = vt.s0;
         break;
     }
  }
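//--- Note: the *MultiModels kernels treat the weight matrix as a stack of
//--- independent models: model m owns rows [outputs * m, outputs * (m + 1))
//--- and its own slice of the input and gradient tensors; passing model >= 0
//--- to the gradient kernel restricts backpropagation to that single model.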
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void Concat_FeedForward(__global float *matrix_w,  ///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer)
                                 __global float *matrix_i1, ///<[in] Inputs 1 tensor
                                 __global float *matrix_i2, ///<[in] Inputs 2 tensor
                                 __global float *matrix_o,  ///<[out] Output tensor
                                 int inputs1,               ///< Number of inputs
                                 int inputs2,               ///< Number of inputs
                                 int activation             ///< Activation type (#ENUM_ACTIVATION)
                                )
  {
   int i = get_global_id(0);
   float sum = 0;
   float4 inp, weight;
   int shift = (inputs1 + inputs2 + 1) * i;
//---
   for(int k = 0; k < inputs1; k += 4)
     {
      switch(inputs1 - k)
        {
         case 1:
            inp = (float4)(matrix_i1[k], 0, 0, 0);
            weight = (float4)(matrix_w[shift + k], 0, 0, 0);
            break;
         case 2:
            inp = (float4)(matrix_i1[k], matrix_i1[k + 1], 0, 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
            break;
         case 3:
            inp = (float4)(matrix_i1[k], matrix_i1[k + 1], matrix_i1[k + 2], 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], 0);
            break;
         default:
            inp = (float4)(matrix_i1[k], matrix_i1[k + 1], matrix_i1[k + 2], matrix_i1[k + 3]);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
            break;
        }
      float d = dot(inp, weight);
      if(isnan(sum + d))
         continue;
      sum += d;
     }
//---
   shift += inputs1;
//---
   for(int k = 0; k < inputs2; k += 4)
     {
      switch(inputs2 - k)
        {
         case 1:
            inp = (float4)(matrix_i2[k], 0, 0, 0);
            weight = (float4)(matrix_w[shift + k], 0, 0, 0);
            break;
         case 2:
            inp = (float4)(matrix_i2[k], matrix_i2[k + 1], 0, 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
            break;
         case 3:
            inp = (float4)(matrix_i2[k], matrix_i2[k + 1], matrix_i2[k + 2], 0);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], 0);
            break;
         default:
            inp = (float4)(matrix_i2[k], matrix_i2[k + 1], matrix_i2[k + 2], matrix_i2[k + 3]);
            weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
            break;
        }
      float d = dot(inp, weight);
      if(isnan(sum + d))
         continue;
      sum += d;
     }
   sum += matrix_w[shift + inputs2];
//---
   if(isnan(sum))
      sum = 0;
//---
   matrix_o[i] = fActivation(sum, activation);
  }
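//--- Note: each output neuron of the concatenation layer owns one weight row
//--- laid out as [weights for input 1 | weights for input 2 | bias]; the
//--- gradient kernel below walks the same layout to split the error between
//--- the two source tensors.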
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void Concat_HiddenGradient(__global float *matrix_w,   ///<[in] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                    __global float *matrix_g,   ///<[in] Tensor of gradients at current layer
                                    __global float *matrix_o1,  ///<[in] Previous layer Output tensor
                                    __global float *matrix_o2,  ///<[in] Previous layer Output tensor
                                    __global float *matrix_ig1, ///<[out] Tensor of gradients at previous layer
                                    __global float *matrix_ig2, ///<[out] Tensor of gradients at previous layer
                                    int outputs,                ///< Number of outputs
                                    int inputs1,
                                    int inputs2,
                                    int activation1,            ///< Activation type (#ENUM_ACTIVATION)
                                    int activation2             ///< Activation type (#ENUM_ACTIVATION)
                                   )
  {
   int i = get_global_id(0);
   if(i >= (inputs1 + inputs2))
      return;
   int inputs = inputs1 + inputs2;
   float sum = 0;
   float out = (i < inputs1 ? matrix_o1[i] : matrix_o2[i - inputs1]);
   float4 grad, weight;
//---
   for(int k = 0; k < outputs; k += 4)
     {
      switch(outputs - k)
        {
         case 1:
            weight = (float4)(matrix_w[k * (inputs + 1) + i], 0, 0, 0);
            grad = (float4)(matrix_g[k], 0, 0, 0);
            break;
         case 2:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], 0, 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], 0, 0);
            break;
         case 3:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], 0);
            weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], matrix_w[(k + 2) * (inputs + 1) + i], 0);
            break;
         default:
            grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], matrix_g[k + 3]);
            weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], matrix_w[(k + 2) * (inputs + 1) + i], matrix_w[(k + 3) * (inputs + 1) + i]);
            break;
        }
      sum += dot(grad, weight);
     }
   if(isnan(sum))
      sum = 0;
   if(i < inputs1)
      matrix_ig1[i] = Deactivation(sum, out, activation1);
   else
      matrix_ig2[i - inputs1] = Deactivation(sum, out, activation2);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void Concat_UpdateWeightsMomentum(__global float *matrix_w,  ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                           __global float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                           __global float *matrix_i1, ///<[in] Inputs tensor
                                           __global float *matrix_i2, ///<[in] Inputs tensor
                                           __global float *matrix_dw, ///<[in,out] Matrix of delta weights in last correction
                                           int inputs1,               ///< Number of inputs
                                           int inputs2,               ///< Number of inputs
                                           float learning_rates,      ///< Learning rate
                                           float momentum             ///< Momentum multiplier
                                          )
  {
   int i = get_global_id(0);
   int j = get_global_id(1);
   if(j > (inputs1 + inputs2))
      return;
   int wi = i * (inputs1 + inputs2 + 1) + j;
   float inp = (j < inputs1 ? matrix_i1[j] : ((j - inputs1) < inputs2 ? matrix_i2[j - inputs1] : 1));
   float delta = learning_rates * matrix_g[i] * inp + momentum * matrix_dw[wi];
   if(!isnan(delta))
     {
      matrix_dw[wi] = delta;
      if(fabs(delta) > 0)
         matrix_w[wi] = matrix_w[wi] + delta;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void Concat_UpdateWeightsAdam(__global float *matrix_w,        ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                       __global const float *matrix_g,  ///<[in] Tensor of gradients at current layer
                                       __global const float *matrix_i1, ///<[in] Inputs tensor
                                       __global const float *matrix_i2, ///<[in] Inputs tensor
                                       __global float *matrix_m,        ///<[in,out] Matrix of first momentum
                                       __global float *matrix_v,        ///<[in,out] Matrix of second momentum
                                       const int inputs1,               ///< Number of inputs
                                       const int inputs2,               ///< Number of inputs
                                       const float l,                   ///< Learning rate
                                       const float b1,                  ///< First momentum multiplier
                                       const float b2                   ///< Second momentum multiplier
                                      )
  {
   const int i = get_global_id(0);
   const int j = get_global_id(1);
   if(j > (inputs1 + inputs2))
      return;
   const int wi = i * (inputs1 + inputs2 + 1) + j;
   float inp = (j < inputs1 ? matrix_i1[j] : ((j - inputs1) < inputs2 ? matrix_i2[j - inputs1] : 1));
   float weight = matrix_w[wi];
   float g = matrix_g[i] * inp;
   float mt = b1 * matrix_m[wi] + (1 - b1) * g;
   float vt = b2 * matrix_v[wi] + (1 - b2) * (g * g);
   float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(fabs(delta) > 0)
      matrix_w[wi] = matrix_w[wi] + delta;
   matrix_m[wi] = mt;
   matrix_v[wi] = vt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SoftUpdate(__global float *target,       ///<[in,out] Target matrix
                         __global const float *source, ///<[in] Source matrix
                         const float tau               ///<[in] Soft-update multiplier tau
                        )
  {
   const int i = get_global_id(0);
   target[i] = source[i] * tau + (1.0f - tau) * target[i];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SoftUpdateAdam(__global float *target,
                             __global const float *source,
                             __global float *matrix_m, ///<[in,out] Matrix of first momentum
                             __global float *matrix_v, ///<[in,out] Matrix of second momentum
                             const float tau,          ///<[in] Soft-update multiplier tau
                             const float b1,           ///< First momentum multiplier
                             const float b2            ///< Second momentum multiplier
                            )
  {
   const int i = get_global_id(0);
   float m, v, weight;
   m = matrix_m[i];
   v = matrix_v[i];
   weight = target[i];
   float g = source[i] - weight;
   m = b1 * m + (1 - b1) * g;
   v = b2 * v + (1 - b2) * (g * g);
   float delta = (1 - tau) * m / (v != 0.0f ? sqrt(v) : 1.0f);
   if(fabs(delta) > 0)
      target[i] = weight + delta;
   matrix_m[i] = m;
   matrix_v[i] = v;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SAC_AlphaLogProbs(__global float *outputs,
                                __global float *quantiles,
                                __global float *probs,
                                __global float *alphas,
                                __global float *log_probs,
                                __global float *random,
                                const int count_quants,
                                const int activation)
  {
   const int i = get_global_id(0);
   int shift = i * count_quants;
   float prob = 0;
   float value = 0;
   float sum = 0;
   float rnd = random[i];
//---
   for(int r = 0; r < count_quants; r++)
     {
      prob = probs[shift + r];
      sum += prob;
      if(sum >= rnd || r == (count_quants - 1))
        {
         value = quantiles[shift + r];
         break;
        }
     }
//---
   outputs[i] = fActivation(value, activation);
   log_probs[i] = -alphas[i] * log(prob);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SAC_AlphaGradients(__global float *outputs,
                                 __global float *gradient,
                                 __global float *log_probs,
                                 __global float *alphas_grad,
                                 const int activation)
  {
   const int i = get_global_id(0);
   float out = outputs[i];
//---
   float grad = -gradient[i] * log_probs[i];
//---
   alphas_grad[i] = Deactivation(grad, out, activation);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SAC_OutputGradient(__global float *quantiles,
                                 __global float *delta_taus,
                                 __global float *output_gr,
                                 __global float *quantiles_gr,
                                 __global float *taus_gr,
                                 __global float *output,
                                 const int count_quants,
                                 const int activation)
  {
   size_t action = get_global_id(0);
   int shift = action * count_quants;
   float quant1 = -1e37f;
   float quant2 = 1e37f;
   int pos1 = -1;
   int pos2 = -1;
   float value = output[action];
//---
   for(int i = 0; i < count_quants; i++)
     {
      float quant = fActivation(quantiles[shift + i], activation);
      if(value >=
quant && quant1 < quant) { quant1 = quant; pos1 = shift + i; } if(value < quant && quant2 > quant) { quant2 = quant; pos2 = shift + i; } quantiles_gr[shift + i] = 0.0f; taus_gr[shift + i] = 0.0f; } float gradient = output_gr[action]; if(quant1 > -1e37f) { quantiles_gr[pos1] = gradient * delta_taus[pos1]; taus_gr[pos1] = gradient * quant1; } if(quant2 < 1e37f) { quantiles_gr[pos2] = gradient * delta_taus[pos2]; taus_gr[pos2] = gradient * quant2; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SAC_CalcLogProbs(__global float *outputs, __global float *quantiles, __global float *probs, __global float *alphas, __global float *log_probs, const int count_quants, const int activation) { const int i = get_global_id(0); int shift = i * count_quants; float quant1 = -1e37f; float quant2 = 1e37f; float prob1 = 0; float prob2 = 0; float value = outputs[i]; //--- for(int q = 0; q < count_quants; q++) { float quant = fActivation(quantiles[shift + q], activation); if(value >= quant && quant1 < quant) { quant1 = quant; prob1 = probs[shift + q]; } if(value < quant && quant2 > quant) { quant2 = quant; prob2 = probs[shift + q]; } } //--- float prob = fabs(value - quant1) / fabs(quant2 - quant1); prob = clamp((1 - prob) * prob1 + prob * prob2, 1.0e-3f, 1.0f); log_probs[i] = -alphas[i] * log(prob); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void Embedding(__global float *inputs, __global float *outputs, __global float *weights, __global int *windows, __global float *std, const int stack_size) { const int window_out = get_global_size(0); const int pos = get_global_id(0); const int emb = get_global_id(1); const int emb_total = get_global_size(1); const int shift_out = emb * window_out + pos; const int step = emb_total * window_out; const uint ls = min((uint)get_local_size(0), (uint)LOCAL_ARRAY_SIZE); //--- for(int i = stack_size - 1; i > 0; i--) outputs[i * step + shift_out] = outputs[(i - 1) * step + shift_out]; int shift_in = 0; //--- for(int i = 0; i < emb; i++) shift_in += windows[i]; const int window_in = windows[emb]; const int shift_weights = (shift_in + emb) * window_out + (window_in + 1) * pos; //--- __local float temp[LOCAL_ARRAY_SIZE]; if(pos < LOCAL_ARRAY_SIZE) temp[pos] = 0; BarrierLoc //--- float value = weights[shift_weights + window_in]; //--- for(int i = 0; i < window_in; i++) value += inputs[shift_in + i] * weights[shift_weights + i]; //--- for(int i = 0; i < window_out; i += ls) { if(pos >= i && pos < (i + ls)) temp[pos % ls] += value; BarrierLoc } //--- int count = ls; do { count = (count + 1) / 2; if(pos + count < ls) { if(pos < count) temp[pos] += temp[pos + count]; temp[pos + count] = 0; } BarrierLoc } while(count > 1); //--- value -= temp[0] / (float)window_out; BarrierLoc //--- if(pos < LOCAL_ARRAY_SIZE) temp[pos] = 0; BarrierLoc //--- for(int i = 0; i < window_out; i += ls) { if(pos >= i && pos < (i + ls)) temp[pos % ls] += (value * value) / (float)window_out; BarrierLoc } //--- count = ls; do { count = (count + 1) / 2; if(pos + count < ls) { if(pos < count) temp[pos] += temp[pos + count]; temp[pos + count] = 0; } BarrierLoc } while(count > 1); //--- if(temp[0] > 0) value /= sqrt(temp[0]); //--- outputs[shift_out] = value; if(pos == 0) std[emb] = sqrt(temp[0]); } //+------------------------------------------------------------------+ //| | 
//+------------------------------------------------------------------+
__kernel void EmbeddingHiddenGradient(__global float *inputs_gradient,
                                      __global float *outputs_gradient,
                                      __global float *weights,
                                      __global int *windows,
                                      __global float *std,
                                      const int window_out)
  {
   const int pos = get_global_id(0);
   int emb = -1;
   int count = 0;
   do
     {
      emb++;
      count += windows[emb];
     }
   while(count <= pos);
   const int window_in = windows[emb];
   const int shift_out = emb * window_out;
   const int shift_weights = (pos - (count - window_in)) + (count - window_in + emb) * window_out;
//---
   float value = 0;
//---
   for(int i = 0; i < window_out; i++)
      value += outputs_gradient[shift_out + i] * weights[shift_weights + i * (window_in + 1)];
   float s = std[emb];
   if(s > 0)
      value /= s;
//---
   inputs_gradient[pos] = value;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void EmbeddingUpdateWeightsAdam(__global float *weights,        ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                         __global const float *gradient, ///<[in] Tensor of gradients at current layer
                                         __global const float *inputs,   ///<[in] Inputs tensor
                                         __global float *matrix_m,       ///<[in,out] Matrix of first momentum
                                         __global float *matrix_v,       ///<[in,out] Matrix of second momentum
                                         __global int *windows,
                                         __global float *std,
                                         const int window_out,
                                         const float l,  ///< Learning rate
                                         const float b1, ///< First momentum multiplier
                                         const float b2  ///< Second momentum multiplier
                                        )
  {
   const int i = get_global_id(0);
   int emb = -1;
   int count = 0;
   int shift = 0;
   int window_in = 0;
   do
     {
      emb++;
      shift = count;
      window_in = windows[emb];
      count += (window_in + 1) * window_out;
     }
   while(count <= i);
   const int shift_out = emb * window_out;
   const int out_pos = (i - shift) / (window_in + 1); // output neuron this weight belongs to
   int shift_in = shift / window_out - emb;
   shift = (i - shift) % (window_in + 1);
   float inp = 1.0f;
   if(shift < window_in)
      inp = inputs[shift_in + shift];
//---
   float weight = weights[i];
   float g = gradient[shift_out + out_pos] * inp;
   float s = std[emb];
   if(s > 0)
      g /= s;
   float mt = b1 * matrix_m[i] + (1 - b1) * g;
   float vt = b2 * matrix_v[i] + (1 - b2) * (g * g);
   float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(fabs(delta) > 0)
      weights[i] = weights[i] + delta;
   matrix_m[i] = mt;
   matrix_v[i] = vt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void Transpose(__global float *matrix_in, ///<[in] Input matrix
                        __global float *matrix_out ///<[out] Output matrix
                       )
  {
   const int r = get_global_id(0);
   const int c = get_global_id(1);
   const int rows = get_global_size(0);
   const int cols = get_global_size(1);
//---
   matrix_out[c * rows + r] = matrix_in[r * cols + c];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void MH2AttentionOut(__global float *q,     ///<[in] Matrix of Queries
                              __global float *kv,    ///<[in] Matrix of Keys and Values
                              __global float *score, ///<[out] Matrix of Scores
                              __global float *out,   ///<[out] Matrix of attention
                              int dimension,         ///< Dimension of Key
                              int heads_kv,
                              int mask               ///< 1 - causal mask (attend only to previous units), 0 - attend to all
                             )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_local_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h);
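//--- Note: this kernel implements grouped-query attention: the query heads
//--- share a smaller set of heads_kv key/value heads, selected through
//--- h_kv = h % heads_kv; the kv buffer interleaves the K and V blocks per
//--- position, hence the 2 * heads_kv stride in shift_k and shift_v below.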
const int shift_k = dimension * (2 * heads_kv * k + h_kv); const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv); const int shift_s = kunits * (q_id * heads + h) + k; const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE); float koef = sqrt((float)dimension); if(koef < 1) koef = 1; __local float temp[LOCAL_ARRAY_SIZE]; //--- Score float sum = MIN_VALUE; if(mask == 0 || q_id >= k) { sum = 0; for(int d = 0; d < dimension; d++) sum += q[shift_q + d] * kv[shift_k + d]; } float sc = LocalSoftMax(IsNaNOrInf(sum / koef, MIN_VALUE), 1, temp); score[shift_s] = sc; //--- out for(int d = 0; d < dimension; d++) { BarrierLoc sum = LocalSum(IsNaNOrInf(kv[shift_v + d ] * sc, 0), 1, temp); //--- if(k == 0) out[shift_q + d] = sum; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MH2AttentionInsideGradients(__global float *q, __global float *q_g, __global float *kv, __global float *kv_g, __global float *scores, __global float *gradient, int kunits, int heads_kv) { //--- init const int q_id = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int shift_q = dimension * (q_id * heads + h) + d; const int shift_s = q_id * kunits * heads + h * kunits; const int shift_g = h * dimension + d; float koef = sqrt((float)dimension); if(koef < 1) koef = 1; //--- Calculating Value's gradients int step_score = kunits * heads; if(h < heads_kv) { //--- for(int v = q_id; v < kunits; v += qunits) { float grad = 0; for(int hq = h; hq < heads; hq += heads_kv) { int shift_score = hq * kunits + v; for(int g = 0; g < qunits; g++) grad += gradient[shift_g + dimension * (hq - h + g * heads)] * scores[shift_score + g * step_score]; } int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d; kv_g[shift_v] = grad; } } //--- Calculating Query's gradients float grad = 0; float out_g = gradient[shift_g + q_id * dimension]; int shift_val = (heads_kv + h_kv) * dimension + d; int shift_key = h_kv * dimension + d; //--- for(int k = 0; k < kunits; k++) { float sc_g = 0; float sc = scores[shift_s + k]; if(sc == 0) continue; for(int v = 0; v < kunits; v++) sc_g += scores[shift_s + v] * out_g * kv[shift_val + 2 * v * heads_kv * dimension] * ((float)(k == v) - sc); grad += sc_g * kv[shift_key + 2 * k * heads_kv * dimension]; } q_g[shift_q] = grad / koef; //--- Calculating Key's gradients if(h < heads_kv) { //--- for(int k = q_id; k < kunits; k += qunits) { int shift_k = dimension * (2 * heads_kv * k + h_kv) + d; grad = 0; for(int hq = h; hq < heads; hq++) { int shift_score = hq * kunits + k; float val = kv[shift_k + heads_kv * dimension]; for(int scr = 0; scr < qunits; scr++) { float sc_g = 0; int shift_sc = scr * kunits * heads; float sc = scores[shift_sc + k]; if(sc == 0) continue; for(int v = 0; v < kunits; v++) sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] * val * ((float)(k == v) - sc); grad += sc_g * q[shift_q + scr * dimension]; } } kv_g[shift_k] = grad / koef; } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CGConv_HiddenGradient(__global const float *matrix_g, ///<[in] Tensor of gradients at current layer __global const float *matrix_f, ///<[in] Previous layer Output 
tensor __global const float *matrix_s, ///<[in] Previous layer Output tensor __global float *matrix_fg, ///<[out] Tensor of gradients at previous layer __global float *matrix_sg, ///<[out] Tensor of gradients at previous layer const int activationf, ///< Activation type (#ENUM_ACTIVATION) const int activations ///< Activation type (#ENUM_ACTIVATION) ) { int i = get_global_id(0); //--- float grad = matrix_g[i]; float f = matrix_f[i]; float s = matrix_s[i]; //--- float sg = grad * f; float fg = grad * s; //--- matrix_fg[i] = Deactivation(fg, f, activationf); matrix_sg[i] = Deactivation(sg, s, activations); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void XCiTFeedForward(__global float *qkv, __global float *score, __global float *out) { const size_t d = get_local_id(0); const size_t dimension = get_local_size(0); const size_t u = get_local_id(1); const size_t units = get_local_size(1); const size_t h = get_global_id(2); const size_t heads = get_global_size(2); //--- const uint ls_u = min((uint)units, (uint)LOCAL_ARRAY_SIZE); const uint ls_d = min((uint)dimension, (uint)LOCAL_ARRAY_SIZE); __local float q[LOCAL_ARRAY_SIZE][LOCAL_ARRAY_SIZE]; __local float k[LOCAL_ARRAY_SIZE][LOCAL_ARRAY_SIZE]; //--- Normalize Query and Key for(int cur_d = 0; cur_d < dimension; cur_d += ls_d) { float q_val = 0; float k_val = 0; //--- if(d < ls_d && (cur_d + d) < dimension && u < ls_u) { for(int count = u; count < units; count += ls_u) { int shift = count * dimension * heads * 3 + dimension * h + cur_d + d; q_val += qkv[shift] * qkv[shift]; k_val += qkv[shift + dimension * heads] * qkv[shift + dimension * heads]; } q[u][d] = q_val; k[u][d] = k_val; } BarrierLoc //--- uint count = ls_u; do { count = (count + 1) / 2; if(d < ls_d) { if(u < ls_u && u < count && (u + count) < units) { float q_val = q[u][d] + q[u + count][d]; float k_val = k[u][d] + k[u + count][d]; q[u + count][d] = 0; k[u + count][d] = 0; q[u][d] = q_val; k[u][d] = k_val; } } BarrierLoc } while(count > 1); //--- int shift = u * dimension * heads * 3 + dimension * h + cur_d; qkv[shift] = qkv[shift] / sqrt(q[0][d]); qkv[shift + dimension * heads] = qkv[shift + dimension * heads] / sqrt(k[0][d]); BarrierLoc } //--- Score int step = dimension * heads * 3; //--- for(int cur_r = 0; cur_r < dimension; cur_r += ls_u) { for(int cur_d = 0; cur_d < dimension; cur_d += ls_d) { if(u < ls_d && d < ls_d) q[u][d] = 0; BarrierLoc //--- if((cur_r + u) < ls_d && (cur_d + d) < ls_d) { int shift_q = dimension * h + cur_d + d; int shift_k = dimension * (heads + h) + cur_r + u; float scr = 0; for(int i = 0; i < units; i++) scr += qkv[shift_q + i * step] * qkv[shift_k + i * step]; scr = exp(scr / sqrt((float)units)); score[(cur_r + u) * dimension * heads + dimension * h + cur_d + d] = scr; q[u][d] += scr; } } BarrierLoc //--- int count = ls_d; do { count = (count + 1) / 2; if(u < ls_d) { if(d < ls_d && d < count && (d + count) < dimension) q[u][d] += q[u][d + count]; if(d + count < ls_d) q[u][d + count] = 0; } BarrierLoc } while(count > 1); //--- if((cur_r + u) < ls_d) score[(cur_r + u) * dimension * heads + dimension * h + d] /= q[u][0]; BarrierLoc } //--- int shift_out = dimension * (u * heads + h) + d; int shift_s = dimension * (heads * d + h); int shift_v = dimension * (heads * (u * 3 + 2) + h); float sum = 0; //--- for(int i = 0; i < dimension; i++) sum += qkv[shift_v + i] * score[shift_s + i]; out[shift_out] = sum; } 
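//--- Note: the kernel above follows the cross-covariance attention idea of
//--- XCiT: Query and Key are L2-normalized along the token axis and the
//--- dimension x dimension score matrix is built over feature channels rather
//--- than over tokens, so the attention map does not grow with sequence length.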
//+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void XCiTInsideGradients(__global float *qkv, __global float *qkv_g, __global float *scores, __global float *gradient) { //--- init const int q = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int units = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int shift_q = dimension * (heads * 3 * q + h); const int shift_k = dimension * (heads * (3 * q + 1) + h); const int shift_v = dimension * (heads * (3 * q + 2) + h); const int shift_g = dimension * (heads * q + h); int shift_score = dimension * h; int step_score = dimension * heads; //--- Calculating Value's gradients float sum = 0; //--- for(int i = 0; i < dimension; i++) sum += gradient[shift_g + i] * scores[shift_score + d + i * step_score]; qkv_g[shift_v + d] = sum; //--- Calculating Query's gradients float grad = 0; float val = qkv[shift_v + d]; //--- for(int k = 0; k < dimension; k++) { float sc_g = 0; float sc = scores[shift_score + k]; for(int v = 0; v < dimension; v++) sc_g += scores[shift_score + v] * val * gradient[shift_g + v * dimension] * ((float)(k == v) - sc); grad += sc_g * qkv[shift_k + k]; } qkv_g[shift_q + d] = grad / sqrt((float)units); //--- Calculating Key's gradients grad = 0; float out_g = gradient[shift_g]; //--- for(int scr = 0; scr < dimension; scr++) { float sc_g = 0; int shift_sc = scr * dimension * heads; float sc = scores[shift_sc + d]; for(int v = 0; v < dimension; v++) sc_g += scores[shift_sc + v] * out_g * qkv[shift_v + v] * ((float)(d == v) - sc); grad += sc_g * qkv[shift_q + scr]; } qkv_g[shift_k + d] = grad / sqrt((float)units); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DOTFeedForward(__global float *qkv, __global float *score, __global float *rpb, __global float *out) { const size_t d = get_local_id(0); const size_t dimension = get_local_size(0); const size_t u = get_global_id(1); const size_t units = get_global_size(1); const size_t h = get_global_id(2); const size_t heads = get_global_size(2); //--- uint step = 3 * dimension * heads; uint start = max((int)u - 1, 0); uint stop = min((int)u + 1, (int)units - 1); uint shift_q = u * step + h * dimension; uint shift_k = start * step + dimension * (heads + h); uint shift_score = u * 3 * heads; //--- const uint ls_d = min((uint)dimension, (uint)LOCAL_ARRAY_SIZE); __local float temp[LOCAL_ARRAY_SIZE][3]; //--- Score if(d < ls_d) { //--- for(uint pos = start; pos <= stop; pos++) temp[d][pos - start] = 0; //--- for(uint dim = d; dim < dimension; dim += ls_d) { float q = qkv[shift_q + dim]; for(uint pos = start; pos <= stop; pos++) { uint i = pos - start; temp[d][i] = temp[d][i] + q * qkv[shift_k + i * step + dim]; } } BarrierLoc //--- int count = ls_d; //--- do { count = (count + 1) / 2; if(d < count && (d + count) < dimension) for(uint i = 0; i <= (stop - start); i++) { temp[d][i] += temp[d + count][i]; temp[d + count][i] = 0; } BarrierLoc } while(count > 1); } //--- if(d == 0) { float sum = 0; //--- for(uint i = 0; i <= (stop - start); i++) { temp[0][i] = exp(temp[0][i] + rpb[shift_score + i]); sum += temp[0][i]; } //--- for(uint i = 0; i <= (stop - start); i++) { temp[0][i] = temp[0][i] / sum; score[shift_score + i] = temp[0][i]; } } BarrierLoc //--- int shift_out = dimension * (u * heads 
+ h) + d;
   int shift_v = dimension * (heads * (u * 3 + 2) + h);
   float sum = 0;
//---
   for(uint i = 0; i <= (stop - start); i++)
      sum += qkv[shift_v + i] * temp[0][i];
   out[shift_out] = sum;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void DOTInsideGradients(__global float *qkv,
                                 __global float *qkv_g,
                                 __global float *scores,
                                 __global float *rpb,
                                 __global float *rpb_g,
                                 __global float *gradient)
  {
//--- init
   const uint u = get_global_id(0);
   const uint d = get_global_id(1);
   const uint h = get_global_id(2);
   const uint units = get_global_size(0);
   const uint dimension = get_global_size(1);
   const uint heads = get_global_size(2);
//---
   uint step = 3 * dimension * heads;
   uint start = max((int)u - 1, 0);
   uint stop = min((int)u + 1, (int)units - 1);
   const uint shift_q = u * step + dimension * h + d;
   const uint shift_k = u * step + dimension * (heads + h) + d;
   const uint shift_v = u * step + dimension * (2 * heads + h) + d;
//--- Calculating Value's gradients
   float sum = 0;
//---
   for(uint i = start; i <= stop; i++)
     {
      int shift_score = i * 3 * heads;
      if(u == i)
        {
         shift_score += (uint)(u > 0);
        }
      else
        {
         if(u > i)
            shift_score += (uint)(start > 0) + 1;
        }
      uint shift_g = dimension * (i * heads + h) + d;
      sum += gradient[shift_g] * scores[shift_score];
     }
   qkv_g[shift_v] = sum;
//--- Calculating Query's gradients
   float grad = 0;
   uint shift_score = u * heads * 3;
//---
   for(int k = start; k <= stop; k++)
     {
      float sc_g = 0;
      float sc = scores[shift_score + k - start];
      for(int v = start; v <= stop; v++)
         for(int dim = 0; dim < dimension; dim++)
            sc_g += scores[shift_score + v - start] * qkv[v * step + dimension * (2 * heads + h) + dim] *
                    gradient[dimension * (u * heads + h) + dim] * ((float)(k == v) - sc);
      grad += sc_g * qkv[k * step + dimension * (heads + h) + d];
      if(d == 0)
         rpb_g[shift_score + k - start] = sc_g;
     }
   qkv_g[shift_q] = grad;
//--- Calculating Key's gradients
   grad = 0;
//---
   for(int q = start; q <= stop; q++)
     {
      float sc_g = 0;
      shift_score = q * heads * 3;
      if(u == q)
         shift_score += (uint)(u > 0);
      else
        {
         if(u > q)
            shift_score += (uint)(start > 0) + 1;
        }
      float sc = scores[shift_score];
      for(int v = start; v <= stop; v++)
        {
         shift_score = v * heads * 3;
         if(u == v)
            shift_score += (uint)(u > 0);
         else
           {
            if(u > v)
               shift_score += (uint)(start > 0) + 1;
           }
         for(int dim = 0; dim < dimension; dim++)
            sc_g += scores[shift_score] * qkv[shift_v - d + dim] * gradient[dimension * (v * heads + h) + d] * ((float)(d == v) - sc);
        }
      grad += sc_g * qkv[q * step + dimension * h + d];
     }
   qkv_g[shift_k] = grad;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void RPBUpdateAdam(__global float *target,
                            __global const float *gradient,
                            __global float *matrix_m, ///<[in,out] Matrix of first momentum
                            __global float *matrix_v, ///<[in,out] Matrix of second momentum
                            const float b1,           ///< First momentum multiplier
                            const float b2            ///< Second momentum multiplier
                           )
  {
   const int i = get_global_id(0);
   float m, v, weight;
   m = matrix_m[i];
   v = matrix_v[i];
   weight = target[i];
   float g = gradient[i];
   m = b1 * m + (1 - b1) * g;
   v = b2 * v + (1 - b2) * (g * g);
   float delta = m / (v != 0.0f ? sqrt(v) : 1.0f);
   target[i] = weight + delta;
   matrix_m[i] = m;
   matrix_v[i] = v;
  }
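//--- Note: the DOT kernels implement local attention: every unit attends only
//--- to itself and its immediate neighbours (start = max(u - 1, 0),
//--- stop = min(u + 1, units - 1)), and a learnable relative position bias
//--- (rpb) is added to the raw scores before the softmax. RPBUpdateAdam trains
//--- that bias with an Adam-style step without a learning-rate factor.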
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void GTEFeedForward(__global float *qkv,
                             __global float *score,
                             __global float *out,
                             int dimension)
  {
   const size_t cur_q = get_global_id(0);
   const size_t units_q = get_global_size(0);
   const size_t cur_k = get_local_id(1);
   const size_t units_k = get_local_size(1);
   const size_t h = get_global_id(2);
   const size_t heads = get_global_size(2);
//---
   int shift_q = dimension * (cur_q + h * units_q);
   int shift_k = (cur_k + h * units_k + heads * units_q);
   int shift_v = dimension * (h * units_k + heads * (units_q + units_k));
   int shift_score_con = units_k * (cur_q * 2 * heads + h) + cur_k;
   int shift_score_notcon = units_k * (cur_q * 2 * heads + heads + h) + cur_k;
   int shift_out_con = dimension * (cur_q + h * units_q);
   int shift_out_notcon = dimension * (cur_q + units_q * (h + heads));
//---
   const uint ls_score = min((uint)units_k, (uint)LOCAL_ARRAY_SIZE);
   __local float local_score[LOCAL_ARRAY_SIZE][2];
//--- Score
   float scr = 0;
//---
   for(int d = 0; d < dimension; d++)
      scr += qkv[shift_q + d] * qkv[shift_k + d];
   scr = exp(min(scr / sqrt((float)dimension), 30.0f));
   if(cur_q == cur_k)
     {
      score[shift_score_con] = scr;
      score[shift_score_notcon] = scr;
      if(cur_k < ls_score)
        {
         local_score[cur_k][0] = scr;
         local_score[cur_k][1] = scr;
        }
     }
   else
     {
      if(abs((int)cur_q - (int)cur_k) == 1)
        {
         score[shift_score_con] = scr;
         score[shift_score_notcon] = 0;
         if(cur_k < ls_score)
           {
            local_score[cur_k][0] = scr;
            local_score[cur_k][1] = 0;
           }
        }
      else
        {
         score[shift_score_con] = 0;
         score[shift_score_notcon] = scr;
         if(cur_k < ls_score)
           {
            local_score[cur_k][0] = 0;
            local_score[cur_k][1] = scr;
           }
        }
     }
   BarrierLoc
//---
   for(int k = ls_score; k < units_k; k += ls_score)
     {
      if((cur_k + k) < units_k)
        {
         local_score[cur_k][0] += score[shift_score_con + k];
         local_score[cur_k][1] += score[shift_score_notcon + k];
        }
     }
   BarrierLoc
//---
   int count = ls_score;
   do
     {
      count = (count + 1) / 2;
      if(cur_k < count)
        {
         if((cur_k + count) < units_k)
           {
            local_score[cur_k][0] += local_score[cur_k + count][0];
            local_score[cur_k][1] += local_score[cur_k + count][1];
            local_score[cur_k + count][0] = 0;
            local_score[cur_k + count][1] = 0;
           }
        }
      BarrierLoc
     }
   while(count > 1);
   BarrierLoc
//---
   score[shift_score_con] /= local_score[0][0];
   score[shift_score_notcon] /= local_score[0][1];
   BarrierLoc
//---
   shift_score_con -= cur_k;
   shift_score_notcon -= cur_k;
//---
   for(int d = 0; d < dimension; d += ls_score)
     {
      if((cur_k + d) < dimension)
        {
         float sum_con = 0;
         float sum_notcon = 0;
         for(int v = 0; v < units_k; v++)
           {
            sum_con += qkv[shift_v + v * dimension + cur_k + d] * score[shift_score_con + v];
            sum_notcon += qkv[shift_v + v * dimension + cur_k + d] * score[shift_score_notcon + v];
           }
         out[shift_out_con + cur_k + d] = sum_con;
         out[shift_out_notcon + cur_k + d] = sum_notcon;
        }
     }
  }
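//--- Note: GTEFeedForward keeps two separately softmax-normalized score
//--- matrices per head - a "connected" one restricted to a unit and its direct
//--- neighbours (|cur_q - cur_k| <= 1) and a "not connected" one for all other
//--- units - and emits the two attention results as stacked output blocks.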
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void GTEInsideGradients(__global float *qkv, __global float *qkv_g,
                                 __global float *scores, __global float *gradient)
  {
//--- init
   const uint u = get_global_id(0);
   const uint d = get_global_id(1);
   const uint h = get_global_id(2);
   const uint units = get_global_size(0);
   const uint dimension = get_global_size(1);
   const uint heads = get_global_size(2);
//--- Calculating Value's gradients
     {
      int shift_out_con = dimension * h * units + d;
      int shift_out_notcon = dimension * units * (h + heads) + d;
      int shift_score_con = units * h + u;
      int shift_score_notcon = units * (heads + h) + u;
      int step_score = units * 2 * heads;
      int shift_v = dimension * (h * units + 2 * heads * units + u) + d;
      //---
      float sum = 0;
      //---
      for(uint i = 0; i < units; i++)
        {
         sum += gradient[shift_out_con + i * dimension] * scores[shift_score_con + i * step_score];
         sum += gradient[shift_out_notcon + i * dimension] * scores[shift_score_notcon + i * step_score];
        }
      qkv_g[shift_v] = sum;
     }
//--- Calculating Query's gradients
     {
      int shift_q = dimension * (u + h * units) + d;
      int shift_out_con = dimension * (h * units + u) + d;
      int shift_out_notcon = dimension * (u + units * (h + heads)) + d;
      int shift_score_con = units * h;
      int shift_score_notcon = units * (heads + h);
      int shift_v = dimension * (h * units + 2 * heads * units);
      float grad = 0;
      //---
      for(int k = 0; k < units; k++)
        {
         int shift_k = (k + h * units + heads * units) + d;
         float sc_g = 0;
         float sc_con = scores[shift_score_con + k];
         float sc_notcon = scores[shift_score_notcon + k];
         for(int v = 0; v < units; v++)
            for(int dim = 0; dim < dimension; dim++)
              {
               sc_g += scores[shift_score_con + v] * qkv[shift_v + v * dimension + dim] * gradient[shift_out_con + dim] * ((float)(k == v) - sc_con);
               sc_g += scores[shift_score_notcon + v] * qkv[shift_v + v * dimension + dim] * gradient[shift_out_notcon + dim] * ((float)(k == v) - sc_notcon);
              }
         grad += sc_g * qkv[shift_k];
        }
      qkv_g[shift_q] = grad;
     }
//--- Calculating Key's gradients
     {
      int shift_k = (u + (h + heads) * units) + d;
      int shift_out_con = dimension * h * units + d;
      int shift_out_notcon = dimension * units * (h + heads) + d;
      int shift_score_con = units * h + u;
      int shift_score_notcon = units * (heads + h) + u;
      int step_score = units * 2 * heads;
      int shift_v = dimension * (h * units + 2 * heads * units);
      float grad = 0;
      //---
      for(int q = 0; q < units; q++)
        {
         int shift_q = dimension * (q + h * units) + d;
         float sc_g = 0;
         float sc_con = scores[shift_score_con + u + q * step_score];
         float sc_notcon = scores[shift_score_notcon + u + q * step_score];
         for(int g = 0; g < units; g++)
           {
            for(int dim = 0; dim < dimension; dim++)
              {
               sc_g += scores[shift_score_con + g] * qkv[shift_v + u * dimension + dim] * gradient[shift_out_con + g * dimension + dim] * ((float)(u == g) - sc_con);
               sc_g += scores[shift_score_notcon + g] * qkv[shift_v + u * dimension + dim] * gradient[shift_out_notcon + g * dimension + dim] * ((float)(u == g) - sc_notcon);
              }
           }
         grad += sc_g * qkv[shift_q];
        }
      qkv_g[shift_k] = grad;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FeedForwardNODEF(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
                               __global float *matrix_i, ///<[in] Inputs tensor
                               __global float *matrix_o, ///<[out] Output tensor
                               int dimension,            ///< input dimension
                               float step,               ///< time step h
                               int activation            ///< Activation type (#ENUM_ACTIVATION)
                              )
  {
   int d = get_global_id(0);
   int dimension_out = get_global_size(0);
   int v = get_global_id(1);
   int variables = get_global_size(1);
   int i = get_global_id(2);
   int lenth = get_global_size(2);
//---
   int shift = variables * i + v;
   int input_shift = shift * dimension;
   int output_shift = shift * dimension_out + d;
   int weight_shift = (v * dimension_out + d) * (dimension + 2);
//---
   float sum = matrix_w[dimension + 1 + weight_shift] + matrix_w[dimension + weight_shift] * step;
//---
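//--- Note: together with the loop below this evaluates the learned ODE function
//--- f(x, t) = sum_w W[w] * x[w] + W[dimension] * step + W[dimension + 1],
//--- i.e. a linear map of the state plus an explicit time term and a bias,
//--- passed through the activation.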
   for(int w = 0; w < dimension; w++)
      sum += matrix_w[w + weight_shift] * matrix_i[input_shift + w];
//---
   if(isnan(sum))
      sum = 0;
//---
   matrix_o[output_shift] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FeedForwardNODEInpK(__global float *matrix_i,    ///<[in] Inputs tensor
                                  __global float *matrix_k1,   ///<[in] K1 tensor
                                  __global float *matrix_k2,   ///<[in] K2 tensor
                                  __global float *matrix_k3,   ///<[in] K3 tensor
                                  __global float *matrix_k4,   ///<[in] K4 tensor
                                  __global float *matrix_k5,   ///<[in] K5 tensor
                                  __global float *matrix_k6,   ///<[in] K6 tensor
                                  __global float *matrix_beta, ///<[in] beta tensor
                                  __global float *matrix_o     ///<[out] Output tensor
                                 )
  {
   int i = get_global_id(0);
//---
   float sum = matrix_i[i];
//---
   for(int b = 0; b < 6; b++)
     {
      float beta = matrix_beta[b];
      if(beta == 0.0f || isnan(beta))
         continue;
      //---
      float val = 0.0f;
      switch(b)
        {
         case 0:
            val = matrix_k1[i];
            break;
         case 1:
            val = matrix_k2[i];
            break;
         case 2:
            val = matrix_k3[i];
            break;
         case 3:
            val = matrix_k4[i];
            break;
         case 4:
            val = matrix_k5[i];
            break;
         case 5:
            val = matrix_k6[i];
            break;
        }
      if(val == 0.0f || isnan(val))
         continue;
      //---
      sum += val * beta;
     }
//---
   matrix_o[i] = sum;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void HiddenGradientNODEInpK(__global float *matrix_ig,   ///<[out] Input gradient tensor
                                     __global float *matrix_k1g,  ///<[out] K1 gradient tensor
                                     __global float *matrix_k2g,  ///<[out] K2 gradient tensor
                                     __global float *matrix_k3g,  ///<[out] K3 gradient tensor
                                     __global float *matrix_k4g,  ///<[out] K4 gradient tensor
                                     __global float *matrix_k5g,  ///<[out] K5 gradient tensor
                                     __global float *matrix_k6g,  ///<[out] K6 gradient tensor
                                     __global float *matrix_beta, ///<[in] beta tensor
                                     __global float *matrix_og    ///<[in] Output gradient tensor
                                    )
  {
   int i = get_global_id(0);
//---
   float grad = IsNaNOrInf(matrix_og[i], 0);
   matrix_ig[i] = grad;
//---
   for(int b = 0; b < 6; b++)
     {
      float beta = IsNaNOrInf(matrix_beta[b], 0.0f);
      //---
      float val = IsNaNOrInf(beta * grad, 0.0f);
      switch(b)
        {
         case 0:
            matrix_k1g[i] = val;
            break;
         case 1:
            matrix_k2g[i] = val;
            break;
         case 2:
            matrix_k3g[i] = val;
            break;
         case 3:
            matrix_k4g[i] = val;
            break;
         case 4:
            matrix_k5g[i] = val;
            break;
         case 5:
            matrix_k6g[i] = val;
            break;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void HiddenGradientNODEF(__global float *matrix_w,  ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
                                  __global float *matrix_g,  ///<[in] Gradient tensor
                                  __global float *matrix_i,  ///<[in] Inputs tensor
                                  __global float *matrix_ig, ///<[out] Inputs Gradient tensor
                                  int dimension_out,         ///< output dimension
                                  int activation             ///< Input Activation type (#ENUM_ACTIVATION)
                                 )
  {
   int d = get_global_id(0);
   int dimension = get_global_size(0);
   int v = get_global_id(1);
   int variables = get_global_size(1);
   int i = get_global_id(2);
   int lenth = get_global_size(2);
//---
   int shift = variables * i + v;
   int input_shift = shift * dimension + d;
   int output_shift = shift * dimension_out;
   int weight_step = (dimension + 2);
   int weight_shift = (v * dimension_out) * weight_step + d;
//---
   float sum = 0;
//---
   for(int k = 0; k < dimension_out; k++)
      sum += matrix_g[output_shift + k] * matrix_w[weight_shift + k * weight_step];
   if(isnan(sum))
      sum = 0;
//---
   float out = matrix_i[input_shift];
//---
   matrix_ig[input_shift] = Deactivation(sum, out, activation);
  }
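//--- Note: the six K tensors, the beta combination in FeedForwardNODEInpK and
//--- the per-stage alpha coefficients used below are consistent with an
//--- explicit multi-stage (Runge-Kutta style) ODE solver; the kernel below
//--- accumulates the weight gradient over the whole sequence and all six
//--- stages: input columns use the stage inputs, the time column is weighted
//--- by alpha, and the last column is the bias.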
matrix_ig[input_shift] = Deactivation(sum, out, activation);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void NODEF_UpdateWeightsAdam(__global float *matrix_w,             ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                      __global const float *matrix_gk1,     ///<[in] Tensor of gradients at k1
                                      __global const float *matrix_gk2,     ///<[in] Tensor of gradients at k2
                                      __global const float *matrix_gk3,     ///<[in] Tensor of gradients at k3
                                      __global const float *matrix_gk4,     ///<[in] Tensor of gradients at k4
                                      __global const float *matrix_gk5,     ///<[in] Tensor of gradients at k5
                                      __global const float *matrix_gk6,     ///<[in] Tensor of gradients at k6
                                      __global const float *matrix_ik1,     ///<[in] Inputs tensor
                                      __global const float *matrix_ik2,     ///<[in] Inputs tensor
                                      __global const float *matrix_ik3,     ///<[in] Inputs tensor
                                      __global const float *matrix_ik4,     ///<[in] Inputs tensor
                                      __global const float *matrix_ik5,     ///<[in] Inputs tensor
                                      __global const float *matrix_ik6,     ///<[in] Inputs tensor
                                      __global float *matrix_m,             ///<[in,out] Matrix of first momentum
                                      __global float *matrix_v,             ///<[in,out] Matrix of second momentum
                                      __global const float *alpha,          ///< Alpha coefficients
                                      const int lenth,                      ///< Number of inputs
                                      const float l,                        ///< Learning rate
                                      const float b1,                       ///< First momentum multiplier
                                      const float b2                        ///< Second momentum multiplier
                                     )
  {
   const int d_in = get_global_id(0);
   const int dimension_in = get_global_size(0);
   const int d_out = get_global_id(1);
   const int dimension_out = get_global_size(1);
   const int v = get_global_id(2);
   const int variables = get_global_size(2);
//--- each work item updates one weight, so the shift includes the input index
   const int weight_shift = (v * dimension_out + d_out) * dimension_in + d_in;
   const int input_step = variables * (dimension_in - 2);
   const int input_shift = v * (dimension_in - 2) + d_in;
   const int output_step = variables * dimension_out;
   const int output_shift = v * dimension_out + d_out;
//---
   float weight = matrix_w[weight_shift];
   float g = 0;
//--- accumulate the gradient over the sequence; the last two weight columns
//--- correspond to the bias (case 1) and the alpha coefficients (case 2)
   for(int i = 0; i < lenth; i++)
     {
      int shift_g = i * output_step + output_shift;
      int shift_i = i * input_step + input_shift;
      switch(dimension_in - d_in)
        {
         case 1:
            g += matrix_gk1[shift_g] + matrix_gk2[shift_g] + matrix_gk3[shift_g] +
                 matrix_gk4[shift_g] + matrix_gk5[shift_g] + matrix_gk6[shift_g];
            break;
         case 2:
            g += matrix_gk1[shift_g] * alpha[0] + matrix_gk2[shift_g] * alpha[1] +
                 matrix_gk3[shift_g] * alpha[2] + matrix_gk4[shift_g] * alpha[3] +
                 matrix_gk5[shift_g] * alpha[4] + matrix_gk6[shift_g] * alpha[5];
            break;
         default:
            g += matrix_gk1[shift_g] * matrix_ik1[shift_i] + matrix_gk2[shift_g] * matrix_ik2[shift_i] +
                 matrix_gk3[shift_g] * matrix_ik3[shift_i] + matrix_gk4[shift_g] * matrix_ik4[shift_i] +
                 matrix_gk5[shift_g] * matrix_ik5[shift_i] + matrix_gk6[shift_g] * matrix_ik6[shift_i];
            break;
        }
     }
//--- Adam update: m = b1*m + (1-b1)*g; v = b2*v + (1-b2)*g^2
   float mt = b1 * matrix_m[weight_shift] + (1 - b1) * g;
   float vt = b2 * matrix_v[weight_shift] + (1 - b2) * (g * g);
   float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(fabs(delta) > 0)
      matrix_w[weight_shift] = weight + delta;
   matrix_m[weight_shift] = mt;
   matrix_v[weight_shift] = vt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void TimeDerivative(__global float *qkv, __global float *dqkv,
                             int dimension)
  {
   const size_t pos = get_global_id(0);
   const size_t variable =
get_global_id(1);
   const size_t head = get_global_id(2);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
   const size_t heads = get_global_size(2);
//---
   const int shift = 3 * heads * variables * dimension;
   const int shift_query = pos * shift + (3 * variable * heads + head) * dimension;
   const int shift_key = shift_query + heads * dimension;
//--- central difference over the time axis where both neighbours exist
   for(int i = 0; i < dimension; i++)
     {
      //--- dQ/dt
        {
         int count = 0;
         float delta = 0;
         float value = qkv[shift_query + i];
         if(pos > 0)
           {
            delta = value - qkv[shift_query + i - shift];
            count++;
           }
         if(pos < (total - 1))
           {
            delta += qkv[shift_query + i + shift] - value;
            count++;
           }
         if(count > 0)
            dqkv[shift_query + i] = delta / count;
        }
      //--- dK/dt
        {
         int count = 0;
         float delta = 0;
         float value = qkv[shift_key + i];
         if(pos > 0)
           {
            delta = value - qkv[shift_key + i - shift];
            count++;
           }
         if(pos < (total - 1))
           {
            delta += qkv[shift_key + i + shift] - value;
            count++;
           }
         if(count > 0)
            dqkv[shift_key + i] = delta / count;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void HiddenGradientTimeDerivative(__global float *qkv_g,
                                           __global float *dqkv_g,
                                           int dimension)
  {
   const size_t pos = get_global_id(0);
   const size_t variable = get_global_id(1);
   const size_t head = get_global_id(2);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
   const size_t heads = get_global_size(2);
//---
   const int shift = 3 * heads * variables * dimension;
   const int shift_query = pos * shift + (3 * variable * heads + head) * dimension;
   const int shift_key = shift_query + heads * dimension;
//--- the adjoint of the central difference is applied symmetrically to Q and K
   for(int i = 0; i < dimension; i++)
     {
      //--- dQ/dt
        {
         int count = 0;
         float grad = 0;
         float current = dqkv_g[shift_query + i];
         if(pos > 0)
           {
            grad += current - dqkv_g[shift_query + i - shift];
            count++;
           }
         if(pos < (total - 1))
           {
            grad += dqkv_g[shift_query + i + shift] - current;
            count++;
           }
         if(count > 0)
            grad /= count;
         qkv_g[shift_query + i] += grad;
        }
      //--- dK/dt
        {
         int count = 0;
         float grad = 0;
         float current = dqkv_g[shift_key + i];
         if(pos > 0)
           {
            grad += current - dqkv_g[shift_key + i - shift];
            count++;
           }
         if(pos < (total - 1))
           {
            grad += dqkv_g[shift_key + i + shift] - current;
            count++;
           }
         if(count > 0)
            grad /= count;
         qkv_g[shift_key + i] += grad;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FeedForwardContAtt(__global float *qkv, __global float *dqkv,
                                 __global float *score, __global float *out,
                                 int dimension, int heads)
  {
   const size_t query = get_global_id(0);
   const size_t key = get_global_id(1);
   const size_t variable = get_global_id(2);
   const size_t queris = get_global_size(0);
   const size_t keis = get_global_size(1);
   const size_t variables = get_global_size(2);
//---
   const uint ls_score = min((uint)keis, (uint)LOCAL_ARRAY_SIZE);
   __local float local_score[LOCAL_ARRAY_SIZE];
//---
   for(int head = 0; head < heads; head++)
     {
      const int shift = 3 * heads * variables * dimension;
      const int shift_query = query * shift + (3 * variable * heads + head) * dimension;
      const int shift_key = key * shift + (3 * variable * heads + heads + head) * dimension;
      const int shift_out = dimension * (heads * (query * variables + variable) + head);
      int shift_score = keis * (heads * (query * variables + variable) + head) + key;
      //--- Score
      float scr = 0;
      for(int d = 0; d < dimension; d++)
         scr +=
qkv[shift_query + d] * dqkv[shift_key + d] + qkv[shift_key + d] * dqkv[shift_query + d];
      scr = exp(min(scr / sqrt((float)dimension), 30.0f));
      score[shift_score] = scr;
      BarrierLoc
      //--- each of the first ls_score threads sums its strided slice of the score row
      if(key < ls_score)
        {
         local_score[key] = scr;
         for(int k = ls_score + key; k < keis; k += ls_score)
            local_score[key] += score[shift_score - key + k];
        }
      BarrierLoc
      //---
      int count = ls_score;
      do
        {
         count = (count + 1) / 2;
         if(key < count)
           {
            if((key + count) < ls_score)
              {
               local_score[key] += local_score[key + count];
               local_score[key + count] = 0;
              }
           }
         BarrierLoc
        }
      while(count > 1);
      //--- normalize the scores of the current query row
      score[shift_score] /= local_score[0];
      BarrierLoc
      //---
      shift_score -= key;
      for(int d = key; d < dimension; d += keis)
        {
         float sum = 0;
         int shift_value = (3 * variable * heads + 2 * heads + head) * dimension + d;
         for(int v = 0; v < keis; v++)
            sum += qkv[shift_value + v * shift] * score[shift_score + v];
         out[shift_out + d] = sum;
        }
      BarrierLoc
     }
//---
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void HiddenGradientContAtt(__global float *qkv, __global float *qkv_g,
                                    __global float *dqkv, __global float *dqkv_g,
                                    __global float *score, __global float *out_g,
                                    const int dimension)
  {
   const size_t pos = get_global_id(0);
   const size_t variable = get_global_id(1);
   const size_t head = get_global_id(2);
   const size_t total = get_global_size(0);
   const size_t variables = get_global_size(1);
   const size_t heads = get_global_size(2);
//--- Value gradient
     {
      const int shift_value = dimension * (heads * (3 * variables * pos + 3 * variable + 2) + head);
      const int shift_out = dimension * (head + variable * heads);
      const int shift_score = total * (variable * heads + head);
      const int step_out = variables * heads * dimension;
      const int step_score = variables * heads * total;
      //--- the value at position pos is weighted by column pos of every query row
      for(int d = 0; d < dimension; d++)
        {
         float sum = 0;
         for(int g = 0; g < total; g++)
            sum += out_g[shift_out + g * step_out + d] * score[shift_score + g * step_score + pos];
         qkv_g[shift_value + d] = sum;
        }
     }
//--- Query gradient
     {
      const int shift_out = dimension * (heads * (pos * variables + variable) + head);
      const int step = 3 * variables * heads * dimension;
      const int shift_query = dimension * (3 * heads * variable + head) + pos * step;
      const int shift_key = dimension * (heads * (3 * variable + 1) + head);
      const int shift_value = dimension * (heads * (3 * variable + 2) + head);
      const int shift_score = total * (heads * (pos * variables + variable) + head);
      //--- Score gradient: softmax Jacobian ds_v/dz_k = s_v*((k==v) - s_k)
      for(int k = 0; k < total; k++)
        {
         float score_grad = 0;
         float scr = score[shift_score + k];
         for(int v = 0; v < total; v++)
           {
            float grad = 0;
            for(int d = 0; d < dimension; d++)
               grad += qkv[shift_value + v * step + d] * out_g[shift_out + d];
            score_grad += score[shift_score + v] * grad * ((float)(k == v) - scr);
           }
         score_grad /= sqrt((float)dimension);
         //--- Query gradient
         for(int d = 0; d < dimension; d++)
           {
            if(k == 0)
              {
               dqkv_g[shift_query + d] = score_grad * qkv[shift_key + k * step + d];
               qkv_g[shift_query + d] = score_grad * dqkv[shift_key + k * step + d];
              }
            else
              {
               dqkv_g[shift_query + d] += score_grad * qkv[shift_key + k * step + d];
               qkv_g[shift_query + d] += score_grad * dqkv[shift_key + k * step + d];
              }
           }
        }
     }
//--- Key gradient
     {
      const int shift_key = dimension * (heads * (3 * variables * pos + 3 * variable + 1) + head);
      const int shift_out = dimension * (head + variable * heads);
      const int step_out = variables * heads * dimension;
      const int step = 3 * variables * heads * dimension;
      const int shift_query = dimension * (3 * heads * variable + head);
      const int shift_value = dimension * (heads * (3 * variable + 2) + head);
      const int shift_score = total * (heads * variable + head);
      const int step_score = variables * heads * total;
      //--- Score gradient: for the key at pos, ds_{q,g}/dz_{q,pos} = s_{q,g}*((g==pos) - s_{q,pos})
      for(int q = 0; q < total; q++)
        {
         float score_grad = 0;
         float scr = score[shift_score + q * step_score + pos];
         for(int g = 0; g < total; g++)
           {
            float grad = 0;
            for(int d = 0; d < dimension; d++)
               grad += qkv[shift_value + g * step + d] * out_g[shift_out + d + q * step_out] / sqrt((float)dimension);
            score_grad += score[shift_score + q * step_score + g] * grad * ((float)(g == pos) - scr);
           }
         //--- Key gradient
         for(int d = 0; d < dimension; d++)
           {
            if(q == 0)
              {
               dqkv_g[shift_key + d] = qkv[shift_query + q * step + d] * score_grad;
               qkv_g[shift_key + d] = score_grad * dqkv[shift_query + q * step + d];
              }
            else
              {
               qkv_g[shift_key + d] += score_grad * dqkv[shift_query + q * step + d];
               dqkv_g[shift_key + d] += score_grad * qkv[shift_query + q * step + d];
              }
           }
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void RevInFeedForward(__global float *inputs, __global float *options,
                               __global float *output, int options_size,
                               int optimization)
  {
   int n = get_global_id(0);
   int shift = (n * (optimization == 0 ? 7 : 9)) % options_size;
//---
   float mean = options[shift];
   float variance = options[shift + 1];
   float k = options[shift + 3];
//--- restore the original distribution: scale by the stored std and shift by the mean
   float res = sqrt(variance) * (inputs[n] - options[shift + 4]) / fmax(k, 0.001f) + mean;
   if(isnan(res))
      res = 0;
//---
   output[n] = res;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void RevInHiddenGraddient(__global float *inputs, __global float *inputs_gr,
                                   __global float *options, __global float *output_gr,
                                   int options_size, int optimization, int activation)
  {
   int n = get_global_id(0);
   int shift = (n * (optimization == 0 ?
7 : 9)) % options_size; //--- float variance = options[shift + 1]; float inp = inputs[n]; float k = options[shift + 3]; //--- float res = sqrt(variance) * output_gr[n]; if(fabs(k) > 1) res /= k; if(isnan(res)) res = 0; //--- inputs_gr[n] = Deactivation(res, inp, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void Activation(__global const float *inputs, __global float *outputs, const int activation) { int n = get_global_id(0); //--- float res = IsNaNOrInf(inputs[n], 0); //--- outputs[n] = fActivation(res, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DeActivation(__global const float *inputs, __global float *inputs_gr, __global const float *output_gr, const int activation) { int n = get_global_id(0); //--- float inp = inputs[n]; float res = IsNaNOrInf(output_gr[n], 0); //--- inputs_gr[n] = Deactivation(res, inp, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void PatchCreate(__global float *inputs, __global float *weights, __global float *outputs, int inputs_total, int window_in, int step, int activation ) { const int i = get_global_id(0); const int w = get_global_id(1); const int v = get_global_id(2); const int window_out = get_global_size(1); const int variables = get_global_size(2); //--- const int shift_in = i * step * variables + v; const int shift_out = (i * variables + v) * window_out + w; const int shift_weights = (window_in + 1) * (v * window_out + w); //--- float res = weights[shift_weights + window_in]; //--- for(int p = 0; p < window_in; p++) if((shift_in + p * variables) < inputs_total) res += inputs[shift_in + p * variables] * weights[shift_weights + p]; if(isnan(res)) res = 0; //--- outputs[shift_out] = fActivation(res, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void PatchHiddenGradient(__global float *inputs, __global float *inputs_gr, __global float *weights, __global float *outputs_gr, int window_in, int step, int window_out, int outputs_total, int activation ) { const int i = get_global_id(0); const int v = get_global_id(1); const int variables = get_global_size(1); //--- const int w_start = i % step; const int r_start = max((i - window_in + step) / step, 0); int total = (window_in - w_start + step - 1) / step; total = min((i + step) / step, total); //--- float grad = 0; //--- for(int p = 0; p < total; p ++) { int row = r_start + p; if(row >= outputs_total) break; for(int wo = 0; wo < window_out; wo++) { int shift_g = (row * variables + v) * window_out + wo; int shift_w = v * (window_in + 1) * window_out + w_start + (total - p - 1) * step + wo * (window_in + 1); grad += outputs_gr[shift_g] * weights[shift_w]; } } //--- float inp = inputs[i * variables + v]; //--- inputs_gr[i * variables + v] = Deactivation(grad, inp, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void PatchUpdateWeightsAdam(__global float *weights, __global const float *outputs_gr, __global const float *inputs, __global float *weights_m, __global float *weights_v, const int 
inputs_total, const float l, const float b1, const float b2, int step
                                    )
  {
   const int c = get_global_id(0);
   const int r = get_global_id(1);
   const int v = get_global_id(2);
   const int window_in = get_global_size(0) - 1;
   const int window_out = get_global_size(1);
   const int variables = get_global_size(2);
//---
   const int start_input = c * variables + v;
   const int step_input = step * variables;
   const int start_out = v * window_out + r;
   const int step_out = variables * window_out;
   const int total = inputs_total / (variables * step);
//--- accumulate the gradient over all patches; the last weight row is the bias
   float grad = 0;
   for(int p = 0; p < total; p++)
     {
      int i = start_input + p * step_input;
      int o = start_out + p * step_out;
      grad += (c == window_in ? 1 : inputs[i]) * outputs_gr[o];
     }
   if(isnan(grad))
      grad = 0;
//---
   const int shift_weights = (window_in + 1) * (window_out * v + r) + c;
//---
   float weight = weights[shift_weights];
   float mt = b1 * weights_m[shift_weights] + (1 - b1) * grad;
   float vt = b2 * weights_v[shift_weights] + (1 - b2) * (grad * grad);
   float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(fabs(delta) > 0)
      weights[shift_weights] = weight + delta;
   weights_m[shift_weights] = mt;
   weights_v[shift_weights] = vt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void MatMult(__global const float *matr1, __global const float *matr2,
                      __global float *result, int dimension, int multvarsecond)
  {
   size_t row = get_global_id(0);
   size_t col = get_global_id(1);
   size_t var = get_global_id(2);
   size_t rows = get_global_size(0);
   size_t cols = get_global_size(1);
//---
   int shift1 = RCtoFlat(row, 0, rows, dimension, var);
   int shift2 = RCtoFlat(0, col, dimension, cols, multvarsecond * var);
   int shift_out = RCtoFlat(row, col, rows, cols, var);
//---
   float res = 0;
   for(int i = 0; i < dimension; i++)
      res += IsNaNOrInf(matr1[shift1 + i] * matr2[shift2 + i * cols], 0);
//---
   result[shift_out] = res;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void MatMultGrad(__global const float *matr1, __global float *matr1_gr,
                          __global const float *matr2, __global float *matr2_gr,
                          __global const float *result_gr, int dimension,
                          int multvarsecond)
  {
   size_t row = get_global_id(0);
   size_t col = get_global_id(1);
   size_t var = get_global_id(2);
   size_t rows = get_global_size(0);
   size_t cols = get_global_size(1);
//---
   int shift1 = (row + var * rows) * dimension;
   int shift2 = var * dimension * cols * multvarsecond;
   int shift_out = (row + var * rows) * cols;
//--- gradient of the first matrix: each work item covers columns col, col+cols, ...
   for(int c = 0; c < dimension; c += cols)
     {
      if((c + col) >= dimension)
         continue;
      float grad = 0;
      for(int i = 0; i < cols; i++)
         grad += IsNaNOrInf(result_gr[shift_out + i] * matr2[shift2 + (c + col) * cols + i], 0);
      matr1_gr[shift1 + c + col] = IsNaNOrInf(grad, 0);
     }
//--- gradient of the second matrix: each work item covers rows row, row+rows, ...
   shift_out = var * rows * cols + col;
   for(int r = 0; r < dimension; r += rows)
     {
      if((r + row) >= dimension)
         continue;
      shift1 = var * rows * dimension + r + row;
      float grad = 0;
      for(int i = 0; i < rows; i++)
         grad += IsNaNOrInf(result_gr[shift_out + i * cols] * matr1[shift1 + i * dimension], 0);
      matr2_gr[shift2 + col + (r + row) * cols] = IsNaNOrInf(grad, 0);
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void FFT(__global float *inputs_re, __global float *inputs_im,
                  __global float *outputs_re, __global float *outputs_im,
                  const int input_window, const int input_complex,
                  const int output_window, const int reverse
                 )
  {
   size_t variable = get_global_id(0);
//---
   const ulong N = output_window;
   const ulong N2 = N / 2;
   const ulong inp_shift = input_window * variable;
   const ulong out_shift = output_window * variable;
//--- bit-reversal permutation; positions beyond the input window are zero-padded
   uint target = 0;
   for(uint position = 0; position < N; position++)
     {
      if(target > position)
        {
         outputs_re[out_shift + position] = (target < input_window ? inputs_re[inp_shift + target] : 0);
         outputs_im[out_shift + position] = ((target < input_window && input_complex) ? inputs_im[inp_shift + target] : 0);
         outputs_re[out_shift + target] = (position < input_window ? inputs_re[inp_shift + position] : 0);
         outputs_im[out_shift + target] = ((position < input_window && input_complex) ? inputs_im[inp_shift + position] : 0);
        }
      else
         if(target == position)
           {
            outputs_re[out_shift + position] = (position < input_window ? inputs_re[inp_shift + position] : 0);
            outputs_im[out_shift + position] = ((position < input_window && input_complex) ? inputs_im[inp_shift + position] : 0);
           }
      unsigned int mask = N;
      while(target & (mask >>= 1))
         target &= ~mask;
      target |= mask;
     }
   float real = 0, imag = 0;
//--- radix-2 Cooley-Tukey butterflies
   for(int len = 2; len <= (int)N; len <<= 1)
     {
      float w_real = (float)cos(2 * M_PI_F / len);
      float w_imag = (float)sin(2 * M_PI_F / len);
      for(int i = 0; i < (int)N; i += len)
        {
         float cur_w_real = 1;
         float cur_w_imag = 0;
         for(int j = 0; j < len / 2; j++)
           {
            real = cur_w_real * outputs_re[out_shift + i + j + len / 2] - cur_w_imag * outputs_im[out_shift + i + j + len / 2];
            imag = cur_w_imag * outputs_re[out_shift + i + j + len / 2] + cur_w_real * outputs_im[out_shift + i + j + len / 2];
            outputs_re[out_shift + i + j + len / 2] = outputs_re[out_shift + i + j] - real;
            outputs_im[out_shift + i + j + len / 2] = outputs_im[out_shift + i + j] - imag;
            outputs_re[out_shift + i + j] += real;
            outputs_im[out_shift + i + j] += imag;
            real = cur_w_real * w_real - cur_w_imag * w_imag;
            cur_w_imag = cur_w_imag * w_real + cur_w_real * w_imag;
            cur_w_real = real;
           }
        }
     }
//--- inverse transform: swap mirrored bins of the current variable and scale by 1/N
   if(reverse)
     {
      outputs_re[out_shift] /= N;
      outputs_im[out_shift] /= N;
      outputs_re[out_shift + N2] /= N;
      outputs_im[out_shift + N2] /= N;
      for(int i = 1; i < N2; i++)
        {
         real = outputs_re[out_shift + i] / N;
         imag = outputs_im[out_shift + i] / N;
         outputs_re[out_shift + i] = outputs_re[out_shift + N - i] / N;
         outputs_im[out_shift + i] = outputs_im[out_shift + N - i] / N;
         outputs_re[out_shift + N - i] = real;
         outputs_im[out_shift + N - i] = imag;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ComplexLayer(__global float *inputs_re, __global float *inputs_im,
                           __global float *outputs_re, __global float *outputs_im
                          )
  {
   size_t i = get_global_id(0);
   size_t j = get_global_id(1);
   size_t total_i = get_global_size(0);
   size_t total_j = get_global_size(1);
   uint shift = i * total_j + j;
//---
   outputs_re[shift] = inputs_re[shift] - inputs_im[shift];
   outputs_im[shift] = inputs_im[shift] + inputs_re[shift];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ComplexLayerGradient(__global float *inputs_re, __global float *inputs_im,
                                   __global float *outputs_re, __global float *outputs_im
                                  )
  {
   size_t i = get_global_id(0);
   size_t j = get_global_id(1);
   size_t total_i = get_global_size(0);
   size_t total_j = get_global_size(1);
   uint shift = i * total_j + j;
//---
   inputs_re[shift] = outputs_re[shift] + outputs_im[shift];
   inputs_im[shift] = outputs_im[shift] - outputs_re[shift];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel
void GradientMSA(__global float *matrix_t, ///<[in] Target tensor __global float *matrix_o, ///<[in] Forecast tensor __global float *matrix_g ///<[out] Tensor of gradients ) { int i = get_global_id(0); matrix_g[i] = matrix_t[i] - matrix_o[i]; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CumulativeGradient(__global float *gradient1, __global float *gradient2, __global float *gradient_out, float alpha ) { int i = get_global_id(0); gradient_out[i] = alpha * gradient1[i] + (1 - alpha) * gradient2[i]; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ inline float2 ComplexMul(const float2 a, const float2 b) { float2 result = 0; result.x = IsNaNOrInf(a.x * b.x - a.y * b.y, 0); result.y = IsNaNOrInf(a.x * b.y + a.y * b.x, 0); return result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ inline float2 ComplexDiv(const float2 a, const float2 b) { float2 result = 0; float z = IsNaNOrInf(b.x * b.x + b.y * b.y, 1); if(z > 0) { result.x = IsNaNOrInf(a.x * b.x + a.y * b.y, 0) / z; result.y = IsNaNOrInf(a.y * b.x - a.x * b.y, 0) / z; } return result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ inline float ComplexAbs(float2 a) { return sqrt(IsNaNOrInf(a.x * a.x + a.y * a.y, 0)); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ inline float2 ComplexSqrt(float2 a) { float2 result = 0; float z = ComplexAbs(a); result.x = sqrt((z + IsNaNOrInf(a.x, 0)) / 2); result.y = sqrt((z - IsNaNOrInf(a.x, 0)) / 2); if(a.y < 0) result.y *= (-1); //--- return result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ inline float2 ComplexExp(float2 a) { float2 result = exp(clamp(IsNaNOrInf(a.x, 0), -20.0f, 20.0f)); result.x *= IsNaNOrInf(cos(a.y), 0); result.y *= IsNaNOrInf(sin(a.y), 0); return result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ inline float2 ComplexTanh(float2 a) { float sinh_re = sinh(a.x); float cosh_re = cosh(a.x); float sin_im = sin(a.y); float cos_im = cos(a.y); //--- float2 sinh_a = 0; float2 cosh_a = 0; sinh_a.x = sinh_re * cos_im; sinh_a.y = cosh_re * sin_im; cosh_a.x = cosh_re * cos_im; cosh_a.y = sinh_re * sin_im; //--- return ComplexDiv(sinh_a, cosh_a); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardComplexConv(__global const float2* __attribute__((aligned(8))) matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input ///< window and n - output window __global const float2* __attribute__((aligned(8))) matrix_i, ///<[in] Inputs tensor __global float2* __attribute__((aligned(8))) matrix_o, ///<[out] Output tensor const int inputs, ///< Number of inputs const int step, ///< Step size const int window_in, ///< Size of input window const int activation ///< Activation type (#ENUM_ACTIVATION) ) { const size_t i = get_global_id(0); const size_t units = 
get_global_size(0); const size_t out = get_global_id(1); const size_t w_out = get_global_size(1); const size_t var = get_global_id(2); const size_t variables = get_global_size(2); //--- int w_in = window_in; int shift_out = w_out * (i + units * var); int shift_in = step * i; int shift = (w_in + 1) * (out + var * w_out); int stop = (w_in <= (inputs - shift_in) ? w_in : (inputs - shift_in)); shift_in += + inputs * var; //--- float2 sum = ComplexMul((float2)(1, 0), matrix_w[shift + w_in]); //--- for(int k = 0; k < stop; k ++) sum += IsNaNOrInf2(ComplexMul(matrix_i[shift_in + k], matrix_w[shift + k]), (float2)0); //--- switch(activation) { case 0: sum = ComplexTanh(sum); break; case 1: sum = ComplexDiv((float2)(1, 0), (float2)(1, 0) + ComplexExp(-sum)); break; case 2: if(sum.x < 0) { sum.x *= 0.01f; sum.y *= 0.01f; } break; default: break; } matrix_o[out + shift_out] = sum; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcHiddenGradientComplexConv(__global const float2* __attribute__((aligned(8))) matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input ///< window and n - output window __global const float2* __attribute__((aligned(8))) matrix_g, ///<[in] Tensor of gradients at current layer __global const float2* __attribute__((aligned(8))) matrix_o, ///<[in] Output tensor __global float2* __attribute__((aligned(8))) matrix_ig, ///<[out] Tensor of gradients at previous layer const int outputs, ///< Number of outputs const int step, ///< Step size const int window_in, ///< Size of input window const int window_out, ///< Size of output window const int activation, ///< Activation type (#ENUM_ACTIVATION) const int shift_out ///< Shift in output and gradient buffer ) { const size_t i = get_global_id(0); const size_t inputs = get_global_size(0); const size_t var = get_global_id(1); const size_t variables = get_global_size(1); //--- float2 sum = (float2)0; float2 out = matrix_o[i]; int start = i - window_in + step; start = max((start - start % step) / step, 0) + var * inputs; int stop = (i + step - 1) / step; if(stop > (outputs / window_out)) stop = outputs / window_out; stop += var * outputs; //--- for(int h = 0; h < window_out; h ++) { for(int k = start; k < stop; k++) { int shift_g = k * window_out + h; int shift_w = (stop - k - 1) * step + i % step + h * (window_in + 1); if(shift_g >= outputs || shift_w >= (window_in + 1) * window_out) break; sum += ComplexMul(matrix_g[shift_out + shift_g], matrix_w[shift_w]); } } sum = IsNaNOrInf2(sum, (float2)0); //--- switch(activation) { case 0: sum = ComplexMul(sum, (float2)1.0f - ComplexMul(out, out)); break; case 1: sum = ComplexMul(sum, ComplexMul(out, (float2)1.0f - out)); break; case 2: if(out.x < 0.0f) { sum.x *= 0.01f; sum.y *= 0.01f; } break; default: break; } matrix_ig[i] = clamp(sum, (float2)(-MAX_GRAD), (float2)MAX_GRAD); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void UpdateWeightsComplexConvMomentum(__global float2* __attribute__((aligned(8))) matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - ///< input window and n - output window __global float2* __attribute__((aligned(8))) matrix_g, ///<[in] Tensor of gradients at current layer __global float2* __attribute__((aligned(8))) matrix_i, ///<[in] Inputs tensor __global float2* __attribute__((aligned(8))) matrix_dw, ///<[in,out] Matrix of delta weights in 
last correction
                                              int inputs,           ///< Number of inputs
                                              float learning_rates, ///< Learning rate
                                              float momentum,       ///< Momentum multiplier
                                              int window_in,        ///< Size of input window
                                              int window_out,       ///< Size of output window
                                              int step              ///< Step size
                                             )
  {
   const size_t i = get_global_id(0);
   const size_t total_w = get_global_size(0);
   const size_t var = get_global_id(1);
   const size_t variables = get_global_size(1);
   const int shift = i % (window_in + 1);
   int shift_out = (i - shift) / (window_in + 1);
   int total = (inputs - window_in) % step;
   total = (inputs - window_in - total) / step + (total > 0 ? 1 : 0);
   shift_out += total * window_out * var;
   float2 grad = 0;
//--- the guard mirrors the read below: stop when the input index leaves the buffer
   for(int t = 0; t < total; t++)
     {
      if(shift != window_in && (shift + t * step) >= inputs)
         break;
      grad += ComplexMul(matrix_g[t * window_out + shift_out],
                         (shift == window_in ? (float2)(1, 0) : matrix_i[inputs * var + shift + t * step]));
     }
   float2 delta = ComplexMul((float2)(learning_rates, 0), clamp(grad, (float2)-MAX_GRAD, (float2)MAX_GRAD)) +
                  ComplexMul((float2)(momentum, 0), matrix_dw[i + total_w * var]);
   if(!(isnan(delta.x) || isnan(delta.y) || isinf(delta.x) || isinf(delta.y)))
     {
      matrix_dw[i + total_w * var] = delta;
      matrix_w[i + total_w * var] = matrix_w[i + total_w * var] + delta;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsComplexConvAdam(__global float2* __attribute__((aligned(8))) matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                           __global const float2* __attribute__((aligned(8))) matrix_g, ///<[in] Tensor of gradients at current layer
                                           __global const float2* __attribute__((aligned(8))) matrix_i, ///<[in] Inputs tensor
                                           __global float2* __attribute__((aligned(8))) matrix_m,       ///<[in,out] Matrix of first momentum
                                           __global float2* __attribute__((aligned(8))) matrix_v,       ///<[in,out] Matrix of second momentum
                                           const int inputs,     ///< Number of inputs
                                           const float l,        ///< Learning rate
                                           const float b1,       ///< First momentum multiplier
                                           const float b2,       ///< Second momentum multiplier
                                           int window_in,        ///< Size of input window
                                           int window_out,       ///< Size of output window
                                           int step              ///< Step size
                                          )
  {
   const size_t i = get_global_id(0);
   const size_t total_w = get_global_size(0);
   const size_t var = get_global_id(1);
   const size_t variables = get_global_size(1);
//---
   const int shift = i % (window_in + 1);
   int shift_out = (i - shift) / (window_in + 1);
   int total = (inputs - window_in + step - 1) / step;
   shift_out += total * window_out * var;
   const int shift_var_in = var * inputs;
   const int shift_var_out = var * total * window_out;
//---
   float2 grad = 0;
   for(int t = 0; t < total; t++)
     {
      if(shift != window_in && (shift + t * step) >= inputs)
         break;
      grad += IsNaNOrInf2(ComplexMul(matrix_g[t * window_out + shift_out + shift_var_out],
                                     (shift == window_in ? (float2)(1, 0) : matrix_i[shift + t * step + shift_var_in])),
                          (float2)0);
     }
   grad = clamp(grad, (float2)-MAX_GRAD, (float2)MAX_GRAD);
//--- Adam update in the complex domain
   float2 mt = IsNaNOrInf2(b1 * matrix_m[i + total_w * var] + (1 - b1) * grad, (float2)0);
   float2 vt = IsNaNOrInf2(b2 * matrix_v[i + total_w * var] + (1 - b2) * ComplexMul(grad, grad), (float2)(1.0e-6f, 0));
   float2 weight = matrix_w[i + total_w * var] + IsNaNOrInf2(l * ComplexDiv(mt, ComplexSqrt(vt)), (float2)0);
   matrix_w[i + total_w * var] = weight;
   matrix_m[i + total_w * var] = mt;
   matrix_v[i + total_w * var] = vt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ComplexSoftMax_FeedForward(__global float2* __attribute__((aligned(8))) inputs,
                                         __global float2* __attribute__((aligned(8))) outputs,
                                         const int total)
  {
   const uint i = (uint)get_global_id(0);
   const uint l = (uint)get_local_id(0);
   const uint h = (uint)get_global_id(1);
   const uint ls = min((uint)get_local_size(0), (uint)LOCAL_ARRAY_SIZE);
   uint shift_head = h * total;
//--- find the maximum magnitude of the row
   __local float2 temp[LOCAL_ARRAY_SIZE];
   uint count = 0;
   if(l < ls)
     {
      temp[l].x = MIN_VALUE;
      do
        {
         uint shift = shift_head + count * ls + l;
         if(shift < ((h + 1) * total))
            temp[l].x = fmax(ComplexAbs(inputs[shift]), temp[l].x);
         count++;
        }
      while((count * ls + l) < total);
     }
   BarrierLoc
   float max_value = temp[0].x;
   for(int j = 1; j < ls; j++)
      max_value = fmax(max_value, temp[j].x);
//--- sum of the scaled exponents
   count = 0;
   if(l < ls)
      do
        {
         uint shift = shift_head + count * ls + l;
         temp[l] = (count > 0 ? temp[l] : (float2)0) +
                   (shift < ((h + 1) * total) ? ComplexExp(ComplexDiv(inputs[shift], (float2)(max_value, 0))) : (float2)0);
         count++;
        }
      while((count * ls + l) < total);
   BarrierLoc
   count = min(ls, (uint)total);
   do
     {
      count = (count + 1) / 2;
      if(l < ls)
         temp[l] += (l < count && (l + count) < ls ? temp[l + count] : (float2)0);
      if(l + count < ls)
         temp[l + count] = (float2)0;
      BarrierLoc
     }
   while(count > 1);
//--- normalize
   float2 sum = temp[0];
   if(ComplexAbs(sum) > 0)
     {
      count = 0;
      while((count * ls + l) < total)
        {
         uint shift = shift_head + count * ls + l;
         if(shift < ((h + 1) * total))
            outputs[shift] = ComplexDiv(ComplexExp(ComplexDiv(inputs[shift], (float2)(max_value, 0))), sum);
         count++;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ComplexSoftMax_HiddenGradient(__global float2* __attribute__((aligned(8))) outputs,
                                            __global float2* __attribute__((aligned(8))) output_gr,
                                            __global float2* __attribute__((aligned(8))) input_gr)
  {
   size_t i = get_global_id(0);
   size_t outputs_total = get_global_size(0);
   size_t h = get_global_id(1);
   uint shift = h * outputs_total;
   float2 output = outputs[shift + i];
   float2 result = 0;
//--- softmax Jacobian: dL/dz_i = sum_j s_j*g_j*((i==j) - s_i)
   for(int j = 0; j < outputs_total; j++)
      result += ComplexMul(ComplexMul(outputs[shift + j], output_gr[shift + j]), ((i == j ?
(float2)(1, 0) : (float2)0) - output)); input_gr[shift + i] = result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ComplexSoftMax_OutputGradient(__global float2* __attribute__((aligned(8))) outputs, __global float2* __attribute__((aligned(8))) targets, __global float2* __attribute__((aligned(8))) output_gr) { size_t i = get_global_id(0); if(ComplexAbs(outputs[i]) == 0) output_gr[i] = (float2)0; else output_gr[i] = ComplexDiv(targets[i], outputs[i]); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ComplexMHAttentionScore(__global float2* __attribute__((aligned(8))) qkv, ///<[in] Matrix of Querys, Keys, Values __global float2* __attribute__((aligned(8))) score, ///<[out] Matrix of Scores int dimension, ///< Dimension of Key int mask ///< 1 - calc only previous units, 0 - calc all ) { int q = get_global_id(0); int h = get_global_id(1); int units = get_global_size(0); int heads = get_global_size(1); //--- int shift_q = dimension * (h + 3 * q * heads); int shift_s = units * (h + q * heads); //--- float2 koef = (float2)(sqrt((float)dimension), 0); if(koef.x < 1) koef.x = 1; float2 sum = 0; //--- for(int k = 0; k < units; k++) { if(mask > 0 && k > q) { score[shift_s + k] = (float2)0; continue; } float2 result = (float2)0; int shift_k = dimension * (h + heads * (3 * k + 1)); for(int i = 0; i < dimension; i++) result += ComplexMul(qkv[shift_q + i], qkv[shift_k + i]); result = ComplexExp(ComplexDiv(result, koef)); if(isnan(result.x) || isnan(result.y) || isinf(result.x) || isinf(result.y)) result = (float2)0; score[shift_s + k] = result; sum += result; } if(ComplexAbs(sum) > 0) { //--- for(int k = 0; k < units; k++) score[shift_s + k] = ComplexDiv(score[shift_s + k], sum); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ComplexMHAttentionOut(__global float2* __attribute__((aligned(8))) scores, ///<[in] Matrix of Scores __global float2* __attribute__((aligned(8))) qkv, ///<[in] Matrix of Values __global float2* __attribute__((aligned(8))) out, ///<[out] Output tensor int dimension ///< Dimension of Value ) { int u = get_global_id(0); int units = get_global_size(0); int h = get_global_id(1); int heads = get_global_size(1); //--- int shift_s = units * (h + heads * u); int shift_out = dimension * (h + heads * u); //--- //--- for(int d = 0; d < dimension; d++) { float2 result = (float2)0; for(int v = 0; v < units; v++) { int shift_v = dimension * (h + heads * (3 * v + 2)) + d; result += ComplexMul(scores[shift_s + v], qkv[shift_v]); } out[shift_out + d] = result; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ComplexMHAttentionGradients(__global float2* __attribute__((aligned(8))) qkv, __global float2* __attribute__((aligned(8))) qkv_g, __global float2* __attribute__((aligned(8))) scores, __global float2* __attribute__((aligned(8))) gradient ) { size_t u = get_global_id(0); size_t h = get_global_id(1); size_t d = get_global_id(2); size_t units = get_global_size(0); size_t heads = get_global_size(1); size_t dimension = get_global_size(2); //--- float2 koef = (float2)(sqrt((float)dimension), 0); if(koef.x < 1) koef.x = 1; //--- init const int 
shift_q = dimension * (heads * 3 * u + h);
   const int shift_k = dimension * (heads * (3 * u + 1) + h);
   const int shift_v = dimension * (heads * (3 * u + 2) + h);
   const int shift_g = dimension * (heads * u + h);
   int shift_score = h * units;
   int step_score = units * heads;
//--- Calculating Value's gradients
   float2 sum = (float2)0;
   for(int i = 0; i < units; i++)
      sum += ComplexMul(gradient[(h + i * heads) * dimension + d], scores[shift_score + u + i * step_score]);
   qkv_g[shift_v + d] = sum;
//--- Calculating Query's gradients
   shift_score = h * units + u * step_score;
   float2 grad = 0;
   float2 grad_out = gradient[shift_g + d];
   for(int k = 0; k < units; k++)
     {
      float2 sc_g = (float2)0;
      float2 sc = scores[shift_score + k];
      for(int v = 0; v < units; v++)
         sc_g += ComplexMul(ComplexMul(scores[shift_score + v],
                                       ComplexMul(qkv[dimension * (heads * (3 * v + 2) + h) + d], grad_out)),
                            ((float2)(k == v, 0) - sc));
      grad += ComplexMul(ComplexDiv(sc_g, koef), qkv[dimension * (heads * (3 * k + 1) + h) + d]);
     }
   qkv_g[shift_q + d] = grad;
//--- Calculating Key's gradients
   grad = 0;
   for(int q = 0; q < units; q++)
     {
      shift_score = h * units + q * step_score;
      float2 sc_g = (float2)0;
      float2 sc = scores[shift_score + u];
      float2 grad_out = gradient[dimension * (heads * q + h) + d];
      for(int v = 0; v < units; v++)
         sc_g += ComplexMul(ComplexMul(scores[shift_score + v],
                                       ComplexMul(qkv[dimension * (heads * (3 * v + 2) + h) + d], grad_out)),
                            ((float2)(u == v, 0) - sc));
      grad += ComplexMul(ComplexDiv(sc_g, koef), qkv[dimension * (heads * 3 * q + h) + d]);
     }
   qkv_g[shift_k + d] = grad;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ComplexNormalize(__global float2* __attribute__((aligned(8))) inputs,
                               __global float2* __attribute__((aligned(8))) outputs,
                               __global float2* __attribute__((aligned(8))) means,
                               __global float *vars, int dimension)
  {
   if(dimension <= 0)
      return;
//---
   size_t n = get_global_id(0);
   const int shift = n * dimension;
   const float2 dim = (float2)(dimension, 0);
//--- accumulate the complex mean
   float2 mean = 0;
   for(int i = 0; i < dimension; i++)
      mean += IsNaNOrInf2(inputs[shift + i], (float2)0);
   means[n] = mean = ComplexDiv(mean, dim);
//--- standard deviation of the magnitudes of the deviations
   float variance = 0;
   for(int i = 0; i < dimension; i++)
     {
      float abs_delta = ComplexAbs(inputs[shift + i] - mean);
      variance += abs_delta * abs_delta;
     }
   vars[n] = variance = sqrt(IsNaNOrInf(variance / dimension, 1));
   float2 v = (float2)(variance, 0);
//---
   for(int i = 0; i < dimension; i++)
     {
      float2 val = IsNaNOrInf2(ComplexDiv((inputs[shift + i] - mean), v), (float2)0);
      outputs[shift + i] = val;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ComplexNormalizeGradient(__global float2* __attribute__((aligned(8))) inputs_gr,
                                       __global float2* __attribute__((aligned(8))) outputs_gr,
                                       __global float *vars, int dimension)
  {
   if(dimension <= 0)
      return;
//---
   size_t n = get_global_id(0);
   const int shift = n * dimension;
//---
   float v = vars[n];
   float2 variance = (float2)((v > 0 ?
v : 1.0f), 0); //--- for(int i = 0; i < dimension; i++) { float2 val = ComplexDiv(outputs_gr[shift + i], variance); if(isnan(val.x) || isinf(val.x) || isnan(val.y) || isinf(val.y)) val = (float2)0; inputs_gr[shift + i] = val; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ComplexUnNormalize(__global float2* __attribute__((aligned(8))) inputs, __global float2* __attribute__((aligned(8))) outputs, __global float2* __attribute__((aligned(8))) means, __global float *vars, int dimension) { if(dimension <= 0) return; //--- size_t n = get_global_id(0); const int shift = n * dimension; //--- float v = vars[n]; float2 variance = (float2)((v > 0 ? v : 1.0f), 0); float2 mean = means[n]; //--- for(int i = 0; i < dimension; i++) { float2 val = ComplexMul(inputs[shift + i], variance) + mean; if(isnan(val.x) || isinf(val.x) || isnan(val.y) || isinf(val.y)) val = (float2)0; outputs[shift + i] = val; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ComplexUnNormalizeGradient(__global float2* __attribute__((aligned(8))) inputs_gr, __global float2* __attribute__((aligned(8))) outputs_gr, __global float *vars, int dimension ) { if(dimension <= 0) return; //--- size_t n = get_global_id(0); const int shift = n * dimension; //--- float v = vars[n]; float2 variance = (float2)((v > 0 ? v : 1.0f), 0); //--- for(int i = 0; i < dimension; i++) { float2 val = ComplexMul(outputs_gr[shift + i], variance); if(isnan(val.x) || isinf(val.x) || isnan(val.y) || isinf(val.y)) val = (float2)0; inputs_gr[shift + i] = val; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MainFreqWeight(__global float2* __attribute__((aligned(8))) freq, __global float *weight, int dimension ) { if(dimension <= 0) return; //--- size_t n = get_global_id(0); const int shift = n * dimension; //--- float max_f = 0; float total = 0; float energy; //--- for(int i = 0; i < dimension; i++) { energy = ComplexAbs(freq[shift + i]); total += energy; max_f = fmax(max_f, energy); } weight[n] = max_f / (total > 0 ? 
total : 1); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void WeightedSum(__global float *inputs1, __global float *inputs2, __global float *outputs, __global float *weight, int dimension ) { if(dimension <= 0) return; //--- size_t n = get_global_id(0); const int shift = n * dimension; //--- float w = weight[n]; //--- for(int i = 0; i < dimension; i++) outputs[shift + i] = inputs1[shift + i] * w + inputs2[shift + i] * (1 - w); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void WeightedSumGradient(__global float *inputs_gr1, __global float *inputs_gr2, __global float *outputs_gr, __global float *weight, int dimension ) { if(dimension <= 0) return; //--- size_t n = get_global_id(0); const int shift = n * dimension; //--- float w = weight[n]; float w1 = 1 - weight[n]; //--- for(int i = 0; i < dimension; i++) { float grad = outputs_gr[shift + i]; inputs_gr1[shift + i] = grad * w; inputs_gr2[shift + i] = grad * w1; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardS3(__global float *inputs, __global float *probability, __global float *weights, __global float *outputs, __global float *positions, const int window, const int total ) { int pos = get_global_id(0); int segments = get_global_size(0); //--- if((segments * window) > total) segments--; //--- int segment = 0; if(pos < segments) { const float prob = probability[pos]; //--- for(int i = 0; i < pos; i++) { if(probability[i] <= prob) segment++; } //--- for(int i = pos + 1; i < segments; i++) { if(probability[i] < prob) segment++; } } else segment = pos; //--- const int shift_in = segment * window; const int shift_out = pos * window; const float w1 = weights[0]; const float w2 = weights[1]; positions[pos] = (float)segment; //--- for(int i = 0; i < window; i++) { if((shift_in + i) >= total || (shift_out + i) >= total) break; outputs[shift_out + i] = w1 * inputs[shift_in + i] + w2 * inputs[shift_out + i]; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void InsideGradientS3(__global float *inputs, __global float *inputs_gr, __global float *probability, __global float *probability_gr, __global float *weights, __global float *outputs_gr, __global float *positions, const int window, const int total ) { size_t pos = get_global_id(0); //--- int segment = (int)positions[pos]; float prob = probability[pos]; const float w1 = weights[0]; const float w2 = weights[1]; const int shift_in = segment * window; const int shift_out = pos * window; //--- float grad = 0; float temp = 0; //--- for(int i = 0; i < window; i++) { if((shift_out + i) >= total) break; temp = outputs_gr[shift_out + i] * w1; grad += temp * inputs[shift_in + i]; inputs_gr[shift_in + i] = temp + outputs_gr[shift_in + i] * w2; } probability_gr[segment] = grad / prob; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void WeightGradientS3(__global float *inputs, __global float *positions, __global float *outputs_gr, __global float *weights_gr, const int window, const int total ) { size_t l = get_local_id(0); size_t w = 
get_global_id(1);
   size_t ls = min((uint)get_local_size(0), (uint)LOCAL_ARRAY_SIZE);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
//---
   if(l < ls)
     {
      float val = 0;
      for(int i = l; i < total; i += ls)
        {
         int shift_in = i;
         if(w == 0)
           {
            int pos = i / window;
            shift_in = (int)positions[pos] * window + i % window;
           }
         val += outputs_gr[i] * inputs[shift_in];
        }
      temp[l] = val;
     }
   BarrierLoc
//---
   int t = ls;
   do
     {
      t = (t + 1) / 2;
      if(l < t && (l + t) < ls)
        {
         temp[l] += temp[l + t];
         temp[l + t] = 0;
        }
      BarrierLoc
     }
   while(t > 1);
//---
   if(l == 0)
      weights_gr[w] = temp[0];
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void MH2PyrAttentionOut(__global float *q, __global float *kv,
                                 __global float *score, __global float *out,
                                 const int dimension, const int heads_kv,
                                 const int window
                                )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k = get_local_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_k = dimension * (2 * heads_kv * k + h_kv);
   const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k;
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   const int delta_win = (window + 1) / 2;
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score: dot product of the query and key vectors
   float sum = 0;
   for(int d = 0; d < dimension; d++)
      sum += q[shift_q + d] * kv[shift_k + d];
   float sc = LocalSoftMax(IsNaNOrInf(sum / koef, MIN_VALUE), 1, temp);
   score[shift_s] = sc;
   BarrierLoc
//--- out
   for(int d = 0; d < dimension; d++)
     {
      uint count = 0;
      if(k < ls)
         do
           {
            if((count * ls) < (kunits - k))
              {
               sum = 0;
               if(abs((int)(count * ls + k) - q_id) <= delta_win)
                 {
                  int sh_v = 2 * dimension * heads_kv * count * ls;
                  sum = kv[shift_v + d + sh_v] * (count == 0 ? sc : score[shift_s + count * ls]);
                  if(isnan(sum))
                     sum = 0;
                 }
               temp[k] = (count > 0 ? temp[k] : 0) + sum;
              }
            count++;
           }
         while((count * ls + k) < kunits);
      BarrierLoc
      //---
      count = min(ls, (uint)kunits);
      do
        {
         count = (count + 1) / 2;
         if(k < ls)
            temp[k] += (k < count && (k + count) < min(ls, (uint)kunits) ? temp[k + count] : 0);
         if(k + count < ls)
            temp[k + count] = 0;
         BarrierLoc
        }
      while(count > 1);
      //---
      if(k == 0)
         out[shift_q + d] = temp[0];
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void PLR(__global const float *inputs, __global float *outputs,
                  __global int *isttp, const int transpose, const float min_step
                 )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
   const int step_in = ((bool)transpose ? variables : 1);
//--- look for ttp
   float value = inputs[shift_in];
   bool bttp = false;
   if(i == 0 || i == lenth - 1)
      bttp = true;
   else
     {
      float prev = value;
      int prev_pos = i;
      float max_v = value;
      int max_pos = i;
      float min_v = value;
      int min_pos = i;
      while(fmax(fabs(prev - max_v), fabs(prev - min_v)) < min_step && prev_pos > 0)
        {
         prev_pos--;
         prev = inputs[shift_in - (i - prev_pos) * step_in];
         if(prev >= max_v && (prev - min_v) < min_step)
           {
            max_v = prev;
            max_pos = prev_pos;
           }
         if(prev <= min_v && (max_v - prev) < min_step)
           {
            min_v = prev;
            min_pos = prev_pos;
           }
        }
      //---
      float next = value;
      int next_pos = i;
      while(fmax(fabs(next - max_v), fabs(next - min_v)) < min_step && next_pos < (lenth - 1))
        {
         next_pos++;
         next = inputs[shift_in + (next_pos - i) * step_in];
         if(next > max_v && (next - min_v) < min_step)
           {
            max_v = next;
            max_pos = next_pos;
           }
         if(next < min_v && (max_v - next) < min_step)
           {
            min_v = next;
            min_pos = next_pos;
           }
        }
      //--- the point is a turning point if it is the extremum of its window
      if((value >= prev && value > next) || (value > prev && value == next) ||
         (value <= prev && value < next) || (value < prev && value == next))
         if(max_pos == i || min_pos == i)
            bttp = true;
     }
//---
   isttp[shift_in] = (int)bttp;
   outputs[shift_in] = 0;
   BarrierLoc
//--- calc position
   int pos = -1;
   int prev_in = 0;
   int prev_ttp = 0;
   if(bttp)
     {
      pos = 0;
      for(int p = 0; p < i; p++)
        {
         int current_in = ((bool)transpose ? (p * variables + v) : (v * lenth + p));
         if((bool)isttp[current_in])
           {
            pos++;
            prev_ttp = p;
            prev_in = current_in;
           }
        }
     }
//--- calc tendency: least-squares line over the segment between turning points
   if(pos > 0 && pos < (lenth / 3))
     {
      float sum_x = 0;
      float sum_y = 0;
      float sum_xy = 0;
      float sum_xx = 0;
      int dist = i - prev_ttp;
      for(int p = 0; p < dist; p++)
        {
         float x = (float)(p);
         float y = inputs[prev_in + p * step_in];
         sum_x += x;
         sum_y += y;
         sum_xy += x * y;
         sum_xx += x * x;
        }
      float slope = (dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1);
      float intercept = (sum_y - slope * sum_x) / dist;
      int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3));
      outputs[shift_out] = slope;
      outputs[shift_out + step_in] = intercept;
      outputs[shift_out + 2 * step_in] = ((float)dist) / lenth;
     }
   else
     {
      if(pos == (lenth / 3))
        {
         float sum_x = 0;
         float sum_y = 0;
         float sum_xy = 0;
         float sum_xx = 0;
         int dist = lenth - prev_ttp;
         for(int p = 0; p < dist; p++)
           {
            float x = (float)(p);
            float y = inputs[prev_in + p * step_in];
            sum_x += x;
            sum_y += y;
            sum_xy += x * y;
            sum_xx += x * x;
           }
         float slope = (dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1);
         float intercept = (sum_y - slope * sum_x) / dist;
         int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3));
         outputs[shift_out] = slope;
         outputs[shift_out + step_in] = intercept;
         outputs[shift_out + 2 * step_in] = ((float)dist) / lenth;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void PLRGradient(__global float *inputs_gr, __global const float *outputs,
                          __global const float *outputs_gr, const int transpose
                         )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
   const int step_in = ((bool)transpose ? variables : 1);
   const int shift_out = ((bool)transpose ?
v : (v * lenth)); const int step_out = 3 * step_in; //--- calc position int pos = -1; int prev_in = 0; int dist = 0; do { pos++; prev_in += dist; dist = (int)fmax(outputs[shift_out + pos * step_out + 2 * step_in] * lenth, 1); } while(!(prev_in <= i && (prev_in + dist) > i)); //--- calc constants float sum_x = 0; float sum_xx = 0; //--- for(int p = 0; p < dist; p++) { float x = (float)(p); sum_x += x; sum_xx += x * x; } //--- get output gradient float grad_slope = outputs_gr[shift_out + pos * step_out]; float grad_intercept = outputs_gr[shift_out + pos * step_out + step_in]; //--- calc gradient grad_slope -= sum_x / dist * grad_intercept; grad_slope /= fmax(dist * sum_xx - sum_x * sum_x, 1); float grad = grad_intercept / dist; grad += (dist * (i - prev_in) - sum_x) * grad_slope; if(isnan(grad) || isinf(grad)) grad = 0; //--- save result inputs_gr[shift_in] = grad; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void UpdateWeightsAdamMini(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - ///< number of neurons in previous layer and n - ///< number of neurons in current layer __global const float *matrix_g, ///<[in] Tensor of gradients at current layer __global const float *matrix_i, ///<[in] Inputs tensor __global float *matrix_m, ///<[in,out] Matrix of first momentum __global float *matrix_v, ///<[in,out] Matrix of seconfd momentum const float l, ///< Learning rates const float b1, ///< First momentum multiplier const float b2 ///< Second momentum multiplier ) { //--- inputs const size_t i = get_local_id(0); const size_t inputs = get_local_size(0) - 1; //--- outputs const size_t o = get_global_id(1); const size_t outputs = get_global_size(1); //--- __local float temp[LOCAL_ARRAY_SIZE]; const int ls = min((uint)LOCAL_ARRAY_SIZE, (uint)inputs); const float inp = (i < inputs ? matrix_i[i] : 1.0f); int count = 0; do { if(count == (i / ls)) { int shift = i % ls; temp[shift] = (count == 0 ? 0 : temp[shift]) + ((isnan(inp) || isinf(inp)) ? 
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdamMini(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                    __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                    __global const float *matrix_i, ///<[in] Inputs tensor
                                    __global float *matrix_m,       ///<[in,out] Matrix of first momentum
                                    __global float *matrix_v,       ///<[in,out] Matrix of second momentum
                                    const float l,                  ///< Learning rate
                                    const float b1,                 ///< First momentum multiplier
                                    const float b2                  ///< Second momentum multiplier
                                   )
  {
//--- inputs
   const size_t i = get_local_id(0);
   const size_t inputs = get_local_size(0) - 1;
//--- outputs
   const size_t o = get_global_id(1);
   const size_t outputs = get_global_size(1);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((uint)LOCAL_ARRAY_SIZE, (uint)inputs);
   const float inp = (i < inputs ? matrix_i[i] : 1.0f);
   int count = 0;
   do
     {
      if(count == (i / ls))
        {
         int shift = i % ls;
         temp[shift] = (count == 0 ? 0 : temp[shift]) + ((isnan(inp) || isinf(inp)) ? 0 : inp * inp) / inputs;
        }
      count++;
      BarrierLoc
     }
   while(count * ls < inputs);
//--- sum
   count = (ls + 1) / 2;
   do
     {
      if(i < count && (i + count) < ls)
        {
         temp[i] += temp[i + count];
         temp[i + count] = 0;
        }
      count = (count + 1) / 2;
      BarrierLoc
     }
   while(count > 1);
//--- calc v
   if(i == 0)
     {
      temp[1] = matrix_g[o];
      if(isnan(temp[1]) || isinf(temp[1]))
         temp[1] = 0;
      if(isnan(temp[0]) || isinf(temp[0]))
         temp[0] = 1;
      float v = matrix_v[o];
      if(isnan(v) || isinf(v))
         v = 1;
      temp[0] = b2 * v + (1 - b2) * (temp[1] * temp[1]) * temp[0];
      matrix_v[o] = temp[0];
     }
   BarrierLoc
//---
   const int wi = o * (inputs + 1) + i;
   float weight = matrix_w[wi];
   if(isnan(weight) || isinf(weight))
      weight = 0;
//---
   float m = matrix_m[wi];
   if(isnan(m) || isinf(m))
      m = 0;
//--- calc m
   m = b1 * m + (1 - b1) * temp[1] * inp;
   if(isnan(m) || isinf(m))
      m = 0;
//---
   float delta = l * (m / (sqrt(temp[0]) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
   if(isnan(delta) || isinf(delta))
      delta = 0;
   if(fabs(delta) > 0) // apply the step for either sign; the original "delta > 0" dropped all negative updates
      matrix_w[wi] = weight + delta;
   matrix_m[wi] = m;
  }
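//--- Added note: the convolution variant below shares one clamped second moment
//--- per filter head (matrix_v[head]), and the squared-gradient statistic is
//--- first averaged over the head's window via the local reduction in temp[].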
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvAdamMini(__global float *matrix_w,       ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                        __global const float *matrix_g, ///<[in] Tensor of gradients at current layer
                                        __global const float *matrix_i, ///<[in] Inputs tensor
                                        __global float *matrix_m,       ///<[in,out] Matrix of first momentum
                                        __global float *matrix_v,       ///<[in,out] Matrix of second momentum
                                        const int inputs,               ///< Number of inputs
                                        const float l,                  ///< Learning rate
                                        const float b1,                 ///< First momentum multiplier
                                        const float b2,                 ///< Second momentum multiplier
                                        int step                        ///< Step size
                                       )
  {
//--- window in
   const size_t i = get_global_id(0);
   const size_t window_in = get_global_size(0) - 1;
//--- window out
   const size_t f = get_global_id(1);
   const size_t window_out = get_global_size(1);
//--- head window out
   const size_t f_h = get_local_id(1);
   const size_t window_out_h = get_local_size(1);
//--- variable
   const size_t v = get_global_id(2);
   const size_t variables = get_global_size(2);
//--- constants
   const int total = (inputs - window_in + step - 1) / step;
   const int shift_var_in = v * inputs;
   const int shift_var_out = v * total * window_out;
   const int shift_w = (f + v * window_out) * (window_in + 1) + i;
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((uint)window_in, (uint)LOCAL_ARRAY_SIZE);
//--- calc gradient
   float grad = 0;
   for(int t = 0; t < total; t++)
     {
      if(i != window_in && (i + t * step) >= inputs) // bound on the input index actually read below (the original tested i + t * window_in)
         break;
      float gt = matrix_g[t * window_out + f + shift_var_out] * (i == window_in ? 1 : matrix_i[i + t * step + shift_var_in]);
      if(!(isnan(gt) || isinf(gt)))
         grad += gt;
     }
//--- calc sum grad
   int count;
   for(int h = 0; h < window_out_h; h++)
     {
      count = 0;
      do
        {
         if(h == f_h)
           {
            if(count == (i / ls))
              {
               int shift = i % ls;
               temp[shift] = ((count == 0 && h == 0) ? 0 : temp[shift]) + ((isnan(grad) || isinf(grad)) ? 0 : grad * grad) / (window_in * window_out_h);
              }
           }
         count++;
         BarrierLoc
        }
      while((count * ls) < window_in);
     }
   count = (ls + 1) / 2;
   do
     {
      if(i < count && (i + count) < ls && f_h == 0)
        {
         temp[i] += temp[i + count];
         temp[i + count] = 0;
        }
      count = (count + 1) / 2;
      BarrierLoc
     }
   while(count > 1);
//--- calc v
   if(i == 0 && f_h == 0)
     {
      if(isnan(temp[0]) || isinf(temp[0]))
         temp[0] = 1;
      int head = f / window_out_h;
      float v = matrix_v[head];
      if(isnan(v) || isinf(v))
         v = 1;
      temp[0] = clamp(b2 * v + (1 - b2) * temp[0], 1.0e-6f, 1.0e6f);
      matrix_v[head] = temp[0];
     }
   BarrierLoc
//--- calc m
   float mt = clamp(b1 * matrix_m[shift_w] + (1 - b1) * grad, -1.0e5f, 1.0e5f);
   if(isnan(mt) || isinf(mt))
      mt = 0;
   float weight = matrix_w[shift_w] + l * mt / sqrt(temp[0]);
   if(!(isnan(weight) || isinf(weight)))
      matrix_w[shift_w] = weight;
   matrix_m[shift_w] = mt;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CutTrendAndOther(__global const float *inputs,
                               __global const float *plr,
                               __global float *trend,
                               __global float *other
                              )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = i * variables + v;
   const int step_in = variables;
   const int shift_plr = v;
   const int step_plr = 3 * step_in;
//--- calc position
   int pos = -1;
   int prev_in = 0;
   int dist = 0;
   do
     {
      pos++;
      prev_in += dist;
      dist = (int)fmax(plr[shift_plr + pos * step_plr + 2 * step_in] * lenth, 1);
     }
   while(!(prev_in <= i && (prev_in + dist) > i));
//--- calc trend
   float slope = plr[shift_plr + pos * step_plr];
   float intercept = plr[shift_plr + pos * step_plr + step_in];
   int offset = i - prev_in;
   float trend_i = slope * offset + intercept;
   float other_i = inputs[shift_in] - trend_i;
//--- save result
   trend[shift_in] = trend_i;
   other[shift_in] = other_i;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CutTrendAndOtherGradient(__global float *inputs_gr,
                                       __global const float *plr,
                                       __global float *plr_gr,
                                       __global const float *trend_gr,
                                       __global const float *other_gr
                                      )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//--- constants
   const int shift_in = i * variables + v;
   const int step_in = variables;
   const int shift_plr = v;
   const int step_plr = 3 * step_in;
//--- calc position
   int pos = -1;
   int prev_in = 0;
   int dist = 0;
   do
     {
      pos++;
      prev_in += dist;
      dist = (int)fmax(plr[shift_plr + pos * step_plr + 2 * step_in] * lenth, 1);
     }
   while(!(prev_in <= i && (prev_in + dist) > i));
//--- get gradient
   float other_i_gr = other_gr[shift_in];
   float trend_i_gr = trend_gr[shift_in] - other_i_gr;
//--- calc plr gradient
   int offset = i - prev_in; // keep pos as the segment index; the original reused pos for the in-segment offset and so wrote the gradient to the wrong segment slot
   float slope_gr = trend_i_gr * offset;
   float intercept_gr = trend_i_gr;
//--- save result
   plr_gr[shift_plr + pos * step_plr] += slope_gr;
   plr_gr[shift_plr + pos * step_plr + step_in] += intercept_gr;
   inputs_gr[shift_in] = other_i_gr;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CutOneFromAnother(__global const float *inputs,
                                __global const float *cut,
                                __global float *other
                               )
  {
   const size_t i = get_global_id(0);
//--- save result
   other[i] = inputs[i] - cut[i];
  }
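//--- Added note: for other = inputs - cut the backward pass is simply
//--- d(other)/d(inputs) = 1 and d(other)/d(cut) = -1, which is exactly what
//--- CutOneFromAnotherGradient below implements.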
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CutOneFromAnotherGradient(__global float *inputs_gr,
                                        __global float *cut_gr,
                                        __global const float *other_gr
                                       )
  {
   const size_t i = get_global_id(0);
   float gr = other_gr[i];
//--- save result
   inputs_gr[i] = gr;
   cut_gr[i] = (-gr);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UniTrajPrepare(__global const float *history,
                             __global const float *h_mask,
                             __global const float *future,
                             __global const float *f_mask,
                             __global float *output,
                             const int h_total,
                             const int f_total
                            )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//---
   const int shift_in = i * variables + v;
   const int shift_out = 3 * shift_in;
   const int shift_f_out = 3 * h_total * variables; // the future block starts right after the h_total history rows (the original also added v here, which misaligned the rows)
//--- history
   if(i < h_total)
     {
      float mask = h_mask[shift_in];
      float h = history[shift_in];
      float vel = (i < (h_total - 1) && mask != 0 ? (history[shift_in + variables] - h) * mask : 0); // renamed from v, which shadowed the variable index above
      if(isnan(vel) || isinf(vel))
         vel = h = mask = 0;
      output[shift_out] = h * mask;
      output[shift_out + 1] = vel;
      output[shift_out + 2] = mask;
     }
//--- future
   if(i < f_total)
     {
      float mask = f_mask[shift_in];
      float f = future[shift_in];
      float vel = (i < (f_total - 1) && mask != 0 ? (future[shift_in + variables] - f) * mask : 0);
      if(isnan(vel) || isinf(vel))
         vel = f = mask = 0;
      output[shift_f_out + shift_out] = f * mask;
      output[shift_f_out + shift_out + 1] = vel;
      output[shift_f_out + shift_out + 2] = mask;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UniTrajPrepareGrad(__global float *history_gr,
                                 __global float *future_gr,
                                 __global const float *output,
                                 __global const float *output_gr,
                                 const int h_total,
                                 const int f_total
                                )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//---
   const int shift_in = i * variables + v;
   const int shift_out = 3 * shift_in;
   const int shift_f_out = 3 * h_total * variables;
//--- history
   if(i < h_total)
     {
      float mask = output[shift_out + 2];
      float grad = 0;
      if(mask > 0)
        {
         grad = output_gr[shift_out] * mask;
         grad -= (i < (h_total - 1) && mask != 0 ? (output_gr[shift_out + 1]) * mask : 0);
         grad += (i > 0 ? output[shift_out + 1 - 3 * variables] * output[shift_out + 2 - 3 * variables] : 0);
         if(isnan(grad) || isinf(grad))
            grad = 0;
        }
      history_gr[shift_in] = grad;
     }
//--- future
   if(i < f_total)
     {
      float mask = output[shift_f_out + shift_out + 2];
      float grad = 0;
      if(mask > 0)
        {
         grad = output_gr[shift_f_out + shift_out] * mask;
         grad -= (i < (f_total - 1) && mask != 0 ? (output_gr[shift_f_out + shift_out + 1]) * mask : 0); // bound on the future length (the original tested h_total here)
         grad += (i > 0 ? output[shift_f_out + shift_out + 1 - 3 * variables] * output[shift_f_out + shift_out + 2 - 3 * variables] : 0);
         if(isnan(grad) || isinf(grad))
            grad = 0;
        }
      future_gr[shift_in] = grad;
     }
  }
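//--- Added note: UniTrajBTS below builds per-step distances to the last
//--- observed element: with mask m[p], the recursion d[p] = 1 + (1 - m[p])*d[p-1]
//--- runs once forward and once backward over the packed (value, velocity, mask)
//--- triples. The backward start offset and the zeroed last row were normalized
//--- to the d_forw pattern; treat those corrected indices as an assumption.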
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UniTrajBTS(__global const float *concat_inp,
                         __global float *d_forw,
                         __global float *d_bakw,
                         const int total
                        )
  {
   const size_t i = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
//---
   if(i == 0)
     {
      const int step = variables * 3;
      const int start = v * 3 + 2;
      float last = 0;
      d_forw[v] = 0;
      for(int p = 1; p < total; p++)
        {
         float m = concat_inp[start + p * step];
         d_forw[p * variables + v] = last = 1 + (1 - m) * last;
        }
     }
   else
     {
      const int step = -(variables * 3);
      const int start = (total - 1) * variables * 3 + v * 3 + 2; // mask of the last step; the original offset dropped the factor 3 of the packed triples
      float last = 0;
      d_bakw[(total - 1) * variables + v] = 0;                   // last row of the backward buffer (the original indexed (total - 1) + v)
      for(int p = 1; p < total; p++)
        {
         float m = concat_inp[start + p * step];
         d_bakw[(total - 1 - p) * variables + v] = last = 1 + (1 - m) * last;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
float2 Rotate(const float x, const float cos_theta, const float sin_theta)
  {
   float2 result = 0;
   result.s0 = cos_theta + x * sin_theta;
   result.s1 = x * cos_theta - sin_theta;
   return result;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void HiVTPrepare(__global const float *data,
                          __global float2* __attribute__((aligned(8))) output
                         )
  {
   const size_t t = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t total_v = get_global_size(1);
//---
   const int shift_data = t * total_v;
   const int shift_out = (shift_data + v) * total_v; // one row of total_v pairs per (t, v); the original shift_data * total_v made every v of a step write the same cells
//---
   float value = data[shift_data + v + total_v] - data[shift_data + v];
   const float theta = atan(value);
   const float cos_theta = cos(theta);
   const float sin_theta = sin(theta);
   const float2 main = Rotate(value, cos_theta, sin_theta);
//---
   for(int a = 0; a < total_v; a++)
     {
      float2 o = main;
      if(a != v)
         o -= Rotate(data[shift_data + a + total_v] - data[shift_data + a], cos_theta, sin_theta);
      output[shift_out + a] = o;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void GateElementMul(__global const float *inputs1,
                             __global const float *inputs2,
                             __global const float *gate,
                             __global float *out
                            )
  {
   const int i = get_global_id(0);
//---
   const float g = IsNaNOrInf(gate[i], 0.5f);
   float result = 0;
   float inp = IsNaNOrInf(inputs1[i], 0.0f);
   if(inp != 0.0f && g != 0.0f)
      result += g * inp;
   inp = IsNaNOrInf(inputs2[i], 0.0f);
   if(inp != 0.0f && (1 - g) != 0.0f)
      result += (1 - g) * inp;
//---
   out[i] = IsNaNOrInf(result, 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void GateElementMulGrad(__global const float *inputs1,
                                 __global float *inputs1_gr,
                                 __global const float *inputs2,
                                 __global float *inputs2_gr,
                                 __global const float *gate,
                                 __global float *gate_gr,
                                 __global const float *out_gr,
                                 const int activ1,
                                 const int activ2,
                                 const int activ_gate
                                )
  {
   const int i = get_global_id(0);
//---
   const float g = IsNaNOrInf(gate[i], 0.5f);
   const float i1 = IsNaNOrInf(inputs1[i], 0);
   const float
i2 = IsNaNOrInf(inputs2[i], 0); const float grad = IsNaNOrInf(out_gr[i], 0); //--- float i1_gr = IsNaNOrInf(grad * g, 0); float i2_gr = IsNaNOrInf(grad * (1 - g), 0); float g_gr = IsNaNOrInf(grad * (i1 - i2), 0); //--- inputs1_gr[i] = Deactivation(i1_gr, i1, activ1); inputs2_gr[i] = Deactivation(i2_gr, i2, activ2); gate_gr[i] = Deactivation(g_gr, g, activ_gate); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void TransposeRCD(__global const float *matrix_in, ///<[in] Input matrix __global float *matrix_out ///<[out] Output matrix ) { const int r = get_global_id(0); const int c = get_global_id(1); const int d = get_global_id(2); const int rows = get_global_size(0); const int cols = get_global_size(1); const int dimension = get_global_size(2); //--- matrix_out[(c * rows + r)*dimension + d] = matrix_in[(r * cols + c) * dimension + d]; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void OrthoganalLoss(__global const float *data, __global float *grad, const int add ) { const size_t r = get_global_id(0); const size_t c = get_local_id(1); const size_t cols = get_local_size(1); //--- __local float Temp[LOCAL_ARRAY_SIZE]; uint ls = min((uint)cols, (uint)LOCAL_ARRAY_SIZE); //--- const int shift1 = r * cols + c; const int shift2 = c * cols + r; float value1 = IsNaNOrInf(data[shift1], 0); float value2 = (shift1 == shift2 ? value1 : IsNaNOrInf(data[shift2], 0)); float v2 = IsNaNOrInf(value1 * value2, 0); //--- for(int i = 0; i < cols; i += ls) { //--- if(i <= c && (i + ls) > c) Temp[c - i] = (i == 0 ? 0 : Temp[c - i]) + v2; BarrierLoc } //--- uint count = min(ls, (uint)cols); do { count = (count + 1) / 2; if(c < ls) Temp[c] += (c < count && (c + count) < cols ? 
Temp[c + count] : 0); if(c + count < ls) Temp[c + count] = 0; BarrierLoc } while(count > 1); //--- const float sum = Temp[0]; float diff = (float)(r == c) - sum; float loss = -(diff * diff); float g = (2 * (sum - (float)(r == c))) * loss; g = 2 * value2 * g; if(isinf(g) || isnan(g)) g = 0; if(add == 1) grad[shift1] += g; else grad[shift1] = g; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcDistance(__global const float *data, __global float *distance, const int dimension ) { const size_t main = get_global_id(0); const size_t slave = get_local_id(1); const int total = (int)get_local_size(1); //--- __local float Temp[LOCAL_ARRAY_SIZE]; //--- const int shift_main = main * dimension; const int shift_slave = slave * dimension; const int shift_dist = main * total + slave; //--- calc distance float dist = 0; if(main != slave) { //--- for(int d = 0; d < dimension; d++) { float delta = data[shift_main + d] - data[shift_slave + d]; dist += delta * delta; } } //--- Look Max float max_dist = LocalMax(dist, 1, Temp); //--- Normalize if(max_dist > 0) dist /= Temp[0]; dist = IsNaNOrInf(dist, 1); //--- result distance[shift_dist] = dist; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardLocalMax(__global const float *matrix_i, __global const float *distance, __global float *matrix_o, const float radius ) { const size_t i = get_global_id(0); const size_t total = get_global_size(0); const size_t d = get_global_id(1); const size_t dimension = get_global_size(1); //--- const int shift_dist = i * total; const int shift_out = i * dimension + d; //--- float result = -3.402823466e+38f; //--- for(int k = 0; k < total; k++) { if(distance[shift_dist + k] > radius) continue; int shift = k * dimension + d; result = max(result, matrix_i[shift]); } matrix_o[shift_out] = result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcInputGradientLocalMax(__global const float *matrix_i, __global float *matrix_ig, __global const float *distance, __global const float *matrix_o, __global const float *matrix_g, const float radius ) { const size_t i = get_global_id(0); const size_t total = get_global_size(0); const size_t d = get_global_id(1); const size_t dimension = get_global_size(1); //--- float result = 0; float value = matrix_i[i * dimension + d]; //--- for(int k = 0; k < total; k++) { if(distance[k * total + i] > radius) continue; int shift = k * dimension + d; if(fabs(matrix_o[shift] - value) <= 1.192092896e-07f) result += matrix_g[shift]; } matrix_ig[i * dimension + d] = result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHMaskAttentionOut(__global const float *q, ///<[in] Matrix of Querys __global const float *kv, ///<[in] Matrix of Keys __global float *score, ///<[out] Matrix of Scores __global const float *mask, ///<[in] Mask Matrix __global float *out, ///<[out] Matrix of attention const int dimension, ///< Dimension of Key const int heads_kv, const float mask_level ) { //--- init const int q_id = get_global_id(0); const int k = get_local_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int kunits = 
get_local_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_k = dimension * (2 * heads_kv * k + h_kv);
   const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k;
   const bool b_mask = (mask[shift_s] < mask_level);
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- Score
   float sum = 0;
   if(b_mask)
      sum = MIN_VALUE;
   else
      for(int d = 0; d < dimension; d++)
         sum += q[shift_q + d] * kv[shift_k + d];
   float sc = LocalSoftMax(IsNaNOrInf(sum / koef, MIN_VALUE), 1, temp);
   score[shift_s] = sc;
   BarrierLoc
//--- out
   for(int d = 0; d < dimension; d++)
     {
      BarrierLoc
      sum = LocalSum(IsNaNOrInf(kv[shift_v + d] * sc, 0), 1, temp);
      if(k == 0)
         out[shift_q + d] = temp[0];
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void MHMaskAttentionInsideGradients(__global const float *q, __global float *q_g,
                                             __global const float *kv, __global float *kv_g,
                                             __global const float *mask, __global float *mask_g,
                                             __global const float *scores, __global const float *gradient,
                                             const int kunits, const int heads_kv,
                                             const float mask_level)
  {
//--- init
   const int q_id = get_global_id(0);
   const int d = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int dimension = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h) + d;
   const int shift_s = (q_id * heads + h) * kunits;
   const int shift_g = h * dimension + d;
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
//--- Calculating Value's gradients
   int step_score = kunits * heads;
   if(h < heads_kv)
     {
      for(int v = q_id; v < kunits; v += qunits)
        {
         float grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv)
           {
            int shift_score = hq * kunits + v;
            for(int g = 0; g < qunits; g++)
               grad += gradient[shift_g + dimension * (hq - h + g * heads)] * scores[shift_score + g * step_score];
           }
         int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d;
         kv_g[shift_v] = grad;
        }
     }
//--- Calculating Query's gradients
   float grad = 0;
   float out_g = gradient[shift_g + q_id * dimension];
   int shift_val = (heads_kv + h_kv) * dimension + d;
   int shift_key = h_kv * dimension + d;
   for(int k = 0; k < kunits; k++)
     {
      float sc_g = 0;
      float sc = scores[shift_s + k];
      if(sc == 0)
         continue;
      for(int v = 0; v < kunits; v++)
         sc_g += scores[shift_s + v] * out_g * kv[shift_val + 2 * v * heads_kv * dimension] * ((float)(k == v) - sc);
      grad += sc_g * kv[shift_key + 2 * k * heads_kv * dimension];
     }
   q_g[shift_q] = grad / koef;
//--- Calculating Key's gradients
   if(h < heads_kv)
     {
      for(int k = q_id; k < kunits; k += qunits)
        {
         int shift_k = dimension * (2 * heads_kv * k + h_kv) + d;
         grad = 0;
         for(int hq = h; hq < heads; hq += heads_kv) // walk only the query heads mapped to this kv head, as in the Value pass above (the original incremented hq by 1)
           {
            int shift_score = hq * kunits + k;
            float val = kv[shift_k + heads_kv * dimension];
            for(int scr = 0; scr < qunits; scr++)
              {
               float sc_g = 0;
               int shift_sc = scr * kunits * heads;
               float sc = scores[shift_sc + k];
               if(sc == 0)
                  continue;
               for(int v = 0; v < kunits; v++)
                  sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] * val * ((float)(k == v) - sc);
               grad += sc_g * q[shift_g + scr * heads * dimension]; // query of row scr, head h, as in the kernels below (the original mixed in q_id via shift_q)
              }
           }
         kv_g[shift_k] = grad / koef;
        }
     }
//--- Mask's gradient
   for(int k = q_id; k < kunits; k += qunits)
     {
      float m = mask[shift_s + k];
      if(m < mask_level)
         mask_g[shift_s + k] = 0;
      else
         mask_g[shift_s + k] = 1 - m;
     }
  }
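//--- Added note: CalcPositionBias below scores a pair of points with the radial
//--- kernel exp(-||x1 - x2||). The squared deltas have to be accumulated over
//--- the whole dimension, so the loop body was corrected to res += delta*delta
//--- (the original overwrote res on every pass).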
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CalcPositionBias(__global const float *data1,
                               __global const float *data2,
                               __global float *result,
                               const int dimension
                              )
  {
   const size_t idx1 = get_global_id(0);
   const size_t idx2 = get_global_id(1);
   const size_t total1 = get_global_size(0);
   const size_t total2 = get_global_size(1);
//---
   const int shift1 = idx1 * dimension;
   const int shift2 = idx2 * dimension;
   const int shift_out = idx1 * total2 + idx2;
//---
   float res = 0;
   for(int i = 0; i < dimension; i++)
     {
      float delta = data1[shift1 + i] - data2[shift2 + i];
      res += delta * delta; // accumulate the squared deltas (the original overwrote res on every pass)
     }
   res = sqrt(res);
   res = exp(-res);
   if(isnan(res) || isinf(res))
      res = 0;
//---
   result[shift_out] = res;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void MHPosBiasAttentionOut(__global const float *q,        ///<[in] Matrix of Queries
                                    __global const float *k,        ///<[in] Matrix of Keys
                                    __global const float *v,        ///<[in] Matrix of Values
                                    __global float *score,          ///<[out] Matrix of Scores
                                    __global const float *pos_bias, ///<[in] Position Bias
                                    __global float *out,            ///<[out] Matrix of attention
                                    const int dimension,            ///< Dimension of Key
                                    const int heads_kv,
                                    const int use_pos_bias
                                   )
  {
//--- init
   const int q_id = get_global_id(0);
   const int k_id = get_global_id(1);
   const int h = get_global_id(2);
   const int qunits = get_global_size(0);
   const int kunits = get_global_size(1);
   const int heads = get_global_size(2);
   const int h_kv = h % heads_kv;
   const int shift_q = dimension * (q_id * heads + h);
   const int shift_kv = dimension * (heads_kv * k_id + h_kv);
   const int shift_s = kunits * (q_id * heads + h) + k_id;
   const int shift_pb = q_id * kunits + k_id;
   const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE);
   float koef = sqrt((float)dimension);
   if(koef < 1)
      koef = 1;
   __local float temp[LOCAL_ARRAY_SIZE];
//--- sum of exp
   uint count = 0;
   if(k_id < ls)
     {
      temp[k_id] = 0;
      do
        {
         if(q_id >= (count * ls + k_id))
            if((count * ls) < (kunits - k_id))
              {
               float sum = 0;
               int sh_k = dimension * heads_kv * count * ls;
               for(int d = 0; d < dimension; d++)
                  sum += q[shift_q + d] * k[shift_kv + d + sh_k]; // dot product over the key dimension (the original overwrote sum each pass)
               sum = exp(sum / koef);
               if(isnan(sum))
                  sum = 0;
               temp[k_id] = temp[k_id] + sum + (use_pos_bias > 0 ? pos_bias[shift_pb + count * ls] : 0);
              }
         count++;
        }
      while((count * ls + k_id) < kunits);
     }
   BarrierLoc
   count = min(ls, (uint)kunits);
//---
   do
     {
      count = (count + 1) / 2;
      if(k_id < ls)
         temp[k_id] += (k_id < count && (k_id + count) < kunits ? temp[k_id + count] : 0);
      if(k_id + count < ls)
         temp[k_id + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//--- score
   float sum = temp[0];
   float sc = 0;
   if(q_id >= (count * ls + k_id))
      if(sum != 0)
        {
         for(int d = 0; d < dimension; d++)
            sc += q[shift_q + d] * k[shift_kv + d]; // accumulate the dot product (the original overwrote sc)
         sc = (exp(sc / koef) + (use_pos_bias > 0 ? pos_bias[shift_pb] : 0)) / sum;
         if(isnan(sc))
            sc = 0;
        }
   score[shift_s] = sc;
   BarrierLoc
//--- out
   for(int d = 0; d < dimension; d++)
     {
      uint count = 0;
      if(k_id < ls)
         do
           {
            if((count * ls) < (kunits - k_id))
              {
               int sh_v = dimension * heads_kv * count * ls; // v uses the same per-key stride as k; the original factor 2 belongs to the packed kv layout of the masked kernel
               float sum = v[shift_kv + d + sh_v] * (count == 0 ? sc : score[shift_s + count * ls]);
               if(isnan(sum))
                  sum = 0;
               temp[k_id] = (count > 0 ?
temp[k_id] : 0) + sum; } count++; } while((count * ls + k_id) < kunits); BarrierLoc //--- count = min(ls, (uint)kunits); do { count = (count + 1) / 2; if(k_id < ls) temp[k_id] += (k_id < count && (k_id + count) < kunits ? temp[k_id + count] : 0); if(k_id + count < ls) temp[k_id + count] = 0; BarrierLoc } while(count > 1); //--- out[shift_q + d] = temp[0]; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHPosBiasAttentionInsideGradients(__global const float *q, __global float *q_g, __global const float *k, __global float *k_g, __global const float *v, __global float *v_g, __global const float *scores, __global const float *gradient, const int kunits, const int heads_kv) { //--- init const int q_id = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int shift_q = dimension * (q_id * heads + h) + d; const int shift_s = (q_id * heads + h) * kunits; const int shift_g = h * dimension + d; float koef = sqrt((float)dimension); if(koef < 1) koef = 1; //--- Calculating Value's gradients int step_score = kunits * heads; if(h < heads_kv) { //--- for(int v_id = q_id; v_id < kunits; v_id += qunits) { float grad = 0; for(int hq = h; hq < heads; hq += heads_kv) { int shift_score = hq * kunits + v_id; for(int g = 0; g < qunits; g++) grad += gradient[shift_g + dimension * (hq - h + g * heads)] * scores[shift_score + g * step_score]; } int shift_v = dimension * (heads_kv * v_id + h) + d; v_g[shift_v] = grad; } } //--- Calculating Query's gradients float grad = 0; float out_g = gradient[shift_g + q_id * dimension]; int shift_val = h_kv * dimension + d; int shift_key = h_kv * dimension + d; //--- for(int k_id = 0; k_id < kunits; k_id++) { float sc_g = 0; float sc = scores[shift_s + k_id]; if(sc == 0) continue; for(int v_id = 0; v_id < kunits; v_id++) sc_g += scores[shift_s + v_id] * out_g * v[shift_val + v_id * heads_kv * dimension] * ((float)(k_id == v_id) - sc); grad += sc_g * k[shift_key + k_id * heads_kv * dimension]; } q_g[shift_q] = grad / koef; //--- Calculating Key's gradients if(h < heads_kv) { //--- for(int k_id = q_id; k_id < kunits; k_id += qunits) { int shift_k = dimension * (heads_kv * k_id + h_kv) + d; grad = 0; for(int hq = h; hq < heads; hq += heads_kv) { int shift_score = hq * kunits + k_id; float val = v[shift_k]; for(int scr = 0; scr < qunits; scr++) { float sc_g = 0; int shift_sc = scr * kunits * heads; float sc = scores[shift_sc + k_id]; if(sc == 0) continue; for(int v_id = 0; v_id < kunits; v_id++) sc_g += scores[shift_sc + v_id] * gradient[shift_g + scr * dimension] * val * ((float)(k_id == v_id) - sc); grad += sc_g * q[shift_g + scr * heads * dimension]; } } k_g[shift_k] = grad / koef; } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DiversityLoss(__global const float *data, __global float *grad, const int dimension, const int activation, const int add ) { const size_t main = get_global_id(0); const size_t loc = get_local_id(1); const size_t total = get_local_size(0); const size_t total_loc = get_local_size(1); //--- __local float Temp[LOCAL_ARRAY_SIZE]; //--- float delts = 0; //--- for(int d = 0; d < dimension; d++) { const float value_main = IsNaNOrInf(data[main * 
dimension + d], 0); for(int i = loc; i < total; i += total_loc) { float value_slave = IsNaNOrInf(data[i * dimension + d], 0); float delta = value_main - value_slave; delts += IsNaNOrInf(delta * delta / total, 0); } } //--- float loss = exp(LocalSum(delts, 1, Temp)); float gr = 0; //--- for(int d = 0; d < dimension; d++) { const float value_main = IsNaNOrInf(data[main * dimension + d], 0); for(int i = loc; i < total; i += total_loc) { float value_slave = IsNaNOrInf(data[i * dimension + d], 0); gr += IsNaNOrInf(2 * loss * (value_main - value_slave) / total, 0); } //--- gr = LocalSum(gr, 1, Temp); if(loc == 0) { if(add > 0) grad[main * dimension + d] += Deactivation(gr, value_main, activation); else grad[main * dimension + d] = Deactivation(gr, value_main, activation); } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHRelativeAttentionOut(__global const float *q, ///<[in] Matrix of Querys __global const float *k, ///<[in] Matrix of Keys __global const float *v, ///<[in] Matrix of Values __global const float *bk, ///<[in] Matrix of Positional Bias Keys __global const float *bv, ///<[in] Matrix of Positional Bias Values __global const float *gc, ///<[in] Global content bias vector __global const float *gp, ///<[in] Global positional bias vector __global float *score, ///<[out] Matrix of Scores __global float *out, ///<[out] Matrix of attention const int dimension ///< Dimension of Key ) { //--- init const int q_id = get_global_id(0); const int k_id = get_local_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int kunits = get_local_size(1); const int heads = get_global_size(2); const int shift_q = dimension * (q_id * heads + h); const int shift_kv = dimension * (heads * k_id + h); const int shift_gc = dimension * h; const int shift_s = kunits * (q_id * heads + h) + k_id; const int shift_pb = q_id * kunits + k_id; const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE); float koef = sqrt((float)dimension); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- score float sc = 0; //--- for(int d = 0; d < dimension; d++) { float val_q = q[shift_q + d]; float val_k = k[shift_kv + d]; float val_bk = bk[shift_kv + d]; sc += val_q * val_k + val_q * val_bk + val_k * val_bk + gc[shift_q + d] * val_k + gp[shift_q + d] * val_bk; } sc = sc / koef; //--- max value //--- for(int cur_k = 0; cur_k < kunits; cur_k += ls) { if(k_id >= cur_k && k_id < (cur_k + ls)) { int shift_local = k_id % ls; temp[shift_local] = (cur_k == 0 ? sc : fmax(temp[shift_local], sc)); } BarrierLoc } uint count = min(ls, (uint)kunits); //--- do { count = (count + 1) / 2; if(k_id < ls) temp[k_id] = (k_id < count && (k_id + count) < kunits ? fmax(temp[k_id + count], temp[k_id]) : temp[k_id]); BarrierLoc } while(count > 1); sc = IsNaNOrInf(exp(fmax(sc - temp[0], -120)), 0); BarrierLoc //--- sum of exp //--- for(int cur_k = 0; cur_k < kunits; cur_k += ls) { if(k_id >= cur_k && k_id < (cur_k + ls)) { int shift_local = k_id % ls; temp[shift_local] = (cur_k == 0 ? 0 : temp[shift_local]) + sc; } BarrierLoc } //--- count = min(ls, (uint)kunits); do { count = (count + 1) / 2; if(k_id < ls) temp[k_id] += (k_id < count && (k_id + count) < kunits ? 
temp[k_id + count] : 0); if(k_id + count < ls) temp[k_id + count] = 0; BarrierLoc } while(count > 1); //--- score float sum = IsNaNOrInf(temp[0], 1); if(fabs(sum) <= 1.2e-7f) sum = 1; sc /= sum; score[shift_s] = sc; BarrierLoc //--- out //--- for(int d = 0; d < dimension; d++) { float val_v = v[shift_kv + d]; float val_bv = bv[shift_kv + d]; float val = IsNaNOrInf(sc * (val_v + val_bv), 0); //--- sum of value for(int cur_v = 0; cur_v < kunits; cur_v += ls) { if(k_id >= cur_v && k_id < (cur_v + ls)) { int shift_local = k_id % ls; temp[shift_local] = (cur_v == 0 ? 0 : temp[shift_local]) + val; } BarrierLoc } //--- count = min(ls, (uint)kunits); do { count = (count + 1) / 2; if(k_id < count && (k_id + count) < kunits) temp[k_id] += temp[k_id + count]; if(k_id + count < ls) temp[k_id + count] = 0; BarrierLoc } while(count > 1); //--- if(k_id == 0) out[shift_q + d] = IsNaNOrInf(temp[0], 0); BarrierLoc } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHRelativeAttentionInsideGradients(__global const float *q, __global float *q_g, __global const float *k, __global float *k_g, __global const float *v, __global float *v_g, __global const float *bk, __global float *bk_g, __global const float *bv, __global float *bv_g, __global const float *gc, __global float *gc_g, __global const float *gp, __global float *gp_g, __global const float *scores, __global const float *gradient, const int kunits ) { //--- init const int q_id = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int shift_q = dimension * (q_id * heads + h) + d; const int shift_s = (q_id * heads + h) * kunits; const int shift_g = h * dimension + d; float koef = sqrt((float)dimension); if(koef < 1) koef = 1; //--- Calculating Value's gradients int step_score = kunits * heads; //--- for(int v_id = q_id; v_id < kunits; v_id += qunits) { float grad = 0; int shift_score = h * kunits + v_id; for(int g = 0; g < qunits; g++) grad += gradient[shift_g + dimension * (g * heads)] * scores[shift_score + g * step_score]; int shift_v = dimension * (heads * v_id + h) + d; grad = IsNaNOrInf(grad, 0); v_g[shift_v] = grad; bv_g[shift_v] = grad; } //--- Calculating Query's gradients float grad_gc = 0; float grad_gp = 0; float out_g = gradient[shift_g + q_id * dimension]; int shift_val = h * dimension + d; int shift_key = h * dimension + d; //--- for(int k_id = 0; k_id < kunits; k_id++) { float sc_g = 0; float sc = scores[shift_s + k_id]; if(sc == 0) continue; for(int v_id = 0; v_id < kunits; v_id++) sc_g += scores[shift_s + v_id] * out_g * (v[shift_val + v_id * heads * dimension] + bv[shift_val + v_id * heads * dimension]) * ((float)(k_id == v_id) - sc); grad_gc += IsNaNOrInf(sc_g * k[shift_key + k_id * heads * dimension], 0); grad_gp += IsNaNOrInf(sc_g * bk[shift_key + k_id * heads * dimension], 0); } //--- q_g[shift_q] = (grad_gc + grad_gp) / koef; gc_g[shift_q] = grad_gc / koef; gp_g[shift_q] = grad_gp / koef; //--- Calculating Key's gradients //--- for(int k_id = q_id; k_id < kunits; k_id += qunits) { int shift_k = dimension * (heads * k_id + h) + d; float grad = 0; float grad_bk = 0; int shift_score = h * kunits + k_id; float val = (v[shift_k] + bv[shift_k]); for(int scr = 0; scr < qunits; scr++) { float sc_g = 0; int shift_sc = scr * kunits * heads; float sc = scores[shift_sc + k_id]; 
if(sc == 0) continue; for(int v_id = 0; v_id < kunits; v_id++) sc_g += scores[shift_sc + v_id] * gradient[shift_g + scr * dimension] * val * ((float)(k_id == v_id) - sc); float _q = q[shift_g + scr * heads * dimension]; grad += sc_g * (_q + bk[shift_k] + gc[shift_g + scr * heads * dimension]); grad_bk += sc_g * (_q + k[shift_k] + gp[shift_g + scr * heads * dimension]); } k_g[shift_k] = grad / koef; bk_g[shift_k] = grad_bk / koef; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcAlignmentGradient(__global const float *matrix_o1, __global const float *matrix_o2, __global float *matrix_g1, __global float *matrix_g2, const int activation, const int add) { int i = get_global_id(0); const float out1 = IsNaNOrInf(matrix_o1[i], 0); const float out2 = IsNaNOrInf(matrix_o2[i], 0); float grad1 = Deactivation(out2 - out1, out1, activation); float grad2 = Deactivation(out1 - out2, out2, activation); //--- if(add > 0) { matrix_g1[i] += grad1; matrix_g2[i] += grad2; } else { matrix_g1[i] = grad1; matrix_g2[i] = grad2; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeatureSmoothing(__global const float *feature, __global float *outputs, const int smoothing ) { const size_t pos = get_global_id(0); const size_t d = get_global_id(1); const size_t total = get_global_size(0); const size_t dimension = get_global_size(1); //--- const int shift_input = pos * dimension + d; const int shift_output = dimension * pos * smoothing + d; //--- float value = IsNaNOrInf(feature[shift_input], 0); outputs[shift_output] = value; //--- for(int s = 1; s <= smoothing; s++) { if((pos - s) >= 0) value += IsNaNOrInf(feature[shift_input - s * dimension], 0); if((pos + s) < total) value += IsNaNOrInf(feature[shift_input + s * dimension], 0); float factor = IsNaNOrInf(1.0f / (min((int)total, (int)(pos + s)) - max((int)(pos - s), 0) + 1), 0); float out = IsNaNOrInf(value * factor, 0); outputs[shift_output + s * dimension] = out; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeatureSmoothingGradient(__global float *feature_gr, __global const float *outputs_gr, const int smoothing ) { const size_t pos = get_global_id(0); const size_t d = get_global_id(1); const size_t total = get_global_size(0); const size_t dimension = get_global_size(1); //--- const int shift_input = pos * dimension + d; const int shift_output = dimension * pos * smoothing + d; const int step_output = dimension * smoothing; //--- float grad = IsNaNOrInf(outputs_gr[shift_output], 0); //--- for(int s = 1; s <= smoothing; s++) { int shift = shift_output + s * dimension; float factor = 1.0f / (min((int)total, (int)(pos + s)) - max((int)(pos - s), 0) + 1); //--- float value = IsNaNOrInf(outputs_gr[shift] * factor, 0); //--- if((pos - s) >= 0) grad += IsNaNOrInf(outputs_gr[shift - s * step_output] * factor, 0); //--- if((pos + s) < total) grad += IsNaNOrInf(outputs_gr[shift + s * step_output] * factor, 0); } //--- feature_gr[shift_input] = IsNaNOrInf(grad, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void BatchFeedForwardAddNoise(__global const float *inputs, __global float *options, __global const float 
*noise, __global float *output, const int batch, const int optimization, const int activation, const float alpha) { if(batch <= 1) return; int n = get_global_id(0); int shift = n * (optimization == 0 ? 7 : 9); //--- float inp = inputs[n]; float mean = (batch > 1 ? (IsNaNOrInf(options[shift], 0) * ((float)batch - 1.0f) + inp) / ((float)batch) : inp); float delt = inp - mean; float variance = IsNaNOrInf(options[shift + 1], 0) * ((float)batch - 1.0f) + delt * delt; if(batch > 0) variance /= (float)batch; float nx = (variance > 0 ? delt / sqrt(variance) : 0); float noisex = sqrt(alpha) * nx + sqrt(1 - alpha) * fabs(noise[n]) * sign(nx); //--- float gamma = IsNaNOrInf(options[shift + 3], 0); if(gamma == 0) { options[shift + 3] = 1; gamma = 1; } float betta = IsNaNOrInf(options[shift + 4], 0); //--- options[shift] = mean; options[shift + 1] = variance; options[shift + 2] = nx; output[n] = fActivation(gamma * noisex + betta, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void HyperProjection(__global const float *inputs, __global float *outputs ) { const size_t pos = get_global_id(0); const size_t d = get_local_id(1); const size_t total = get_global_size(0); const size_t dimension = get_local_size(1); //--- __local float temp[LOCAL_ARRAY_SIZE]; const int ls = min((int)dimension, (int)LOCAL_ARRAY_SIZE); //--- const int shift_in = pos * dimension + d; const int shift_out = pos * (dimension + 1) + d + 1; //--- float v = IsNaNOrInf(inputs[shift_in], 0); //--- float v2 = IsNaNOrInf(v * v, 0); //--- if(d < ls) temp[d] = v2; BarrierLoc //--- for(int i = ls; i < (int)dimension; i += ls) { if(d >= i && d < (i + ls)) temp[d % ls] += v2; BarrierLoc } //--- int count = min(ls, (int)dimension); //--- do { count = (count + 1) / 2; if(d < count) temp[d] += ((d + count) < dimension ? 
temp[d + count] : 0); if(d + count < dimension) temp[d + count] = 0; BarrierLoc } while(count > 1); //--- outputs[shift_out] = v; if(d == 0) { v = IsNaNOrInf(((float)pos) / ((float)total), 0); outputs[shift_out - 1] = sqrt(fmax(temp[0] - v * v, 1.2e-07f)); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void HyperProjectionGrad(__global const float *inputs, __global float *inputs_gr, __global const float *outputs_gr ) { const size_t pos = get_global_id(0); const size_t d = get_global_id(1); const size_t total = get_global_size(0); const size_t dimension = get_global_size(1); //--- const int shift_in = pos * dimension + d; const int shift_start_out = pos * (dimension + 1); const int shift_out = shift_start_out + d + 1; //--- float v = IsNaNOrInf(inputs[shift_in], 0); float grad = IsNaNOrInf(outputs_gr[shift_out], 0); //--- v = IsNaNOrInf(v * outputs_gr[shift_start_out], 0); //--- inputs_gr[shift_in] = v + grad; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void LogMap(__global const float *features, __global const float *centroids, __global const float *curvatures, __global float *outputs, __global float *product, __global float *distance, __global float *norma ) { //--- identify const size_t f = get_global_id(0); const size_t cent = get_global_id(1); const size_t d = get_local_id(2); const size_t total_f = get_global_size(0); const size_t total_cent = get_global_size(1); const size_t dimension = get_local_size(2); //--- create local array __local float temp[LOCAL_ARRAY_SIZE]; const int ls = min((int)dimension, (int)LOCAL_ARRAY_SIZE); //--- calc shifts const int shift_f = f * dimension + d; const int shift_out = (f * total_cent + cent) * dimension + d; const int shift_cent = cent * dimension + d; const int shift_temporal = f * total_cent + cent; //--- load inputs float feature = IsNaNOrInf(features[shift_f], 0); float centroid = IsNaNOrInf(centroids[shift_cent], 0); float curv = IsNaNOrInf(curvatures[cent], 1.2e-7f); //--- dot(features, centroids) float fc = IsNaNOrInf(feature * centroid, 0); //--- if(d < ls) temp[d] = (d > 0 ? fc : -fc); BarrierLoc //--- for(int i = ls; i < (int)dimension; i += ls) { if(d >= i && d < (i + ls)) temp[d % ls] += fc; BarrierLoc } //--- int count = min(ls, (int)dimension); //--- do { count = (count + 1) / 2; if(d < count) temp[d] += ((d + count) < dimension ? temp[d + count] : 0); if(d + count < dimension) temp[d + count] = 0; BarrierLoc } while(count > 1); float prod = IsNaNOrInf(temp[0], 0); product[shift_temporal] = prod; //--- project float u = IsNaNOrInf(feature + prod * centroid * curv, 0); //--- norm(u) float u2 = IsNaNOrInf(u * u, 0); //--- if(d < ls) temp[d] = (d > 0 ? u2 : -u2); BarrierLoc //--- for(int i = ls; i < (int)dimension; i += ls) { if(d >= i && d < (i + ls)) temp[d % ls] += u2; BarrierLoc } //--- count = min(ls, (int)dimension); //--- do { count = (count + 1) / 2; if(d < count) temp[d] += ((d + count) < dimension ? 
temp[d + count] : 0);
      if(d + count < dimension)
         temp[d + count] = 0;
      BarrierLoc
     }
   while(count > 1);
   float normu = IsNaNOrInf(temp[0], 0);
   if(normu <= 0)
      normu = 1.0e-7f;
   normu = sqrt(normu);
   norma[shift_temporal] = normu;
//--- distance features to centroid
   float theta = IsNaNOrInf(-prod * curv, 0);
   theta = fmax(theta, 1.0f + 1.2e-07f);
   float acosh_theta = acosh(theta);
   float dist = IsNaNOrInf(sqrt(clamp((acosh_theta * acosh_theta) / curv, 0.0f, 50.0f)), 0);
   distance[shift_temporal] = dist;
   float proj_u = IsNaNOrInf(dist * u / normu, 0);
//---
   if(d < ls)
      temp[d] = (d > 0 ? proj_u * centroid : 0);
   BarrierLoc
//---
   for(int i = ls; i < (int)dimension; i += ls)
     {
      if(d >= i && d < (i + ls))
         temp[d % ls] += proj_u * centroid;
      BarrierLoc
     }
//---
   count = min(ls, (int)dimension);
//---
   do
     {
      count = (count + 1) / 2;
      if(d < count)
         temp[d] += ((d + count) < dimension ? temp[d + count] : 0);
      if(d + count < dimension)
         temp[d + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//---
   if(d == 0)
     {
      proj_u = IsNaNOrInf(temp[0] / centroid, 0);
      proj_u = fmax(proj_u, 1.2e-7f); // floor the just-computed projection (the original floored u here, discarding temp[0] / centroid)
     }
//---
   outputs[shift_out] = proj_u;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void LogMapGrad(__global const float *features, __global float *features_gr,
                         __global const float *centroids, __global float *centroids_gr,
                         __global const float *curvatures, __global float *curvatures_gr,
                         __global const float *outputs, __global const float *outputs_gr,
                         __global const float *product,
                         __global const float *distance,
                         __global const float *norma
                        )
  {
//--- identify
   const size_t f = get_global_id(0);
   const size_t cent = get_global_id(1);
   const size_t d = get_local_id(2);
   const size_t total_f = get_global_size(0);
   const size_t total_cent = get_global_size(1);
   const size_t dimension = get_local_size(2);
//--- create local array
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((int)dimension, (int)LOCAL_ARRAY_SIZE);
//--- calc shifts
   const int shift_f = f * dimension + d;
   const int shift_out = (f * total_cent + cent) * dimension + d;
   const int shift_cent = cent * dimension + d;
   const int shift_temporal = f * total_cent + cent;
//--- load inputs
   float feature = features[shift_f];
   if(isinf(feature) || isnan(feature))
      feature = 0;
   float centroid = centroids[shift_cent];
   if(isinf(centroid) || isnan(centroid))
      centroid = 0;
   float centroid0 = (d > 0 ? centroids[shift_cent - d] : centroid);
   if(isinf(centroid0) || isnan(centroid0) || centroid0 == 0)
      centroid0 = 1.2e-7f;
   float curv = curvatures[cent];
   if(isinf(curv) || isnan(curv))
      curv = 1.2e-7f;
   float prod = product[shift_temporal];
   float dist = distance[shift_temporal];
   float normu = norma[shift_temporal];
   float u = feature + prod * centroid * curv;
   if(isinf(u) || isnan(u))
      u = 0;
//---
   float grad = outputs_gr[shift_out];
   if(isinf(grad) || isnan(grad))
      grad = 0;
   float grad0 = (d > 0 ? outputs_gr[shift_out - d] : grad);
   if(isinf(grad0) || isnan(grad0))
      grad0 = 0;
//---
   float feature_gr = 0;
   float centroid_gr = 0;
   float curv_gr = 0;
   float prod_gr = 0;
   float normu_gr = 0;
   float dist_gr = 0;
//---
   float proj_u_gr = (d > 0 ?
grad + grad0 / centroid0 * centroid : 0); if(d == 0) centroid_gr += outputs[shift_out] / centroid * grad; else centroid_gr += grad0 / centroid0 * outputs[shift_out]; if(isnan(centroid_gr) || isinf(centroid_gr)) centroid_gr = 0; //--- dist_gr = u / normu * proj_u_gr; float u_gr = dist / normu * proj_u_gr; normu_gr = dist * u / (normu * normu) * proj_u_gr; //--- if(d < ls) temp[d] = dist_gr; BarrierLoc //--- for(int id = ls; id < (int)dimension; id += ls) { if(d >= id && d < (id + ls)) temp[d % ls] += dist_gr; BarrierLoc } //--- int count = min(ls, (int)dimension); //--- do { count = (count + 1) / 2; if(d < count) temp[d] += ((d + count) < dimension ? temp[d + count] : 0); if(d + count < dimension) temp[d + count] = 0; BarrierLoc } while(count > 1); if(isinf(temp[0]) || isnan(temp[0])) temp[0] = 0; dist_gr = temp[0]; //--- if(d == 0) { float theta = -prod * curv; float theta_gr = 1.0f / sqrt(curv * (theta * theta - 1)) * dist_gr; if(isinf(theta_gr) || isnan(theta_gr)) theta_gr = 0; float acosh_theta = acosh(theta); curv_gr += -(acosh_theta * acosh_theta) / (2 * sqrt(curv * curv * curv)) * dist_gr; if(isinf(curv_gr) || isnan(curv_gr)) curv_gr = 0; temp[0] = -curv * theta_gr; if(isinf(temp[0]) || isnan(temp[0])) temp[0] = 0; curv_gr += -prod * theta_gr; if(isinf(curv_gr) || isnan(curv_gr)) curv_gr = 0; } BarrierLoc //--- prod_gr += temp[0]; BarrierLoc //--- if(d < ls) temp[d] = normu_gr; BarrierLoc //--- for(int id = ls; id < (int)dimension; id += ls) { if(d >= id && d < (id + ls)) temp[d % ls] += normu_gr; BarrierLoc } //--- count = min(ls, (int)dimension); //--- do { count = (count + 1) / 2; if(d < count) temp[d] += ((d + count) < dimension ? temp[d + count] : 0); if(d + count < dimension) temp[d + count] = 0; BarrierLoc } while(count > 1); normu_gr = temp[0]; if(isinf(normu_gr) || isnan(normu_gr)) normu_gr = 1.2e-7f; u_gr += u / normu * normu_gr; if(isnan(u_gr) || isinf(u_gr)) u_gr = 0; //--- feature_gr += u_gr; centroid_gr += prod * curv * u_gr; BarrierLoc //--- dot (u_gr * centroid) if(d < ls) temp[d] = u_gr * centroid; BarrierLoc //--- for(int id = ls; id < (int)dimension; id += ls) { if(d >= id && d < (id + ls)) temp[d % ls] += u_gr * centroid; BarrierLoc } //--- count = min(ls, (int)dimension); //--- do { count = (count + 1) / 2; if(d < count) temp[d] += ((d + count) < dimension ? temp[d + count] : 0); if(d + count < dimension) temp[d + count] = 0; BarrierLoc } while(count > 1); if(d == 0) { if(isinf(temp[0]) || isnan(temp[0])) temp[0] = 0; prod_gr += temp[0] * curv; if(isinf(prod_gr) || isnan(prod_gr)) prod_gr = 0; curv_gr += temp[0] * prod; if(isinf(curv_gr) || isnan(curv_gr)) curv_gr = 0; temp[0] = prod_gr; } BarrierLoc //--- prod_gr = temp[0]; feature_gr += prod_gr * centroid * (d > 0 ? 1 : -1); centroid_gr += prod_gr * feature * (d > 0 ? 
1 : -1);
//--- result
   features_gr[shift_f] += feature_gr;
   centroids_gr[shift_cent] += centroid_gr;
   if(f == 0 && d == 0)
      curvatures_gr[cent] += curv_gr; // accumulate the computed gradient (the original added the curvature value itself)
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CalcEpsilonWeights(__global const float *matrix_w,
                                 __global const float *matrix_g,
                                 __global const float *matrix_i,
                                 __global float *matrix_epsw,
                                 const float rho
                                )
  {
   const size_t inp = get_local_id(0);
   const size_t inputs = get_local_size(0) - 1;
   const size_t out = get_global_id(1);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((int)inputs, (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_w = out * (inputs + 1) + inp;
   const float w = IsNaNOrInf(matrix_w[shift_w], 0);
   float grad = fabs(w) * IsNaNOrInf(matrix_g[out], 0) * (inputs == inp ? 1.0f : IsNaNOrInf(matrix_i[inp], 0));
//---
   const int local_shift = inp % ls;
   for(int i = 0; i <= inputs; i += ls)
     {
      if(i <= inp && inp < (i + ls))
         temp[local_shift] = (i == 0 ? 0 : temp[local_shift]) + IsNaNOrInf(grad * grad, 0);
      BarrierLoc
     }
//---
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(inp < count)
         temp[inp] += ((inp + count) < inputs ? IsNaNOrInf(temp[inp + count], 0) : 0);
      if(inp + count < inputs)
         temp[inp + count] = 0;
      BarrierLoc
     }
   while(count > 1);
//---
   float norm = sqrt(IsNaNOrInf(temp[0], 0));
   float epsw = IsNaNOrInf(w * w * grad * rho / (norm + 1.2e-7f), w);
//---
   matrix_epsw[shift_w] = epsw;
  }
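//--- Added note: CalcEpsilonWeights/CalcEpsilonWeightsConv compute a perturbation
//--- in the spirit of adaptive sharpness-aware minimization: with the rescaled
//--- gradient g' = |w|*g*input, the output is eps_w = rho*w^2*g'/(||g'|| + eps).
//--- This is a reading of the code, not a reference implementation of a paper.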
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CalcEpsilonWeightsConv(__global const float *matrix_w,
                                     __global const float *matrix_g,
                                     __global const float *matrix_i,
                                     __global float *matrix_epsw,
                                     const int inputs,
                                     const float rho,
                                     const int step
                                    )
  {
//---
   const size_t inp = get_local_id(0);
   const size_t window_in = get_local_size(0) - 1;
   const size_t out = get_global_id(1);
   const size_t window_out = get_global_size(1);
   const size_t v = get_global_id(2);
   const size_t variables = get_global_size(2);
//---
   __local float temp[LOCAL_ARRAY_SIZE];
   const int ls = min((int)(window_in + 1), (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_w = (out + v * window_out) * (window_in + 1) + inp;
   const int total = (inputs - window_in + step - 1) / step;
   const int shift_out = v * total * window_out + out;
   const int shift_in = v * inputs + inp;
   const float w = IsNaNOrInf(matrix_w[shift_w], 0);
//---
   float grad = 0;
   for(int t = 0; t < total; t++)
     {
      if(inp != window_in && (inp + t * step) >= inputs)
         break;
      float g = IsNaNOrInf(matrix_g[t * window_out + shift_out], 0);
      float i = IsNaNOrInf(inp == window_in ? 1.0f : matrix_i[t * step + shift_in], 0);
      grad += IsNaNOrInf(g * i, 0);
     }
   grad *= fabs(w);
//---
   const int local_shift = inp % ls;
   for(int i = 0; i <= (int)window_in; i += ls) // loop over the local window, mirroring CalcEpsilonWeights (the original iterated up to the full inputs count)
     {
      if(i <= inp && inp < (i + ls))
         temp[local_shift] = (i == 0 ? 0 : temp[local_shift]) + IsNaNOrInf(grad * grad, 0);
      BarrierLoc
     }
//---
   int count = ls;
   do
     {
      count = (count + 1) / 2;
      if(inp < count && (inp + count) < inputs)
        {
         temp[inp] += IsNaNOrInf(temp[inp + count], 0);
         temp[inp + count] = 0;
        }
      BarrierLoc
     }
   while(count > 1);
//---
   float norm = sqrt(IsNaNOrInf(temp[0], 0));
   float epsw = IsNaNOrInf(w * w * grad * rho / (norm + 1.2e-7f), w);
//---
   matrix_epsw[shift_w] = epsw;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void PLRMultiAgents(__global const float *inputs,
                             __global float *outputs,
                             __global int *isttp,
                             const int transpose,
                             __global const float *min_step
                            )
  {
   const size_t i = get_global_id(0);
   const size_t lenth = get_global_size(0);
   const size_t v = get_global_id(1);
   const size_t variables = get_global_size(1);
   const size_t a = get_global_id(2);
   const size_t agents = get_global_size(2);
//--- constants
   const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i));
   const int step_in = ((bool)transpose ? variables : 1);
   const int shift_ag = a * lenth * variables;
//--- look for ttp
   float value = IsNaNOrInf(inputs[shift_in], 0);
   bool bttp = false;
   if(i == 0 || i == lenth - 1)
      bttp = true;
   else
     {
      float prev = value;
      int prev_pos = i;
      float max_v = value;
      int max_pos = i;   // positions are indices (the original declared them float)
      float min_v = value;
      int min_pos = i;
      while(fmax(fabs(prev - max_v), fabs(prev - min_v)) < min_step[a] && prev_pos > 0)
        {
         prev_pos--;
         prev = IsNaNOrInf(inputs[shift_in - (i - prev_pos) * step_in], 0);
         if(prev >= max_v && (prev - min_v) < min_step[a]) { max_v = prev; max_pos = prev_pos; }
         if(prev <= min_v && (max_v - prev) < min_step[a]) { min_v = prev; min_pos = prev_pos; }
        }
      //---
      float next = value;
      int next_pos = i;
      while(fmax(fabs(next - max_v), fabs(next - min_v)) < min_step[a] && next_pos < (lenth - 1))
        {
         next_pos++;
         next = IsNaNOrInf(inputs[shift_in + (next_pos - i) * step_in], 0);
         if(next > max_v && (next - min_v) < min_step[a]) { max_v = next; max_pos = next_pos; }
         if(next < min_v && (max_v - next) < min_step[a]) { min_v = next; min_pos = next_pos; }
        }
      //---
      if((value >= prev && value > next) || (value > prev && value == next) ||
         (value <= prev && value < next) || (value < prev && value == next))
         if(max_pos == i || min_pos == i)
            bttp = true;
     }
//---
   isttp[shift_in + shift_ag] = (int)bttp;
   outputs[shift_in + shift_ag] = 0;
   BarrierLoc
//--- calc position
   int pos = -1;
   int prev_in = 0;
   int prev_ttp = 0;
   if(bttp)
     {
      pos = 0;
      for(int p = 0; p < i; p++)
        {
         int current_in = ((bool)transpose ? (p * variables + v) : (v * lenth + p));
         if((bool)isttp[current_in + shift_ag]) { pos++; prev_ttp = p; prev_in = current_in; }
        }
     }
//--- calc tendency
   if(pos > 0 && pos < (lenth / 3))
     {
      float sum_x = 0; float sum_y = 0; float sum_xy = 0; float sum_xx = 0;
      int dist = i - prev_ttp;
      for(int p = 0; p < dist; p++)
        {
         float x = (float)(p);
         float y = IsNaNOrInf(inputs[prev_in + p * step_in], 0);
         sum_x += x; sum_y += y; sum_xy += x * y; sum_xx += x * x;
        }
      float slope = IsNaNOrInf((dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1), 0);
      float intercept = IsNaNOrInf((sum_y - slope * sum_x) / dist, 0);
      int shift_out = ((bool)transpose ?
((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3)) + shift_ag; outputs[shift_out] = slope; outputs[shift_out + step_in] = intercept; outputs[shift_out + 2 * step_in] = ((float)dist) / lenth; } else { if(pos == (lenth / 3)) { float sum_x = 0; float sum_y = 0; float sum_xy = 0; float sum_xx = 0; int dist = lenth - prev_ttp; //--- for(int p = 0; p < dist; p++) { float x = (float)(p); float y = IsNaNOrInf(inputs[prev_in + p * step_in], 0); sum_x += x; sum_y += y; sum_xy += x * y; sum_xx += x * x; } float slope = IsNaNOrInf((dist * sum_xy - sum_x * sum_y) / (dist > 1 ? (dist * sum_xx - sum_x * sum_x) : 1), 0); float intercept = IsNaNOrInf((sum_y - slope * sum_x) / dist, 0); int shift_out = ((bool)transpose ? ((pos - 1) * 3 * variables + v) : (v * lenth + (pos - 1) * 3)) + shift_ag; outputs[shift_out] = slope; outputs[shift_out + step_in] = intercept; outputs[shift_out + 2 * step_in] = IsNaNOrInf((float)dist / lenth, 0); } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void PLRMultiAgentsGradient(__global float *inputs_gr, __global const float *outputs, __global const float *outputs_gr, const int transpose, const int agents ) { const size_t i = get_global_id(0); const size_t lenth = get_global_size(0); const size_t v = get_global_id(1); const size_t variables = get_global_size(1); //--- constants const int shift_in = ((bool)transpose ? (i * variables + v) : (v * lenth + i)); const int step_in = ((bool)transpose ? variables : 1); const int shift_out = ((bool)transpose ? v : (v * lenth)); const int step_out = 3 * step_in; const int shift_ag = lenth * variables; //--- Sum gradient float grad = 0; //--- for(int a = 0; a < agents; a++) { //--- calc position int pos = -1; int prev_in = 0; int dist = 0; do { pos++; prev_in += dist; dist = (int)fmax(outputs[shift_out + pos * step_out + 2 * step_in + a * shift_ag] * lenth, 1); } while(!(prev_in <= i && (prev_in + dist) > i)); //--- calc constants float sum_x = 0; float sum_xx = 0; for(int p = 0; p < dist; p++) { float x = (float)(p); sum_x += x; sum_xx += x * x; } //--- get output gradient float grad_slope = IsNaNOrInf(outputs_gr[shift_out + pos * step_out + a * shift_ag], 0); float grad_intercept = IsNaNOrInf(outputs_gr[shift_out + pos * step_out + step_in + a * shift_ag], 0); //--- calc gradient grad_slope -= IsNaNOrInf(sum_x / dist * grad_intercept, 0); grad_slope /= fmax(IsNaNOrInf(dist * sum_xx - sum_x * sum_x, 0), 1); grad += IsNaNOrInf(grad_intercept / dist, 0); grad += IsNaNOrInf((dist * (i - prev_in) - sum_x) * grad_slope, 0); } grad = clamp(IsNaNOrInf(grad / agents, 0), -MAX_GRAD, MAX_GRAD); //--- save result inputs_gr[shift_in] = grad; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardMHConv(__global float *matrix_w, __global float *matrix_i, __global float *matrix_o, const int inputs, const int step, const int window_in, const int window_out, const int activation ) { const size_t i = get_global_id(0); const size_t h = get_global_id(1); const size_t v = get_global_id(2); const size_t total = get_global_size(0); const size_t heads = get_global_size(1); //--- const int window_in_h = (window_in + heads - 1) / heads; const int window_out_h = (window_out + heads - 1) / heads; const int shift_out = window_out * i + window_out_h * h; const int shift_in = step * i + window_in_h * h; //--- const int 
shift_var_in = v * inputs; const int shift_var_out = v * window_out * total; const int shift_var_w = v * window_out * (window_in_h + 1); const int shift_w_h = h * window_out_h * (window_in_h + 1); //--- float sum = 0; float4 inp, weight; int stop = (window_in_h <= (inputs - shift_in) ? window_in_h : (inputs - shift_in)); stop = min(stop, (int)(window_in - h * window_in_h)); //--- //--- for(int out = 0; (out < window_out_h && (window_out_h * h + out) < window_out); out++) { int shift = (window_in_h + 1) * out + shift_w_h; for(int k = 0; k <= stop; k += 4) { switch(stop - k) { case 0: inp = (float4)(1, 0, 0, 0); weight = (float4)(matrix_w[shift_var_w + shift + window_in_h], 0, 0, 0); break; case 1: inp = (float4)(matrix_i[shift_var_in + shift_in + k], 1, 0, 0); weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + window_in_h], 0, 0); break; case 2: inp = (float4)(matrix_i[shift_var_in + shift_in + k], matrix_i[shift_var_in + shift_in + k + 1], 1, 0); weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + k + 1], matrix_w[shift_var_w + shift + window_in_h], 0); break; case 3: inp = (float4)(matrix_i[shift_var_in + shift_in + k], matrix_i[shift_var_in + shift_in + k + 1], matrix_i[shift_var_in + shift_in + k + 2], 1); weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + k + 1], matrix_w[shift_var_w + shift + k + 2], matrix_w[shift_var_w + shift + window_in_h]); break; default: inp = (float4)(matrix_i[shift_var_in + shift_in + k], matrix_i[shift_var_in + shift_in + k + 1], matrix_i[shift_var_in + shift_in + k + 2], matrix_i[shift_var_in + shift_in + k + 3]); weight = (float4)(matrix_w[shift_var_w + shift + k], matrix_w[shift_var_w + shift + k + 1], matrix_w[shift_var_w + shift + k + 2], matrix_w[shift_var_w + shift + k + 3]); break; } sum += IsNaNOrInf(dot(inp, weight), 0); } sum = IsNaNOrInf(sum, 0); //--- matrix_o[shift_var_out + out + shift_out] = fActivation(sum, activation); } } //+------------------------------------------------------------------+ ///\ingroup neuron_conv_gr /// Kernel of the Convolution neuron to transfer gradient //+------------------------------------------------------------------+ __kernel void CalcHiddenGradientMHConv(__global float *matrix_w, __global float *matrix_g, __global float *matrix_o, __global float *matrix_ig, const int outputs, const int step, const int window_in, const int window_out, const int activation, const int shift_out, const int heads ) { const size_t i = get_global_id(0); const size_t inputs = get_global_size(0); const size_t v = get_global_id(1); //--- const int shift_var_in = v * inputs; const int shift_var_out = v * outputs; const int shift_var_w = v * window_out * (window_in + 1); const int window_in_h = (window_in + heads - 1) / heads; const int window_out_h = (window_out + heads - 1) / heads; //--- float sum = 0; float out = matrix_o[shift_var_in + i]; const int w_start = i % step; const int start = max((int)((i - window_in + step) / step), 0); int stop = (w_start + step - 1) / step; stop = min((int)((i + step - 1) / step + 1), stop) + start; if(stop > (outputs / window_out)) stop = outputs / window_out; //--- for(int k = start; k < stop; k++) { int head = (k % window_out) / window_out_h; for(int h = 0; h < window_out_h; h ++) { if((head * window_out_h + h) >= window_out) break; int shift_g = k * window_out + head * window_out_h + h; int shift_w = (stop - k - 1) * step + (i % step) / window_in_h + head * (window_in_h + 1) + h * (window_in_h + 1);
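// accumulate dL/dInput[i] over every (output position, filter) pair that used
// this input on the forward pass: grad_out * weight; the guard below skips
// offsets that fall outside the gradient or weight buffers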
if(shift_g >= outputs || shift_w >= (window_in_h + 1) * window_out) break; sum += IsNaNOrInf(matrix_g[shift_out + shift_g + shift_var_out] * matrix_w[shift_w + shift_var_w], 0); } } //--- matrix_ig[shift_var_in + i] = Deactivation(sum, out, activation); } //+------------------------------------------------------------------+ ///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating /// Weights Calculation kernel /// Describes the process of Adam optimization weights for the Convolution /// Neuron (#CNeuronConvOCL). //+------------------------------------------------------------------+ __kernel void UpdateWeightsMHConvAdam(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - ///< input window and n - output window __global const float *matrix_g, ///<[in] Tensor of gradients at current layer __global const float *matrix_i, ///<[in] Inputs tensor __global float *matrix_m, ///<[in,out] Matrix of first momentum __global float *matrix_v, ///<[in,out] Matrix of second momentum const int inputs, ///< Number of inputs const float l, ///< Learning rate const float b1, ///< First momentum multiplier const float b2, ///< Second momentum multiplier const int window_in, ///< Size of input window const int window_out, ///< Size of output window const int step, ///< Step size const int heads ) { const size_t i = get_global_id(0); //--- const int window_in_h = (window_in + heads - 1) / heads; const int v = i / ((window_in_h + 1) * window_out); const int shift = i % (window_in_h + 1); const int shift_out = i / (window_in_h + 1) - v * window_out; const int total = (inputs - window_in + step - 1) / step; //--- const int shift_var_in = v * inputs; const int shift_var_out = v * total * window_out; //--- float grad = 0; //--- for(int t = 0; t < total; t++) { if(shift != window_in_h && (shift + t * step) >= inputs) break; grad += IsNaNOrInf(matrix_g[t * window_out + shift_out + shift_var_out] * (shift == window_in_h ?
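// bias weight: its "input" is the constant 1 (branch below), while regular
// weights multiply the gradient by the stored input value. The Adam step that
// follows implements m_t = b1*m + (1-b1)*g, v_t = b2*v + (1-b2)*g^2 and then
// w += l * m_t / sqrt(v_t); the step is added, matching the error-gradient
// sign convention used throughout this file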
1 : matrix_i[shift + t * step + shift_var_in]), 0); } float mt = IsNaNOrInf(clamp(b1 * matrix_m[i] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0); float vt = IsNaNOrInf(clamp(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f); float weight = matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0); matrix_w[i] = weight; matrix_m[i] = mt; matrix_v[i] = vt; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MoreLessEqual(__global const float *input, __global float *output) { const size_t i = get_global_id(0); const float value = IsNaNOrInf(input[i], 0); float result = 0; if(fabs(value) > 1.2e-7f) { if(value > 0) result = 1; else result = -1; } output[i] = result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MultiScaleRelativeAttentionOut(__global const float *q, ///<[in] Matrix of Queries __global const float *k, ///<[in] Matrix of Keys __global const float *v, ///<[in] Matrix of Values __global const float *bk, ///<[in] Matrix of Positional Bias Keys __global const float *bv, ///<[in] Matrix of Positional Bias Values __global const float *gc, ///<[in] Global content bias vector __global const float *gp, ///<[in] Global positional bias vector __global float *score, ///<[out] Matrix of Scores __global float *out, ///<[out] Matrix of attention const int dimension ///< Dimension of Key ) { //--- init const uint q_id = get_global_id(0); const uint k_id = get_local_id(1); const uint h = get_global_id(2); const uint qunits = get_global_size(0); const uint kunits = get_local_size(1); const uint heads = get_global_size(2); const int shift_q = dimension * (q_id * heads + h); const int shift_kv = dimension * (heads * k_id + h); const int shift_gc = dimension * h; const int shift_s = kunits * (q_id * heads + h) + k_id; const int shift_pb = q_id * kunits + k_id; const uint ls = min((uint)get_local_size(1), (uint)LOCAL_ARRAY_SIZE); const uint window = max((uint)((kunits + h) / (h + 1)), min((uint)3, kunits)); float koef = sqrt((float)dimension); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- score float sc = 0; if(k_id < window) { //--- for(int d = 0; d < dimension; d++) { float val_q = q[shift_q + d]; float val_k = k[shift_kv + d]; float val_bk = bk[shift_kv + d]; sc += val_q * val_k + val_q * val_bk + val_k * val_bk + gc[shift_gc + d] * val_k + gp[shift_gc + d] * val_bk; } sc = sc / koef; } //--- max value //--- for(int cur_k = 0; cur_k < kunits; cur_k += ls) { if(k_id < window) if(k_id >= cur_k && k_id < (cur_k + ls)) { int shift_local = k_id % ls; temp[shift_local] = (cur_k == 0 ? sc : fmax(temp[shift_local], sc)); } BarrierLoc } uint count = min(ls, kunits); //--- //--- do { count = (count + 1) / 2; if(k_id < (window + 1) / 2) if(k_id < ls) temp[k_id] = (k_id < count && (k_id + count) < kunits ? fmax(temp[k_id + count], temp[k_id]) : temp[k_id]); BarrierLoc } while(count > 1); if(k_id < window) sc = IsNaNOrInf(exp(fmax(sc - temp[0], -120.0f)), 0); BarrierLoc //--- sum of exp //--- for(int cur_k = 0; cur_k < kunits; cur_k += ls) { if(k_id >= cur_k && k_id < (cur_k + ls)) { int shift_local = k_id % ls; temp[shift_local] = (cur_k == 0 ? 0 : temp[shift_local]) + sc; } BarrierLoc } //--- count = min(ls, (uint)kunits); do { count = (count + 1) / 2; if(k_id < count && k_id < (window + 1) / 2) temp[k_id] += ((k_id + count) < kunits ?
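// tree reduction: each pass folds the upper half of the scratch buffer into
// the lower half, leaving the sum of the exponents in temp[0]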
temp[k_id + count] : 0); if(k_id + count < ls) temp[k_id + count] = 0; BarrierLoc } while(count > 1); //--- score float sum = IsNaNOrInf(temp[0], 1); if(sum <= 1.2e-7f) sum = 1; sc /= sum; score[shift_s] = sc; BarrierLoc //--- out int shift_local = k_id % ls; //--- for(int d = 0; d < dimension; d++) { float val_v = v[shift_kv + d]; float val_bv = bv[shift_kv + d]; float val = IsNaNOrInf(sc * (val_v + val_bv), 0); //--- sum of value for(int cur_v = 0; cur_v < kunits; cur_v += ls) { if(k_id >= cur_v && k_id < (cur_v + ls)) temp[shift_local] = (cur_v == 0 ? 0 : temp[shift_local]) + val; BarrierLoc } //--- count = min(ls, (uint)kunits); do { count = (count + 1) / 2; if(k_id < count && (k_id + count) < kunits) temp[k_id] += temp[k_id + count]; if(k_id + count < ls) temp[k_id + count] = 0; BarrierLoc } while(count > 1); //--- if(k_id == 0) out[shift_q + d] = IsNaNOrInf(temp[0], 0); BarrierLoc } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SSM2D_FeedForward(__global const float *ah, __global const float *b_time, __global const float *b_var, __global const float *px_time, __global const float *px_var, __global const float *c_time, __global const float *c_var, __global const float *delta_time, __global const float *delta_var, __global float *hidden, __global float *y ) { const size_t n = get_local_id(0); const size_t d = get_global_id(1); const size_t n_total = get_local_size(0); const size_t d_total = get_global_size(1); //--- Hidden state //--- for(int h = 0; h < 2; h++) { float new_h = ah[(2 * n + h) * d_total + d] + ah[(2 * n_total + 2 * n + h) * d_total + d]; if(h == 0) new_h += b_time[n] * px_time[n * d_total + d]; else new_h += b_var[n] * px_var[n * d_total + d]; hidden[(h * n_total + n)*d_total + d] = IsNaNOrInf(new_h, 0); } BarrierLoc //--- Output uint shift_c = n; uint shift_h1 = d; uint shift_h2 = shift_h1 + n_total * d_total; float value = 0; //--- for(int i = 0; i < n_total; i++) { value += IsNaNOrInf(c_time[shift_c] * delta_time[shift_c] * hidden[shift_h1], 0); value += IsNaNOrInf(c_var[shift_c] * delta_var[shift_c] * hidden[shift_h2], 0); shift_c += n_total; shift_h1 += d_total; shift_h2 += d_total; } //--- y[n * d_total + d] = IsNaNOrInf(value, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SSM2D_CalcHiddenGradient(__global const float *ah, __global float *grad_ah, // Gradient with respect to ah __global const float *b_time, __global float *grad_b_time, // Gradient with respect to b_time __global const float *b_var, __global float *grad_b_var, // Gradient with respect to b_var __global const float *px_time, __global float *grad_px_time, // Gradient with respect to px_time __global const float *px_var, __global float *grad_px_var, // Gradient with respect to px_var __global const float *c_time, __global float *grad_c_time, // Gradient with respect to c_time __global const float *c_var, __global float *grad_c_var, // Gradient with respect to c_var __global const float *delta_time, __global float *grad_delta_time, // Gradient with respect to delta_time __global const float *delta_var, __global float *grad_delta_var, // Gradient with respect to delta_var __global const float *hidden, __global const float *grad_y // Gradient of loss with respect to y ) { //--- const size_t n = get_global_id(0); const size_t d = get_local_id(1); const size_t n_total = 
get_global_size(0); const size_t d_total = get_local_size(1); //--- Initialize indices for data access uint shift_c = n; uint shift_h1 = d; uint shift_h2 = shift_h1 + n_total * d_total; float grad_hidden1 = 0; float grad_hidden2 = 0; //--- Backpropagation: compute hidden gradients from y for(int i = 0; i < n_total; i++) { float grad = grad_y[i * d_total + d]; float c_t = c_time[shift_c]; float c_v = c_var[shift_c]; float delta_t = delta_time[shift_c]; float delta_v = delta_var[shift_c]; float h1 = hidden[shift_h1]; float h2 = hidden[shift_h2]; //-- Accumulate gradients for hidden states grad_hidden1 += IsNaNOrInf(grad * c_t * delta_t, 0); grad_hidden2 += IsNaNOrInf(grad * c_v * delta_v, 0); //--- Compute gradients for c_time, c_var, delta_time, delta_var grad_c_time[shift_c] += grad * delta_t * h1; grad_c_var[shift_c] += grad * delta_v * h2; grad_delta_time[shift_c] += grad * c_t * h1; grad_delta_var[shift_c] += grad * c_v * h2; //--- Update indices for the next element shift_c += n_total; shift_h1 += d_total; shift_h2 += d_total; } //--- Backpropagate through hidden -> ah, b_time, px_time for(int h = 0; h < 2; h++) { float grad_h = (h == 0) ? grad_hidden1 : grad_hidden2; //--- Store gradients in ah (considering its influence on two elements) grad_ah[(2 * n + h) * d_total + d] = grad_h; grad_ah[(2 * (n_total + n) + h) * d_total + d] = grad_h; } //--- Backpropagate through px_time and px_var (influenced by b_time and b_var) grad_px_time[n * d_total + d] = grad_hidden1 * b_time[n]; grad_px_var[n * d_total + d] = grad_hidden2 * b_var[n]; if(d == 0) { grad_b_time[n] = 0; grad_b_var[n] = 0; } BarrierLoc //--- Sum gradients over all d for b_time and b_var grad_b_time[n] += grad_hidden1 * px_time[n * d_total + d]; grad_b_var[n] += grad_hidden2 * px_var[n * d_total + d]; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void PScan(__global const float* A, __global const float* X, __global const float* H, __global float* X_out) { const size_t idx = get_local_id(0); const size_t dim = get_global_id(1); const size_t L = get_local_size(0); const size_t D = get_global_size(1); const int num_steps = (int)log2((float)L); //--- __local float local_A[1024]; __local float local_X[1024]; __local float local_H[1024]; //--- Load data to local memory int offset = dim + idx * D; local_A[idx] = A[offset]; local_X[idx] = X[offset]; local_H[idx] = H[offset]; BarrierLoc //--- Scan //--- for(int step = 0; step < num_steps; step++) { int halfT = L >> (step + 1); if(idx < halfT) { int base = idx * 2; local_X[base + 1] += local_A[base + 1] * local_X[base]; local_X[base + 1] *= local_H[base + 1]; local_A[base + 1] *= local_A[base]; } BarrierLoc } //--- Save result X_out[offset] = local_X[idx]; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void PScan_CalcHiddenGradient(__global const float* A, __global float* grad_A, __global const float* X, __global float* grad_X, __global const float* H, __global float* grad_H, __global const float* grad_X_out) { const size_t idx = get_local_id(0); const size_t dim = get_global_id(1); const size_t L = get_local_size(0); const size_t D = get_global_size(1); const int num_steps = (int)log2((float)L); //--- __local float local_A[1024]; __local float local_X[1024]; __local float local_H[1024]; __local float local_grad_X[1024]; __local float local_grad_A[1024]; __local 
float local_grad_H[1024]; //--- Load data to local memory int offset = idx * D + dim; local_A[idx] = A[offset]; local_X[idx] = X[offset]; local_H[idx] = H[offset]; local_grad_X[idx] = grad_X_out[offset]; local_grad_A[idx] = 0.0f; local_grad_H[idx] = 0.0f; BarrierLoc //--- Reverse Scan (Backward) //--- for(int step = num_steps - 1; step >= 0; step--) { int halfT = L >> (step + 1); if(idx < halfT) { int base = idx * 2; // Compute gradients float grad_next = local_grad_X[base + 1] * local_H[base + 1]; local_grad_H[base + 1] = local_grad_X[base + 1] * local_X[base]; local_grad_A[base + 1] = local_grad_X[base + 1] * local_X[base]; local_grad_X[base] += local_A[base + 1] * grad_next; } BarrierLoc } //--- Save gradients grad_A[offset] = local_grad_A[idx]; grad_X[offset] = local_grad_X[idx]; grad_H[offset] = local_grad_H[idx]; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DiagMatMult(__global const float *diag, __global const float *matr, __global float *result, int activation) { size_t row = get_global_id(0); size_t col = get_local_id(1); size_t var = get_global_id(2); size_t rows = get_global_size(0); size_t cols = get_local_size(1); //--- __local float local_diag[1]; if(col == 0) local_diag[0] = diag[row + var * rows]; BarrierLoc //--- int shift = (row + var * rows) * cols + col; //--- float res = local_diag[0] * matr[shift]; //--- result[shift] = fActivation(res, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DiagMatMultGrad(__global const float *diag, __global float *grad_diag, __global const float *matr, __global float *grad_matr, __global const float *grad_result) { size_t row = get_global_id(0); size_t col = get_local_id(1); size_t var = get_global_id(2); size_t rows = get_global_size(0); size_t cols = get_local_size(1); size_t vars = get_global_size(2); //--- __local float local_diag[LOCAL_ARRAY_SIZE]; if(col == 0) local_diag[0] = diag[row + var * rows]; BarrierLoc //--- int shift = (row + var * rows) * cols + col; //--- float grad = grad_result[shift]; float inp = matr[shift]; //--- grad_matr[shift] = IsNaNOrInf(local_diag[0] * grad, 0); BarrierLoc //--- int loc = col % LOCAL_ARRAY_SIZE; //--- for(int c = 0; c < cols; c += LOCAL_ARRAY_SIZE) { if(c <= col && (c + LOCAL_ARRAY_SIZE) > col) { if(c == 0) local_diag[loc] = IsNaNOrInf(grad * inp, 0); else local_diag[loc] += IsNaNOrInf(grad * inp, 0); } BarrierLoc } //--- int count = min(LOCAL_ARRAY_SIZE, (int)cols); int ls = count; //--- do { count = (count + 1) / 2; if((col + count) < ls) { local_diag[col] += local_diag[col + count]; local_diag[col + count] = 0; } BarrierLoc } while(count > 1); //--- if(col == 0) grad_diag[row + var * rows] = IsNaNOrInf(local_diag[0], 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void TopKgates(__global const float *inputs, __global const float *noises, __global float *gates, const uint k) { size_t idx = get_local_id(0); size_t var = get_global_id(1); size_t window = get_local_size(0); size_t vars = get_global_size(1); //--- const int shift_logit = var * 2 * window + idx; const int shift_std = shift_logit + window; const int shift_gate = var * window + idx; //--- float logit = IsNaNOrInf(inputs[shift_logit], MIN_VALUE); float noise =
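// noisy top-k gating: the logit is perturbed by noise scaled with
// SoftPlus(std) (fActivation(..., 3) below); a gate keeps its shifted logit
// only while at most k logits exceed it, otherwise it is pushed to MIN_VALUE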
IsNaNOrInf(noises[shift_gate], 0); if(noise != 0) { noise *= fActivation(inputs[shift_std], 3); logit += IsNaNOrInf(noise, 0); } //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- const uint ls = min((uint)window, (uint)LOCAL_ARRAY_SIZE); uint bigger = 0; float max_logit = logit; //--- Top K for(int i = 0; i < window; i += ls) { if(idx >= i && idx < (i + ls)) temp[idx % ls] = logit; BarrierLoc for(int i1 = 0; (i1 < min((int)ls, (int)(window - i)) && bigger <= k); i1++) { if(temp[i1] > logit) bigger++; if(temp[i1] > max_logit) max_logit = temp[i1]; } BarrierLoc } //--- if(bigger <= k) gates[shift_gate] = logit - max_logit; else gates[shift_gate] = MIN_VALUE; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void TopKgatesGrad(__global const float *inputs, __global float *grad_inputs, __global const float *noises, __global const float *gates, __global float *grad_gates) { size_t idx = get_global_id(0); size_t var = get_global_id(1); size_t window = get_global_size(0); size_t vars = get_global_size(1); //--- const int shift_logit = var * 2 * window + idx; const int shift_std = shift_logit + window; const int shift_gate = var * window + idx; //--- float grad = IsNaNOrInf(grad_gates[shift_gate], 0); grad_inputs[shift_logit] = grad; //--- float noise = IsNaNOrInf(noises[shift_gate], 0); if(noise == 0) { grad_inputs[shift_std] = 0; return; } //--- grad *= noise; grad_inputs[shift_std] = Deactivation(grad, fActivation(inputs[shift_std], 3), 3); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MaskByDistance(__global const float *buf_real, __global const float *buf_imag, __global float *mask, const int dimension ) { const size_t main = get_global_id(0); const size_t slave = get_local_id(1); const int total = (int)get_local_size(1); //--- __local float Temp[LOCAL_ARRAY_SIZE]; int ls = min((int)total, (int)LOCAL_ARRAY_SIZE); //--- const int shift_main = main * dimension; const int shift_slave = slave * dimension; const int shift_mask = main * total + slave; //--- calc distance float dist = 0; if(main != slave) { //--- for(int d = 0; d < dimension; d++) { float delta = ComplexAbs((float2)(buf_real[shift_main + d], buf_imag[shift_main + d])) - ComplexAbs((float2)(buf_real[shift_slave + d], buf_imag[shift_slave + d])); dist += delta * delta; } dist = sqrt(dist); } //--- Look Max //--- for(int i = 0; i < total; i += ls) { if(i <= slave && (i + ls) > slave) Temp[slave % ls] = fmax((i == 0 ? 
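// work-group scan: keep a running maximum of the pairwise distances so each
// distance can be normalized to [0, 1] before the mask is built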
0 : Temp[slave % ls]), IsNaNOrInf(dist, 0)); BarrierLoc } //--- int count = ls; do { count = (count + 1) / 2; if(slave < count && (slave + count) < ls) { if(Temp[slave] < Temp[slave + count]) Temp[slave] = Temp[slave + count]; Temp[slave + count] = 0; } BarrierLoc } while(count > 1); //--- Normalize if(Temp[0] > 0) dist /= Temp[0]; //--- result mask[shift_mask] = 1 - IsNaNOrInf(dist, 1); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MaskAttention(__global const float *q, ///<[in] Matrix of Querys __global const float *kv, ///<[in] Matrix of Keys __global float *scores, ///<[out] Matrix of Scores __global const float *masks, ///<[in] Mask Matrix __global float *out, ///<[out] Matrix of attention const int dimension, ///< Dimension of Key const int heads_kv ) { //--- init const int q_id = get_global_id(0); const int k = get_local_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int kunits = get_local_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int shift_q = dimension * (q_id * heads + h); const int shift_k = dimension * (2 * heads_kv * k + h_kv); const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv); const int shift_s = kunits * (q_id * heads + h) + k; const float mask = IsNaNOrInf(masks[q_id * kunits + k], 0); const uint ls = min((uint)kunits, (uint)LOCAL_ARRAY_SIZE); float koef = sqrt((float)dimension); if(koef < 1) koef = 1; __local float temp[LOCAL_ARRAY_SIZE]; //--- Score float score = 0; if(mask != 0) { for(int d = 0; d < dimension; d++) score += IsNaNOrInf(q[shift_q + d] * kv[shift_k + d], 0); score = IsNaNOrInf(exp(score / koef * mask), 0); } //--- sum of exp float sum = LocalSum(score, 1, temp); //--- score if(sum > 0) score /= sum; scores[shift_s] = score; //--- out for(int d = 0; d < dimension; d++) { float val = LocalSum(kv[shift_v + d] * score, 1, temp); if(k == 0) out[shift_q + d] = val; BarrierLoc } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MaskAttentionGradients(__global const float *q, __global float *q_g, __global const float *kv, __global float *kv_g, __global const float *scores, __global const float *gradient, const int kunits, const int heads_kv ) { //--- init const int q_id = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int shift_q = dimension * (q_id * heads + h) + d; const int shift_s = (q_id * heads + h) * kunits; const int shift_g = h * dimension + d; float koef = sqrt((float)dimension); if(koef < 1) koef = 1; //--- Calculating Value's gradients int step_score = kunits * heads; if(h < heads_kv) { //--- for(int v = q_id; v < kunits; v += qunits) { float grad = 0; for(int hq = h; hq < heads; hq += heads_kv) { int shift_score = hq * kunits + v; for(int g = 0; g < qunits; g++) grad += gradient[shift_g + dimension * (hq - h + g * heads)] * scores[shift_score + g * step_score]; } int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d; kv_g[shift_v] = grad; } } //--- Calculating Query's gradients float grad = 0; float out_g = IsNaNOrInf(gradient[shift_g + q_id * dimension], 0); int shift_val = (heads_kv + h_kv) * dimension + d; int shift_key = h_kv * 
dimension + d; //--- for(int k = 0; (k < kunits && out_g != 0); k++) { float sc_g = 0; float sc = scores[shift_s + k]; if(sc == 0) continue; for(int v = 0; v < kunits; v++) sc_g += scores[shift_s + v] * out_g * kv[shift_val + 2 * v * heads_kv * dimension] * ((float)(k == v) - sc); grad += sc_g * kv[shift_key + 2 * k * heads_kv * dimension]; } q_g[shift_q] = grad / koef; //--- Calculating Key's gradients if(h < heads_kv) { //--- for(int k = q_id; k < kunits; k += qunits) { int shift_k = dimension * (2 * heads_kv * k + h_kv) + d; grad = 0; for(int hq = h; hq < heads; hq++) { int shift_score = hq * kunits + k; float val = kv[shift_k + heads_kv * dimension]; for(int scr = 0; scr < qunits; scr++) { float sc_g = 0; int shift_sc = scr * kunits * heads; float sc = scores[shift_sc + k]; if(sc == 0) continue; for(int v = 0; v < kunits; v++) sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] * val * ((float)(k == v) - sc); grad += sc_g * q[dimension * (h + scr * heads) + d]; } } kv_g[shift_k] = grad / koef; } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardMultWinConv(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_o, __global const int *windows_in, const int inputs, const int windows_total, const int window_out, const int activation ) { const size_t i = get_global_id(0); const size_t v = get_global_id(1); const size_t outputs = get_global_size(0); //--- const int id = i % (window_out * windows_total); //--- int step = 0; int shift_in = 0; int shift_weight = 0; int window_in = 0; int window = 0; //--- for(int w = 0; w < windows_total; w++) { int win = windows_in[w]; step += win; if(((w + 1) * window_out) < id) { shift_in = step; window_in = win; shift_weight += (win + 1) * window_out; } } //--- int steps = (int)(i / (window_out * windows_total)); shift_in += steps * step + v * inputs; shift_weight += (id % window_out) * (window_in + 1); float sum = matrix_w[shift_weight + window_in]; float inp = 0.0f; //--- for(int w = 0; w < window_in; w++) if((shift_in + w) < inputs) { inp = IsNaNOrInf(matrix_i[shift_in + w], 0.0f); if(inp == 0.0f) continue; sum += IsNaNOrInf(inp * matrix_w[shift_weight + w], 0.0f); } //--- matrix_o[v * outputs + i] = fActivation(sum, activation); } //+------------------------------------------------------------------+ ///\ingroup neuron_conv_gr /// Kernel of the Convolution neuron to transfer gradient /// to previous layer (#CNeuronConvOCL) //+------------------------------------------------------------------+ __kernel void CalcHiddenGradientMultWinConv(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_ig, __global const float *matrix_og, __global const int *windows_in, const int outputs, const int windows_total, const int window_out, const int activation ) { const size_t i = get_global_id(0); const size_t v = get_global_id(1); const size_t inputs = get_global_size(0); //--- int step = 0; //--- for(int w = 0; w < windows_total; w++) step += windows_in[w]; //--- int steps = (int)(i / step); int id = i % step; int window = 0; int before = 0; int window_in = 0; //--- for(int w = 0; w < windows_total; w++) { window_in = windows_in[w]; if((before + window_in) >= id) break; window = w + 1; before += window_in; } //--- int shift_weight = (before + window) * window_out + id - before; int shift_out = (steps * windows_total + window) * window_out + v * outputs; float sum = 
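// dL/dInput = sum over filters of grad_out[f] * w[f, pos]; each filter stores
// (window_in + 1) weights, the last one being the bias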
0; //--- for(int w = 0; w < window_out; w++) { float grad = IsNaNOrInf(matrix_og[shift_out + w], 0.0f); if(grad == 0.0f) continue; sum += IsNaNOrInf(grad * matrix_w[shift_weight + w * (window_in + 1)], 0); } //--- matrix_ig[v * inputs + i] = Deactivation(sum, matrix_i[v * inputs + i], activation); } //+------------------------------------------------------------------+ ///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating /// Weights Calculation kernel /// Describes the process of Adam optimization weights for the Convolution /// Neuron (#CNeuronConvOCL). //+------------------------------------------------------------------+ __kernel void UpdateWeightsMultWinConvAdam(__global float *matrix_w, __global const float *matrix_og, __global const float *matrix_i, __global float *matrix_m, __global float *matrix_v, __global const int *windows_in, const int windows_total, const int window_out, const int inputs, const int outputs, const float l, const float b1, const float b2 ) { const size_t i = get_global_id(0); // weight shift const size_t v = get_local_id(1); // variable const size_t variables = get_local_size(1); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- int step_out = window_out * windows_total; //--- int step_in = 0; int shift_in = 0; int shift_out = 0; int window = 0; int number_w = 0; //--- for(int w = 0; w < windows_total; w++) { int win = windows_in[w]; if((step_in + w)*window_out <= i && (step_in + win + w + 1)*window_out > i) { shift_in = step_in; shift_out = (step_in + w + 1) * window_out; window = win; number_w = w; } step_in += win; } bool bias = ((i - (shift_in + number_w) * window_out) % (window + 1) == window); int t = (i - (shift_in + number_w) * window_out) / (window + 1); shift_out += t + v * outputs; shift_in += (i - (shift_in + number_w) * window_out) % (window + 1) + v * inputs; //--- float grad = 0; int total = (inputs + step_in - 1) / step_in; //--- for(int t = 0; t < total; t++) { int sh_out = t * step_out + shift_out; if(bias && sh_out < outputs) { grad += IsNaNOrInf(matrix_og[sh_out], 0); continue; } //--- int sh_in = t * step_in + shift_in; if(sh_in >= inputs) break; float grad_out = IsNaNOrInf(matrix_og[sh_out], 0.0f); if(grad_out == 0.0f) continue; float inp = IsNaNOrInf(matrix_i[sh_in], 0.0f); if(inp == 0.0f) continue; grad += IsNaNOrInf(grad_out * inp, 0); } //--- sum grad = LocalSum(grad, 1, temp); //--- if(v == 0) { float mt = IsNaNOrInf(clamp(b1 * matrix_m[i] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0); float vt = IsNaNOrInf(clamp(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f); float weight = matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0); matrix_w[i] = weight; matrix_m[i] = mt; matrix_v[i] = vt; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MaskAttentionComplex(__global const float2* __attribute__((aligned(8)))q, ///<[in] Matrix of Querys __global const float2* __attribute__((aligned(8)))kv, ///<[in] Matrix of Keys __global float *scores, ///<[out] Matrix of Scores __global const float *masks, ///<[in] Mask Matrix __global float2* __attribute__((aligned(8)))out, ///<[out] Matrix of attention const int dimension, ///< Dimension of Key const int heads_kv ) { //--- init const int q_id = get_global_id(0); const int k = get_local_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int kunits = get_local_size(1); const int heads = get_global_size(2); //--- const int 
h_kv = h % heads_kv; const int shift_q = dimension * (q_id * heads + h); const int shift_k = dimension * (2 * heads_kv * k + h_kv); const int shift_v = dimension * (2 * heads_kv * k + heads_kv + h_kv); const int shift_s = kunits * (q_id * heads + h) + k; const float mask = IsNaNOrInf(masks[shift_s], 0); const uint ls = min((uint)kunits, (uint)LOCAL_ARRAY_SIZE); float2 koef = (float2)(fmax((float)sqrt((float)dimension), (float)1), 0); __local float2 temp[LOCAL_ARRAY_SIZE]; //--- Score float score = 0; float2 score2 = (float2)0; if(fabs(mask) >= 0.01f) { for(int d = 0; d < dimension; d++) score2 += IsNaNOrInf2(ComplexMul(q[shift_q + d], kv[shift_k + d]), (float2)0); score = IsNaNOrInf(ComplexAbs(ComplexExp(ComplexDiv(score2, koef))) * mask, 0); } //--- sum of exp //--- for(int i = 0; i < kunits; i += ls) { if(k >= i && k < (i + ls)) temp[k % ls].x = (i == 0 ? 0 : temp[k % ls].x) + score; BarrierLoc } //--- uint count = ls; //--- do { count = (count + 1) / 2; if(k < ls) temp[k].x += (k < count && (k + count) < kunits ? temp[k + count].x : 0); if(k + count < ls) temp[k + count].x = 0; BarrierLoc } while(count > 1); //--- score if(temp[0].x > 0) score = score / temp[0].x; scores[shift_s] = score; //--- out //--- for(int d = 0; d < dimension; d++) { float2 val = (score > 0 ? ComplexMul(kv[shift_v + d], (float2)(score, 0)) : (float2)0); //--- for(int i = 0; i < kunits; i += ls) { if(k >= i && k < (i + ls)) temp[k % ls] = (i == 0 ? (float2)0 : temp[k % ls]) + val; BarrierLoc } //--- uint count = ls; //--- do { count = (count + 1) / 2; if(k < ls) temp[k] += (k < count && (k + count) < kunits ? temp[k + count] : (float2)0); if((k + count) < ls) temp[k + count] = (float2)0; BarrierLoc } while(count > 1); //--- if(k == 0) out[shift_q + d] = temp[0]; BarrierLoc } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MaskAttentionGradientsComplex(__global const float2* __attribute__((aligned(8)))q, __global float2* __attribute__((aligned(8)))q_g, __global const float2* __attribute__((aligned(8)))kv, __global float2* __attribute__((aligned(8)))kv_g, __global const float *scores, __global const float *mask, __global float *mask_g, __global const float2* __attribute__((aligned(8)))gradient, const int kunits, const int heads_kv ) { //--- init const int q_id = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int shift_q = dimension * (q_id * heads + h) + d; const int shift_s = (q_id * heads + h) * kunits; const int shift_g = h * dimension + d; float2 koef = (float2)(fmax(sqrt((float)dimension), (float)1), 0); //--- Calculating Value's gradients int step_score = kunits * heads; if(h < heads_kv) { //--- for(int v = q_id; v < kunits; v += qunits) { float2 grad = (float2)0; for(int hq = h; hq < heads; hq += heads_kv) { int shift_score = hq * kunits + v; for(int g = 0; g < qunits; g++) { float sc = IsNaNOrInf(scores[shift_score + g * step_score], 0); if(sc > 0) grad += ComplexMul(gradient[shift_g + dimension * (hq - h + g * heads)], (float2)(sc, 0)); } } int shift_v = dimension * (2 * heads_kv * v + heads_kv + h) + d; kv_g[shift_v] = grad; } } //--- Calculating Query's gradients float2 grad = 0; float2 out_g = IsNaNOrInf2(gradient[shift_g + q_id * dimension], (float2)0); int shift_val = (heads_kv + h_kv) *
dimension + d; int shift_key = h_kv * dimension + d; //--- for(int k = 0; (k < kunits && ComplexAbs(out_g) != 0); k++) { float2 sc_g = 0; float2 sc = (float2)(scores[shift_s + k], 0); for(int v = 0; v < kunits; v++) sc_g += IsNaNOrInf2(ComplexMul( ComplexMul((float2)(scores[shift_s + v], 0), out_g * kv[shift_val + 2 * v * heads_kv * dimension]), ((float2)(k == v, 0) - sc)), (float2)0); float m = mask[shift_s + k]; mask_g[shift_s + k] = IsNaNOrInf(sc.x / m * sc_g.x + sc.y / m * sc_g.y, 0); grad += IsNaNOrInf2(ComplexMul(sc_g, kv[shift_key + 2 * k * heads_kv * dimension]), (float2)0); } q_g[shift_q] = IsNaNOrInf2(ComplexDiv(grad, koef), (float2)0); //--- Calculating Key's gradients if(h < heads_kv) { //--- for(int k = q_id; k < kunits; k += qunits) { int shift_k = dimension * (2 * heads_kv * k + h_kv) + d; grad = 0; for(int hq = h; hq < heads; hq++) { int shift_score = hq * kunits + k; float2 val = IsNaNOrInf2(kv[shift_k + heads_kv * dimension], (float2)0); for(int scr = 0; scr < qunits; scr++) { float2 sc_g = (float2)0; int shift_sc = scr * kunits * heads; float2 sc = (float2)(IsNaNOrInf(scores[shift_sc + k], 0), 0); if(ComplexAbs(sc) == 0) continue; for(int v = 0; v < kunits; v++) sc_g += IsNaNOrInf2( ComplexMul( ComplexMul((float2)(scores[shift_sc + v], 0), gradient[shift_g + scr * dimension]), ComplexMul(val, ((float2)(k == v, 0) - sc))), (float2)0); grad += IsNaNOrInf2(ComplexMul(sc_g, q[(h + scr * heads) * dimension + d]), (float2)0); } } kv_g[shift_k] = IsNaNOrInf2(ComplexDiv(grad, koef), (float2)0); } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CSLSTM_FeedForward(__global const float4* __attribute__((aligned(16))) concatenated, __global float *memory, __global float *output) { uint id = (uint)get_global_id(0); uint total = (uint)get_global_size(0); // hidden size uint idv = (uint)get_global_id(1); uint total_v = (uint)get_global_size(1); // variables //--- uint shift = id + total * idv; float4 concat = concatenated[shift]; //--- float fg_s = fActivation(concat.s0, ActFunc_SIGMOID); float fg = 1 - fActivation(1 - 1 / (fg_s * fg_s), ActFunc_TANH); float ig = fActivation(fActivation(concat.s1, ActFunc_SIGMOID), ActFunc_TANH); float nc = fActivation(concat.s2, ActFunc_TANH); float og = fActivation(concat.s3, ActFunc_SIGMOID); float mem = IsNaNOrInf(memory[shift] * fg + ig * nc, 0); float out = IsNaNOrInf(og * fActivation(mem, ActFunc_TANH), 0); //--- memory[shift] = mem; output[shift] = out; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CSLSTM_CalcHiddenGradient(__global const float4* __attribute__((aligned(16))) concatenated, // Input from forward pass (W*x + U*h + b) __global float4* __attribute__((aligned(16))) grad_concat, // Output: gradients w.r.t. 
gate pre-activations __global const float* memory, // Updated memory (after forward pass) __global const float* grad_output // dL/dOutput from the next layer ) { uint id = get_global_id(0); // Index within sequences uint total = get_global_size(0); // Total size of sequences uint idv = get_global_id(1); // Index over independent univariate sequences (e.g., features or channels) in a multivariate time series uint shift = id + total * idv; // Flattened index //--- float4 concat = concatenated[shift]; // Pre-activation values for all 4 gates // --- Forward reconstruction of gates --- float fg_s = fActivation(concat.s0, ActFunc_SIGMOID); float fg = 1.0f - fActivation(1.0f - 1.0f / (fg_s * fg_s), ActFunc_TANH); // Forget gate (ft) float ig_s = fActivation(concat.s1, ActFunc_SIGMOID); float ig = fActivation(ig_s, ActFunc_TANH); // Input gate (it) float nc = fActivation(concat.s2, ActFunc_TANH); // Candidate (ct~) float og = fActivation(concat.s3, ActFunc_SIGMOID); // Output gate (ot) float mem = memory[shift]; // New memory state (ct) float mem_t = fActivation(mem, ActFunc_TANH); // tanh(ct) // --- Reconstruct previous memory state (t-1) --- float prev_mem = IsNaNOrInf((mem - ig * nc) / fg, 0); // --- Gradients computation --- float out_g = grad_output[shift]; float og_g = Deactivation(out_g * mem_t, og, ActFunc_SIGMOID); float mem_g = Deactivation(out_g * og, mem_t, ActFunc_TANH); float nc_g = Deactivation(mem_g * ig, nc, ActFunc_TANH); float ig_g = Deactivation(Deactivation(mem_g * nc, ig, ActFunc_TANH), ig_s, ActFunc_SIGMOID); // ∂L/∂fg = ∂L/∂ct * mem_(t-1) float fg_g = mem_g * prev_mem; // Derivative of the complex forget gate: // f(z) = 1 - tanh(1 - 1 / σ(z)^2) float fg_s_g = 2 / (fg_s * fg_s * fg_s) * Deactivation(-fg_g, fg, ActFunc_TANH); fg_g = Deactivation(fg_s_g, fg_s, ActFunc_SIGMOID); // --- Write back gradients --- grad_concat[shift] = (float4)(fg_g, ig_g, nc_g, og_g); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ProbAttentionQeuryImp(__global const float* querys, __global const float2* __attribute__((aligned(8))) keys_values, __global const float* index_keys, __global float* querys_imp, const int dimension ) { const size_t id_q = get_global_id(0); const size_t total_q = get_global_size(0); const size_t ind_k = get_local_id(1); const size_t total_ind = get_local_size(1); const size_t id_h = get_global_id(2); const size_t total_h = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE][2]; const int ls = min((int)total_ind, (int)LOCAL_ARRAY_SIZE); //--- const int shift_q = dimension * (id_q * total_h + id_h); const int id_k = index_keys[ind_k * total_h + id_h]; const int shift_k = dimension * (id_k * total_h + id_h); //--- float sum = 0; //--- for(int d = 0; d < dimension; d++) sum += IsNaNOrInf(querys[shift_q + d] * keys_values[shift_k + d].s0, 0); //--- int id_t = ind_k % ls; //--- for(int i = 0; i < total_ind; i += ls) { if(i <= ind_k && (i + ls) > ind_k) { temp[id_t][0] = IsNaNOrInf((i == 0 ? 0 : temp[id_t][0]) + sum, 0); temp[id_t][1] = (i == 0 ?
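// temp[.][0] accumulates the running sum of q*k scores, temp[.][1] their
// running maximum; the importance written below is max(score) - mean(score)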
IsNaNOrInf(sum, MIN_VALUE) : fmax(temp[id_t][1], IsNaNOrInf(sum, MIN_VALUE))); } BarrierLoc } int count = ls; //--- do { count = (count + 1) / 2; if(ind_k < count && (ind_k + count) < ls) { temp[ind_k][0] += temp[ind_k + count][0]; temp[ind_k + count][0] = 0; temp[ind_k][1] = fmax(temp[ind_k + count][1], temp[ind_k][1]); } BarrierLoc } while(count > 1); if(ind_k == 0) querys_imp[id_q * total_h + id_h] = IsNaNOrInf(temp[0][1] - temp[0][0] / total_ind, MIN_VALUE); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void TopKImportanceToIndex(__global const float* importance, __global float* indexes, const int top_k ) { const size_t id_q = get_global_id(0); const size_t total_q = get_global_size(0); const size_t id_h = get_global_id(1); const size_t total_h = get_global_size(1); //--- float imp = importance[id_q * total_h + id_h]; int pos = 0; //--- for(int i = 0; i < total_q; i++) { if(i == id_q) continue; float val = importance[i * total_h + id_h]; if(val > imp || (i < id_q && val >= imp)) pos++; if(pos >= top_k) break; } //--- if(pos < top_k) indexes[pos * total_h + id_h] = (float)id_q; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void QIndexAttention(__global const float *q, ///<[in] Matrix of Queries __global const float2* kv, ///<[in] Matrix of Keys __global float *scores, ///<[out] Matrix of Scores __global const float *indexes, ///<[in] Query Indexes __global float *out, ///<[out] Matrix of attention const int dimension, ///< Dimension of Key const int heads_kv ) { //--- init const int ind_q = get_global_id(0); const int k = get_local_id(1); const int h = get_global_id(2); const int total_q = get_global_size(0); const int total_k = get_local_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int q_id = (int)(indexes[ind_q * heads + h] + 0.001f); const int shift_q = dimension * (q_id * heads + h); const int shift_kv = dimension * (heads_kv * k + h_kv); const int shift_s = total_k * (ind_q * heads + h) + k; //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- Score float score = 0; if(q_id >= 0) { //--- for(int d = 0; d < dimension; d++) score += IsNaNOrInf(q[shift_q + d] * kv[shift_kv + d].s0, 0); } else score = MIN_VALUE; //--- norm score score = IsNaNOrInf(exp(score - LocalMax(score, 1, temp)), 0); score = IsNaNOrInf(score / LocalSum(score, 1, temp), 0); scores[shift_s] = score; BarrierLoc //--- out for(int d = 0; d < dimension; d++) { float val = LocalSum(kv[shift_kv + d].s1 * score, 1, temp); if(k == 0) out[dimension * (ind_q * heads + h) + d] = val; BarrierLoc } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void QIndexAttentionGradients(__global const float* q, __global float* q_g, __global const float2* kv, __global float2* kv_g, __global const float* indexes, __global const float* scores, __global const float* gradient, const int kunits, const int heads_kv ) { //--- init const int ind_q = get_global_id(0); const int d = get_global_id(1); const int h = get_global_id(2); const int qunits = get_global_size(0); const int dimension = get_global_size(1); const int heads = get_global_size(2); const int h_kv = h % heads_kv; const int q_id = (int)(indexes[ind_q * heads + h] + 0.001f); const int shift_q = dimension * (q_id *
heads + h) + d; const int shift_s = (ind_q * heads + h) * kunits; const int shift_g = h * dimension + d; //--- Calculating Value's gradients int step_score = kunits * heads; if(h < heads_kv) { //--- for(int v = ind_q; v < kunits; v += qunits) { float grad = 0; for(int hq = h; hq < heads; hq += heads_kv) { int shift_score = hq * kunits + v; for(int g = 0; g < qunits; g++) grad += IsNaNOrInf(gradient[shift_g + dimension * (hq - h + g * heads)], 0) * scores[shift_score + g * step_score]; } int shift_v = dimension * (heads_kv * v + h) + d; kv_g[shift_v].s1 = IsNaNOrInf(grad, 0); } } //--- Calculating Query's gradients float grad = 0; float out_g = IsNaNOrInf(gradient[shift_g + ind_q * dimension], 0); int shift_kv = h_kv * dimension + d; //--- for(int k = 0; (k < kunits && out_g != 0); k++) { float sc_g = 0; float sc = scores[shift_s + k]; if(sc == 0) continue; for(int v = 0; v < kunits; v++) sc_g += scores[shift_s + v] * out_g * kv[shift_kv + v * heads_kv * dimension].s1 * ((float)(k == v) - sc); grad += sc_g * kv[shift_kv + k * heads_kv * dimension].s0; } q_g[shift_q] = grad; //--- Calculating Key's gradients if(h < heads_kv) { //--- for(int k = ind_q; k < kunits; k += qunits) { int shift_k = dimension * (heads_kv * k + h_kv) + d; grad = 0; for(int hq = h; hq < heads; hq++) { int shift_score = hq * kunits + k; float val = kv[shift_k + heads_kv * dimension].s1; for(int scr = 0; scr < qunits; scr++) { float sc_g = 0; int shift_sc = scr * kunits * heads; float sc = scores[shift_sc + k]; if(sc == 0) continue; for(int v = 0; v < kunits; v++) sc_g += scores[shift_sc + v] * gradient[shift_g + scr * dimension] * val * ((float)(k == v) - sc); grad += IsNaNOrInf(sc_g * q[(hq + (int)(indexes[scr * heads + hq] + 0.001f) * heads) * dimension + d], 0); } } kv_g[shift_k].s0 = IsNaNOrInf(grad, 0); } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void TSPositonEncoder(__global const float2* __attribute__((aligned(8))) data, __global const float* time, __global float2* __attribute__((aligned(8))) output, __global const float* period ) { const int id = get_global_id(0); const int freq = get_global_id(1); const int p = get_global_id(2); const int total = get_global_size(0); const int freqs = get_global_size(1); const int periods = get_global_size(2); //--- const int shift = id * freqs + freq; const float2 d = data[shift * periods + p]; const float t = time[id] / period[p]; float val = M_PI_F * t * pow(2.0f, freq + 1); //--- output[shift * periods + p] = (float2)(d.s0 + sin(val), d.s1 + cos(val)); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardMultWinConvWPad(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_o, __global const int *windows_in, const int inputs, const int step, const int window_out, const int activation ) { const size_t id = get_global_id(0); const size_t id_w = get_global_id(1); const size_t v = get_global_id(2); const size_t outputs = get_global_size(0); const size_t windows_total = get_global_size(1); //--- int window_in = windows_in[id_w]; int mid_win = window_in / 2; int shift_in = id * step - mid_win; int shift_in_var = v * inputs; int shift_weight = 0; //--- for(int w = 0; w < id_w; w++) shift_weight += (windows_in[w] + 1) * window_out; //--- for(int w_out = 0; w_out < window_out; w_out++) { float sum = 
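// the accumulator starts from the bias weight, stored after the window_in
// regular weights of this filter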
matrix_w[shift_weight + window_in]; //--- for(int w = 0; w < window_in; w++) if((shift_in + w) >= 0 && (shift_in + w) < inputs) sum += IsNaNOrInf(matrix_i[shift_in_var + shift_in + w] * matrix_w[shift_weight + w], 0); //--- int shift_out = (v * outputs + id) * window_out + w_out; matrix_o[shift_out] = fActivation(sum, activation); shift_weight += window_in + 1; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcHiddenGradientMultWinConvWPad(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_ig, __global const float *matrix_og, __global const int *windows_in, const int outputs, const int step, const int window_out, const int filters, const int activation ) { const size_t id_x = get_global_id(0); const size_t loc = get_local_id(1); const size_t v = get_global_id(2); const size_t inputs = get_global_size(0); const size_t size_loc = get_local_size(1); const size_t windows_total = filters / window_out; //--- __local float temp[LOCAL_ARRAY_SIZE]; const uint ls = min((uint)size_loc, (uint)LOCAL_ARRAY_SIZE); //--- float grad = 0; for(int id_loc = loc; id_loc < filters; id_loc += size_loc) { const size_t id_win = id_loc / window_out; const size_t id_f = id_loc % window_out; int window_in = windows_in[id_win]; int shift_weight = id_f * (window_in + 1); for(int w = 0; w < id_win; w++) shift_weight += (windows_in[w] + 1) * window_out; //--- int shift_out = max((int)((id_x - window_in) / step), 0); //--- int mid_win = (window_in + 1) / 2; for(int out = shift_out; out < outputs; out++) { int shift_in = out * step - mid_win; if(shift_in > id_x) break; int shift_w = id_x - shift_in; if(shift_w >= window_in) continue; int shift_g = ((v * outputs + out) * windows_total + id_win) * window_out + id_f; grad += IsNaNOrInf(matrix_w[shift_w + shift_weight] * matrix_og[shift_g], 0); } } //--- grad = LocalSum(grad, 1, temp); //--- if(loc == 0) matrix_ig[v * inputs + id_x] = Deactivation(grad, matrix_i[v * inputs + id_x], activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void UpdateWeightsMultWinConvAdamWPad(__global float *matrix_w, __global const float *matrix_og, __global const float *matrix_i, __global float *matrix_m, __global float *matrix_v, __global const int *windows_in, const int windows_total, const int window_out, const int inputs, const int step, const int outputs, const float l, const float b1, const float b2 ) { const size_t i = get_global_id(0); // weight shift const size_t v = get_local_id(1); // variable const size_t variables = get_local_size(1); //--- __local float temp[LOCAL_ARRAY_SIZE]; const uint ls = min((uint)variables, (uint)LOCAL_ARRAY_SIZE); //--- int step_out = window_out * windows_total; //--- int shift_before = 0; int window = 0; int number_w = 0; for(int w = 0; w < windows_total; w++) { int win = windows_in[w]; if(shift_before <= i && (win + 1)*window_out > (i - shift_before)) { window = win; number_w = w; break; } else shift_before += (win + 1) * window_out; } //--- int shift_in = (i - shift_before) % (window + 1); int shift_in_var = v * inputs; bool bias = (shift_in == window); int mid_win = (window + 1) / 2; int id_f = (i - shift_before) / (window + 1); int shift_out = number_w * window_out + id_f; int shift_out_var = v * outputs * step_out; //--- float grad = 0; if(!bias) { for(int out = 0; out < outputs; out++) {
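// walk every output position of this filter; 'in' is the padded input index
// this weight touched on the forward pass (negative values fall into the
// left padding and are skipped)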
int in = out * step - mid_win + shift_in; if(in >= inputs) break; if(in < 0) continue; //--- grad += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out * step_out] * matrix_i[shift_in_var + in], 0); } } else { for(int out = 0; out < outputs; out++) grad += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out * step_out], 0); } //--- sum for(int s = 0; s < (int)variables; s += ls) { if(v >= s && v < (s + ls)) temp[v % ls] = (s == 0 ? 0 : temp[v % ls]) + grad; BarrierLoc } //--- uint count = ls; do { count = (count + 1) / 2; if(v < count && (v + count) < ls) { temp[v] += temp[v + count]; temp[v + count] = 0; } BarrierLoc } while(count > 1); //--- if(v == 0) { grad = temp[0]; float mt = IsNaNOrInf(clamp(b1 * matrix_m[i] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0); float vt = IsNaNOrInf(clamp(b2 * matrix_v[i] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f); float weight = matrix_w[i] + IsNaNOrInf(l * mt / sqrt(vt), 0); matrix_w[i] = weight; matrix_m[i] = mt; matrix_v[i] = vt; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ConcatDiff(__global const float* data, __global float* output, const int step) { const size_t i = get_global_id(0); const size_t v = get_local_id(1); const size_t inputs = get_local_size(0); const size_t variables = get_local_size(1); //--- const int shift = i * variables; const float d = data[shift + v]; float diff = 0; if(step > 0 && (i + step) < inputs) diff = IsNaNOrInf(d - data[shift + step * variables + v], 0); //--- output[2 * shift + v] = d; output[2 * shift + v + variables] = diff; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardMaskMultWinConv(__global const float *matrix_w, __global const float *matrix_i, __global const float *masks, __global float *matrix_o, const int inputs, const int window_in, const int windows_total, const int activation ) { const size_t u = get_global_id(0); const size_t w = get_global_id(1); const size_t v = get_global_id(2); const size_t units = get_global_size(0); const size_t window_out = get_global_size(1); const size_t variables = get_global_size(2); //--- const int shift_in = u * window_in * windows_total; const int shift_in_var = v * units * window_in * windows_total; const int shift_out = (u + v * units) * window_out + w; const int shift_mask = (u + v * units) * windows_total; const int shift_weight = (v * window_out * windows_total + w) * (window_in + 1); const int step_weight = window_out * (window_in + 1); //--- float sum = 0; for(int w_in = 0; w_in < windows_total; w_in++) { float m = IsNaNOrInf(masks[shift_mask + w_in], 0); if(m < FLT_EPSILON) continue; const int shift_in_loc = shift_in + w_in * window_in; const int shift_weight_loc = shift_weight + w_in * step_weight; for(int i = 0; i < window_in; i++) if((shift_in_loc + i) < (inputs / variables)) sum += IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc + i], 0) * matrix_w[shift_weight_loc + i] * m; sum += matrix_w[shift_weight_loc + window_in] * m; } //--- matrix_o[shift_out] = fActivation(sum, activation); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcHiddenGradientMaskMultWinConv(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_ig, __global const float *matrix_og, __global 
const float *masks, __global float *masks_g, const int outputs, const int window_in, const int window_out, const int activation ) { const size_t u = get_global_id(0); const size_t w_in = get_global_id(1); const size_t v = get_global_id(2); const size_t units = get_global_size(0); const size_t windows_total = get_global_size(1); const size_t variables = get_global_size(2); //--- const int shift_in = (u + v * units) * window_in * windows_total + w_in * window_in; const int shift_out = u * window_out; const int shift_out_var = v * units * window_out; const int shift_mask = (u + v * units) * windows_total + w_in; const int shift_weight = (v * window_out * windows_total + w_in * window_out) * (window_in + 1); //--- const float m = IsNaNOrInf(masks[shift_mask], 0); for(int i = 0; i < window_in; i++) { float sum = 0; if(m >= FLT_EPSILON) { for(int out = 0; out < window_out; out++) { if((shift_out + out) >= (outputs / variables)) continue; sum += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out] * matrix_w[shift_weight + out * (window_in + 1) + i] * m, 0); } } matrix_ig[shift_in + i] = Deactivation(sum, matrix_i[shift_in + i], activation); } //--- float sum = 0; for(int out = 0; out < window_out; out++) { int shift_weight_loc = out * (window_in + 1) + shift_weight; float temp = matrix_w[shift_weight_loc + window_in]; for(int i = 0; i < window_in; i++) temp += IsNaNOrInf(matrix_i[shift_in + i], 0) * matrix_w[shift_weight_loc + i]; sum += IsNaNOrInf(temp * matrix_og[shift_out_var + shift_out + out], 0); } masks_g[shift_mask] = IsNaNOrInf(sum, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void UpdateWeightsMaskMultWinConvAdam(__global float *matrix_w, __global const float *matrix_og, __global const float *matrix_i, __global const float *masks, __global float *matrix_m, __global float *matrix_v, const int windows_total, const int inputs, const int outputs, const float l, const float b1, const float b2 ) { const size_t id_in = get_global_id(0); // input shift const size_t id_out = get_global_id(1); // filter shift const size_t id_v = get_global_id(2); // variable const size_t window_in = get_global_size(0) / windows_total - 1; const size_t window_out = get_global_size(1); const size_t variables = get_global_size(2); //--- const int w_id = id_in / (window_in + 1); const int shift_in = id_in - w_id; const int step_in = window_in * windows_total; const int units = outputs / window_out; const int shift_in_var = id_v * inputs; const int shift_out_var = id_v * outputs; const int shift_mask_var = id_v * units * windows_total; const int shift_weight = ((id_v * windows_total + w_id) * window_out + id_out) * (window_in + 1) + id_in % (window_in + 1); const bool bias = (id_in % (window_in + 1) == window_in); //--- float grad = 0; for(int u = 0; u < units; u++) { const int shift_in_loc = shift_in + u * step_in; if(shift_in_loc >= inputs) break; float m = IsNaNOrInf(masks[shift_mask_var + u * windows_total + w_id], 0); if(m < FLT_EPSILON) continue; float inp = (bias ?
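// bias weights see a constant input of 1; regular weights reuse the masked
// input value from the forward pass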
1 : IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc], 0)); grad += IsNaNOrInf(inp * m * matrix_og[shift_out_var + u * window_out + id_out], 0); } float mt = IsNaNOrInf(clamp(b1 * matrix_m[shift_weight] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0); float vt = IsNaNOrInf(clamp(b2 * matrix_v[shift_weight] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f); float weight = matrix_w[shift_weight] + IsNaNOrInf(l * mt / sqrt(vt), 0); matrix_w[shift_weight] = weight; matrix_m[shift_weight] = mt; matrix_v[shift_weight] = vt; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MainFreq(__global const float* freq_r, __global const float* freq_im, __global float *main_freq, int dimension ) { if(dimension <= 0) return; //--- size_t n = get_global_id(0); const int shift = n * dimension; //--- float max_f = 0; float max_id = 0; float energy; //--- for(int i = 1; i < dimension; i++) { float2 freq = (float2)(freq_r[shift + i], freq_im[shift + i]); energy = ComplexAbs(freq); if(max_f < energy) { max_f = energy; max_id = i + 1; } } main_freq[n] = max_id; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FeedForwardAdaptConv(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_o, __global const float *main_freq, const int inputs, const int window_in, const int activation ) { const size_t u = get_global_id(0); const size_t f = get_global_id(1); const size_t v = get_global_id(2); const size_t units = get_global_size(0); const size_t filters = get_global_size(1); const size_t variables = get_global_size(2); //--- const int freq = main_freq[v]; int window = (inputs / variables + freq - 1) / freq; const int step = (int)(inputs / variables + units + 1) / (units + 2); if(window < step) window = (int)((step + window - 1) / window) * window; if(window > window_in) window = window_in; //--- const int shift_in = (u < (units - 1) ? 
u * step : inputs / variables - window);
   const int shift_in_var = v * inputs / variables;
   const int shift_out = (u + v * units) * filters + f;
   const int shift_weight = (v * filters + f) * (window_in + 1);
//---
   float sum = matrix_w[shift_weight + window_in];
   for(int i = 0; i < window; i++)
      if((shift_in + i) < (inputs / variables))
         sum += IsNaNOrInf(matrix_i[shift_in_var + shift_in + i], 0) * matrix_w[shift_weight + i];
//---
   matrix_o[shift_out] = fActivation(sum, activation);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientAdaptConv(__global const float *matrix_w, __global const float *matrix_i, __global float *matrix_ig, __global const float *matrix_og, __global const float *main_freq, const int outputs, const int window_in, const int window_out, const int activation )
  {
   const size_t inp = get_global_id(0);
   const size_t v = get_global_id(1);
   const size_t inputs = get_global_size(0);
   const size_t variables = get_global_size(1);
//---
   const int units = outputs / (window_out * variables);
   const int freq = main_freq[v];
   int window = (int)(inputs + freq - 1) / freq;
   const int step = (int)(inputs + units + 1) / (units + 2);
   if(window < step)
      window = (int)((step + window - 1) / window) * window;
   if(window > window_in)
      window = window_in;
//---
   const int shift_in = v * inputs + inp;
   int u = inp / step;
   int shift_out_var = v * (outputs / variables);
   int shift_weight_var = (v * window_out) * (window_in + 1);
//---
   float sum = 0;
   while(u * step <= inp && u < (units - 1))
     {
      int pos = inp - u * step;
      if(pos >= window)
        {
         u++;
         continue;
        }
      int shift_out = u * window_out;
      int shift_weight = pos + shift_weight_var;
      for(int out = 0; out < window_out; out++)
        {
         if((shift_out + out) >= (outputs / variables))
            continue;
         sum += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out] * matrix_w[shift_weight + out * (window_in + 1)], 0);
        }
      u++;
     }
   if(inp >= (inputs - window))
     {
      int pos = inp + window - inputs;
      int shift_out = (units - 1) * window_out;
      int shift_weight = pos + shift_weight_var;
      for(int out = 0; out < window_out; out++)
        {
         if((shift_out + out) >= (outputs / variables))
            continue;
         sum += IsNaNOrInf(matrix_og[shift_out_var + shift_out + out] * matrix_w[shift_weight + out * (window_in + 1)], 0);
        }
     }
   matrix_ig[shift_in] = Deactivation(sum, matrix_i[shift_in], activation);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdaptConvAdam(__global float *matrix_w, __global const float *matrix_og, __global const float *matrix_i, __global float *matrix_m, __global float *matrix_v, __global const float *main_freq, const int inputs, const int outputs, const float l, const float b1, const float b2 )
  {
   const size_t id_in = get_global_id(0);   // input shift
   const size_t id_out = get_global_id(1);  // filter shift
   const size_t id_v = get_global_id(2);    // variable
   const size_t window_in = get_global_size(0) - 1;
   const size_t window_out = get_global_size(1);
   const size_t variables = get_global_size(2);
//---
   const int units = outputs / (window_out * variables);
   const int freq = main_freq[id_v];
   int window = (inputs / variables + freq - 1) / freq;
   const int step = (int)(inputs / variables + units + 1) / (units + 2);
   if(window < step)
      window = (int)((step + window - 1) / window) * window;
   if(window > window_in)
      window = window_in;
//---
   if(id_in != window_in && id_in
>= window) return; //--- const int shift_in_var = id_v * inputs / variables; const int shift_out_var = id_v * outputs / variables; const int shift_weight = (id_v * window_out + id_out) * (window_in + 1) + id_in; const bool bias = (id_in == window_in); //--- float grad = 0; for(int u = 0; u < (units - 1); u++) { const int shift_in_loc = id_in + u * step; if(shift_in_loc >= (inputs / variables)) continue; float inp = (bias ? 1 : IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc], 0)); grad += IsNaNOrInf(inp * matrix_og[shift_out_var + u * window_out + id_out], 0); } { const int shift_in_loc = id_in + inputs / variables - window; if(shift_in_loc < (inputs / variables)) { float inp = (bias ? 1 : IsNaNOrInf(matrix_i[shift_in_var + shift_in_loc], 0)); grad += IsNaNOrInf(inp * matrix_og[shift_out_var + (units - 1) * window_out + id_out], 0); } } float mt = IsNaNOrInf(clamp(b1 * matrix_m[shift_weight] + (1 - b1) * grad, -1.0e5f, 1.0e5f), 0); float vt = IsNaNOrInf(clamp(b2 * matrix_v[shift_weight] + (1 - b2) * (grad * grad), 1.0e-6f, 1.0e6f), 1.0e-6f); float weight = matrix_w[shift_weight] + IsNaNOrInf(l * mt / sqrt(vt), 0); matrix_w[shift_weight] = weight; matrix_m[shift_weight] = mt; matrix_v[shift_weight] = vt; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void RoPE(__global const float2* __attribute__((aligned(8))) inputs, __global const float2* __attribute__((aligned(8))) position_emb, __global float2* __attribute__((aligned(8))) outputs ) { const size_t id_d = get_global_id(0); // dimension const size_t id_u = get_global_id(1); // unit const size_t id_v = get_global_id(2); // variable const size_t dimension = get_global_size(0); const size_t units = get_global_size(1); const size_t variables = get_global_size(2); //--- const int shift_in = (id_v * units + id_u) * dimension + id_d; const int shift_pos = id_u * dimension + id_d; const float2 inp = inputs[shift_in]; const float2 pe = position_emb[shift_pos]; //--- float2 result = 0; result.s0 = inp.s0 * pe.s0 - inp.s1 * pe.s1; result.s1 = inp.s0 * pe.s1 + inp.s1 * pe.s0; //--- outputs[shift_in] = result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcHiddenGradRoPE(__global float2* __attribute__((aligned(8))) inputs_gr, __global const float2* __attribute__((aligned(8))) position_emb, __global const float2* __attribute__((aligned(8))) outputs_gr ) { const size_t id_d = get_global_id(0); // dimension const size_t id_u = get_global_id(1); // unit const size_t id_v = get_global_id(2); // variable const size_t dimension = get_global_size(0); const size_t units = get_global_size(1); const size_t variables = get_global_size(2); //--- const int shift_in = (id_v * units + id_u) * dimension + id_d; const int shift_pos = id_u * dimension + id_d; const float2 grad = outputs_gr[shift_in]; const float2 pe = position_emb[shift_pos]; //--- float2 grad_x; grad_x.s0 = grad.s0 * pe.s0 + grad.s1 * pe.s1; grad_x.s1 = grad.s1 * pe.s0 - grad.s0 * pe.s1; //--- inputs_gr[shift_in] = grad_x; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DifMatrix(__global const float *matrix1, ///<[in] First matrix __global const float *matrix2, ///<[in] Second matrix __global float *matrix_out, ///<[out] Output matrix const float multiplyer, 
                        ///< Multiplier for output
                        const int shift_in1,    ///< Shift for input 1
                        const int shift_in2,    ///< Shift for input 2
                        const int shift_out     ///< Shift for output
                       )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int step = get_global_size(0);
   const int dimension = get_global_size(1);
//---
   int index = i * dimension + d;
   matrix_out[i * shift_out + index] = IsNaNOrInf((matrix1[i * shift_in1 + index] - matrix2[i * shift_in2 + index]) * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void DifMatrixGrad(__global float *matrix1,            ///<[out] First matrix gradient
                            __global float *matrix2,            ///<[out] Second matrix gradient
                            __global const float *matrix_out,   ///<[in] Output matrix gradient
                            const float multiplyer,             ///< Multiplier for output
                            const int shift_in1,                ///< Shift for input 1
                            const int shift_in2,                ///< Shift for input 2
                            const int shift_out                 ///< Shift for output
                           )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int step = get_global_size(0);
   const int dimension = get_global_size(1);
//---
   int index = i * dimension + d;
   float grad = IsNaNOrInf(matrix_out[i * shift_out + index] * multiplyer, 0);
   matrix1[i * shift_in1 + index] = grad;
   matrix2[i * shift_in2 + index] = -grad;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void IdentitySumMatrix(__global const float *matrix_in, __global float *matrix_out, const float multiplyer, const int shift_in, const int shift_out )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int step = get_global_size(0);
   const int dimension = get_global_size(1);
//---
   int index = i * dimension + d;
   matrix_out[i * shift_out + index] = IsNaNOrInf(((int)(i == d) + matrix_in[i * shift_in + index]) * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void IdentityDifMatrix(__global const float *matrix_in, __global float *matrix_out, const float multiplyer, const int shift_in, const int shift_out )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int step = get_global_size(0);
   const int dimension = get_global_size(1);
//---
   int index = i * dimension + d;
   matrix_out[i * shift_out + index] = IsNaNOrInf(((int)(i == d) - matrix_in[i * shift_in + index]) * multiplyer, 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void IdentityDifMatrixGrad(__global float *matrix_in, __global const float *matrix_out, const float multiplyer, const int shift_in, const int shift_out )
  {
   const int i = get_global_id(0);
   const int d = get_global_id(1);
   const int step = get_global_size(0);
   const int dimension = get_global_size(1);
//---
   int index = i * dimension + d;
   matrix_in[i * shift_in + index] = IsNaNOrInf(-multiplyer * matrix_out[i * shift_out + index], 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SumVecMatrix(__global const float *vector_in,
                           __global const float *matrix_in,
                           __global float *matrix_out,
                           const float multiplyer,    ///< Multiplier for output
                           const int shift_in1,       ///< Shift for input 1
                           const int shift_in2,       ///< Shift for input 2
                           const int
shift_out ///< Shift for output ) { const int r = get_global_id(0); const int c = get_global_id(1); const int v = get_global_id(2); const int rows = get_global_size(0); const int cols = get_global_size(1); const int variables = get_global_size(2); //--- int flat_m = RCtoFlat(r, c, rows, cols, v); int flat_v = RCtoFlat(0, c, 1, cols, v); matrix_out[flat_m] = IsNaNOrInf((vector_in[flat_v] + matrix_in[flat_m]) * multiplyer, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SumVecMatrixGrad(__global float *vector_in, __global float *matrix_in, __global const float *matrix_out, const float multiplyer, ///< Multiplyer for output const int shift_in1, ///< Shift for input 1 const int shift_in2, ///< Shift for input 2 const int shift_out ///< Shift for output ) { const int r = get_global_id(0); const int c = get_global_id(1); const int v = get_global_id(2); const int rows = get_global_size(0); const int cols = get_global_size(1); const int variables = get_global_size(2); //--- int flat_m = RCtoFlat(r, c, rows, cols, v); int flat_v = RCtoFlat(0, c, 1, cols, v); //--- float grad = IsNaNOrInf(matrix_out[flat_m] * multiplyer, 0); matrix_in[flat_m] = grad; //--- if(r == 0) { for(int i = 1; i < rows; i++) { flat_m += cols; grad += IsNaNOrInf(matrix_out[flat_m] * multiplyer, 0); } vector_in[flat_v] = IsNaNOrInf(grad / rows, 0); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void InterpolationAttention(__global const float* matrix_in, __global const float* W, __global const float* A, __global const float* GL, __global float* Adj, __global float* H, __global float* Atten, __global float* matrix_out, const int dimension ) { const size_t i = get_global_id(0); const size_t j = get_local_id(1); const size_t total = get_global_size(0); const size_t total_loc = get_local_size(1); //--- __local float Temp[LOCAL_ARRAY_SIZE]; const int shift_i = i * dimension; const int shift_j = j * dimension; const int shift_adj = i * total_loc + j; //--- float adj = 0; for(int d = 0; d < dimension; d++) adj += IsNaNOrInf(GL[shift_i + d] * GL[shift_j + d], 0); adj = max(IsNaNOrInf(adj, 0), 0.0f); adj = LocalSoftMax(adj, 1, Temp); Adj[shift_adj] = adj; adj += (float)(i == j); //--- for(int id_h = 0; id_h < dimension; id_h += total_loc) { if(j >= (dimension - id_h)) break; float h = 0; for(int w = 0; w < dimension; w++) h += IsNaNOrInf(matrix_in[shift_i + w] * W[(id_h + j) * dimension + w], 0); H[shift_i + id_h + j] = h; BarrierLoc } float e = 1e-12f; if(adj > 0) { e = 0; for(int a = 0; a < dimension; a++) e += IsNaNOrInf(H[shift_i + a] * A[a], 0) + IsNaNOrInf(H[shift_j + a] * A[dimension + a], 0); } e = LocalSoftMax(e, 1, Temp); Atten[shift_adj] = e; //--- Scale output by attention for(int d = 0; d < dimension; d += total_loc) { if(j >= (dimension - d)) break; float out = 0; int shift_h = d + j; int shift_att = i * total_loc; int shift_out = i * dimension + shift_h; for(int n = 0; n < total_loc; n++) out += IsNaNOrInf(H[shift_h + n * dimension] * Atten[shift_att + n], 0); matrix_out[shift_out] = fActivation(out, ActFunc_LReLU); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void InterpolationAttentionGrad(__global const float* matrix_in, __global float* matrix_in_gr, __global const float* W, 
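                                            ///< (each *_gr buffer mirrors the layout of its forward counterpart)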
__global float* W_gr, __global const float* A, __global float* A_gr, __global const float* GL, __global float* GL_gr, __global float* Adj, __global float* H, __global float* H_gr, __global float* Atten, __global float* matrix_out_gr, const int dimension )
  {
   const size_t i = get_global_id(0);
   const size_t j = get_local_id(1);
   const size_t total = get_global_size(0);
   const size_t total_loc = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * dimension;
   const int shift_j = j * dimension;
   const int shift_adj = i * total_loc + j;
//--- H Gradient
   for(int d = 0; d < dimension; d += total_loc)
     {
      if(j >= (dimension - d))
         break;
      float h_grad = 0;
      int shift_h = shift_i + d + j;
      int shift_att = i;
      int shift_out = d + j;
      for(int n = 0; n < total_loc; n++)
        {
         float gr = matrix_out_gr[shift_out + n * dimension];
         h_grad += IsNaNOrInf(Deactivation(gr, gr, ActFunc_LReLU) * Atten[shift_att + n * total_loc], 0);
        }
      H_gr[shift_h] = h_grad;
      BarrierLoc
     }
//--- Attention Gradient
   float att_grad = 0;
   for(int d = 0; d < dimension; d++)
     {
      float gr = matrix_out_gr[shift_i + d];
      gr = Deactivation(gr, gr, ActFunc_LReLU);
      att_grad += IsNaNOrInf(gr * H[shift_j + d], 0);
     }
   att_grad = LocalSoftMaxGrad(Atten[shift_adj], att_grad, 1, Temp);
//--- Add H Gradient
   for(int d = 0; d < dimension; d++)
     {
      float h_grad = att_grad * A[d];
      h_grad = LocalSum(h_grad, 1, Temp);
      if(j == 0)
         H_gr[shift_i + d] += h_grad;
      h_grad = att_grad * A[dimension + d];
      h_grad = LocalSum(h_grad, 1, Temp);
      if(j == 0)
         H_gr[shift_j + d] += h_grad;
      float a_grad = att_grad * H[shift_i + d];
      a_grad = LocalSum(a_grad, 1, Temp);
      if(j == 0)
         A_gr[d] += a_grad;
      a_grad = att_grad * H[shift_j + d];
      a_grad = LocalSum(a_grad, 1, Temp);
      if(j == 0)
         A_gr[dimension + d] += a_grad;
     }
//--- Inputs' Gradient
   for(int d = 0; d < dimension; d += total_loc)
     {
      if(j >= (dimension - d))
         break;
      float grad = 0;
      for(int w = 0; w < dimension; w++)
         grad += IsNaNOrInf(H_gr[shift_i + w] * W[(d + j) + dimension * w], 0);
      matrix_in_gr[shift_i + d + j] = grad;
      BarrierLoc
     }
//--- Adj Gradient
   float grad = LocalSoftMaxGrad(Adj[shift_adj], att_grad, 1, Temp);
   for(int d = 0; d < dimension; d++)
     {
      GL_gr[shift_i + d] += IsNaNOrInf(grad * GL[shift_j + d], 0);
      GL_gr[shift_j + d] += IsNaNOrInf(grad * GL[shift_i + d], 0);
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void PeriodNorm(__global const float* inputs, __global float2* mean_stdevs, __global float* outputs, const int total_inputs )
  {
   const size_t i = get_global_id(0);
   const size_t p = get_local_id(1);
   const size_t v = get_global_id(2);
   const size_t windows = get_global_size(0);
   const size_t period = get_local_size(1);
   const size_t variable = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * period + p;
   const int shift_v = v * total_inputs;
   const int shift_ms = v * windows + i;
//---
   float val = 0;
   if((shift_i) < total_inputs)
      val = IsNaNOrInf(inputs[shift_v + shift_i], 0);
   float mean = IsNaNOrInf(LocalSum(val, 1, Temp) / period, 0);
   val -= mean;
   BarrierLoc
   float stdev = LocalSum(val * val, 1, Temp) / period;
   stdev = IsNaNOrInf(sqrt(stdev), 1);
//---
   mean_stdevs[shift_ms] = (float2)(mean, stdev);
   if((shift_i) < total_inputs)
      outputs[shift_v + shift_i] = IsNaNOrInf(val / stdev, 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
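//--- Note: PeriodNorm normalizes each period of the series, y = (x - mean) / stdev,
//--- and PeriodNormGrad below routes dL/dy back through those shared statistics.
//--- Worked example with period = 4 and x = {1, 3, 5, 7}:
//--- mean = 4, variance = 5, stdev = sqrt(5) ~ 2.236, y = {-1.342, -0.447, 0.447, 1.342}.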
__kernel void PeriodNormGrad(__global const float* inputs, __global float* inputs_gr, __global const float2* mean_stdevs, __global const float2* mean_stdevs_gr, __global const float* outputs, __global const float* outputs_gr, const int total_inputs )
  {
   const size_t i = get_global_id(0);
   const size_t p = get_local_id(1);
   const size_t v = get_global_id(2);
   const size_t windows = get_global_size(0);
   const size_t period = get_local_size(1);
   const size_t variable = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_i = i * period + p;
   const int shift_v = v * total_inputs;
   const int shift_ms = v * windows + i;
//---
   float inp = 0;
   float inp_gr = 0;
   float out = 0;
   float out_gr = 0;
   const float2 mean_stdev = mean_stdevs[shift_ms];
   const float2 mean_stdev_gr = mean_stdevs_gr[shift_ms];
   if((shift_i) < total_inputs)
     {
      inp = IsNaNOrInf(inputs[shift_v + shift_i], 0);
      out = IsNaNOrInf(outputs[shift_v + shift_i], 0);
      out_gr = IsNaNOrInf(outputs_gr[shift_v + shift_i], 0);
     }
   float mean_gr = LocalSum(out_gr, 1, Temp) / period + IsNaNOrInf(mean_stdev_gr.x, 0);
   BarrierLoc
   float stdev_gr = out * LocalSum(IsNaNOrInf(out * out_gr, 0), 1, Temp) / period + IsNaNOrInf(mean_stdev_gr.y, 0);
   inp_gr = (out_gr - mean_gr - stdev_gr) / IsNaNOrInf(mean_stdev.y, 1);
//---
   if((shift_i) < total_inputs)
      inputs_gr[shift_v + shift_i] = IsNaNOrInf(inp_gr, 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void AdaptSpatialNorm(__global const float* inputs, __global const float* attention, __global float2* mean_stdevs, __global float* outputs )
  {
   const size_t i = get_global_id(0);
   const size_t a = get_local_id(1);
   const size_t v = get_global_id(2);
   const size_t total_inputs = get_global_size(0);
   const size_t total_local = get_local_size(1);
   const size_t variables = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int shift_v = v * total_inputs;
   const int shift_out = shift_v + i;
//---
   float mean = 0, stdev = 0;
   for(uint l = 0; l < variables; l += total_local)
     {
      if((a + l) >= variables)
         break;
      const int shift_at = v * variables + (a + l);
      float val = IsNaNOrInf(inputs[(a + l) * total_inputs + i], 0);
      float att = IsNaNOrInf(attention[shift_at], 0);
      mean += val * att;
      stdev += val * val * att;
     }
   mean = LocalSum(mean, 1, Temp);
   BarrierLoc
   stdev = LocalSum(stdev, 1, Temp);
//---
   if(a == 0)
     {
      stdev -= mean * mean;
      stdev = IsNaNOrInf(sqrt(stdev), 1);
      if(stdev <= 0)
         stdev = 1;
      mean_stdevs[shift_out] = (float2)(mean, stdev);
      outputs[shift_out] = IsNaNOrInf((inputs[shift_out] - mean) / stdev, 0);
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void AdaptSpatialNormGrad(__global const float* inputs, __global float* inputs_gr, __global const float* attention, __global float* attention_gr, __global const float2* mean_stdevs, __global const float2* mean_stdevs_gr, __global const float* outputs_gr, const uint total_inputs )
  {
   const size_t i = get_global_id(0);              // main
   const size_t loc = get_local_id(1);             // local to sum
   const size_t v = get_global_id(2);              // variable
   const size_t total_main = get_global_size(0);   // total
   const size_t total_loc = get_local_size(1);     // local dimension
   const size_t variables = get_global_size(2);    // total variables
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- Inputs gradient
     {
      int shift_in = v * total_inputs + i;
      float grad = 0;
      if(i < total_inputs)
        {
         float x = IsNaNOrInf(inputs[shift_in], 0);
         for(int l = 0; l < variables; l +=
total_loc) { if((l + loc) >= variables) break; int shift_out = i + (l + loc) * total_inputs; float att = IsNaNOrInf(attention[(l + loc) * variables + v], 0); float out_gr = IsNaNOrInf(outputs_gr[shift_out], 0); float2 ms = mean_stdevs[shift_out]; float2 ms_gr = mean_stdevs_gr[shift_out]; float dy = (1 - att) * (1 / ms.y - (x - ms.x) * att * x / (ms.y * ms.y * ms.y)); float dmean = IsNaNOrInf(ms_gr.x * att, 0); float dstd = IsNaNOrInf(ms_gr.y * x * (att - att * att) / ms.y, 0); grad += IsNaNOrInf(dy * out_gr + dmean + dstd, 0); } } grad = LocalSum(grad, 1, Temp); if(loc == 0 && i < total_inputs) inputs_gr[shift_in] = grad; BarrierLoc } //--- Attention gradient { int shift_att = v * variables + i; float grad = 0; if(i < variables) { float att = IsNaNOrInf(attention[shift_att], 0); for(int l = 0; l < total_inputs; l += total_loc) { if((l + loc) >= total_inputs) break; int shift_out = (l + loc) + v * total_inputs; int shift_in = (l + loc) + i * total_inputs; float x = IsNaNOrInf(inputs[shift_in], 0); float out_gr = IsNaNOrInf(outputs_gr[shift_out], 0); float2 ms = mean_stdevs[shift_out]; float2 ms_gr = mean_stdevs_gr[shift_out]; float dy = -x / ms.y - (x - ms.x) * x * x * (1 - 2 * att) / (2 * ms.y * ms.y * ms.y); float dmean = IsNaNOrInf(ms_gr.x * x, 0); float dstd = IsNaNOrInf(ms_gr.y * x * x * (1 - 2 * att) / (2 * ms.y), 0); grad += IsNaNOrInf(dy * out_gr + dmean + dstd, 0); } } grad = LocalSum(grad, 1, Temp); if(loc == 0 && i < variables) attention_gr[shift_att] = grad; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void AttentNorm(__global const float* inputs, __global const float* attention, __global float* means, __global float* stdevs, __global float* outputs, const int total_inputs, const int segment_size ) { const size_t s = get_global_id(0); const size_t i = get_local_id(1); const size_t v = get_global_id(2); const size_t total_segments = get_global_size(0); const size_t total_local = get_local_size(1); const size_t variables = get_global_size(2); //--- __local float Temp[LOCAL_ARRAY_SIZE]; const int shift = v * total_inputs + s * segment_size + i; //--- float mean = 0, stdev = 0; float val = 0; for(uint l = 0; l < segment_size; l += total_local) { if((l + i) >= segment_size || (s * segment_size + l + i) >= total_inputs) break; float val_l = IsNaNOrInf(inputs[shift + l], 0); if(l == 0) val = val_l; float att = IsNaNOrInf(attention[v * segment_size + l + i], 0); mean += val_l * att; stdev += val_l * val_l * att; } mean = LocalSum(mean, 1, Temp); BarrierLoc stdev = LocalSum(stdev, 1, Temp); //--- stdev -= mean * mean; stdev = IsNaNOrInf(sqrt(stdev), 1); if(stdev <= 0) stdev = 1; //--- if(i == 0) { int shift_ms = v * total_segments + s; means[shift_ms] = mean; stdevs[shift_ms] = stdev; } for(uint l = 0; l < segment_size; l += total_local) { if((l + i) >= segment_size || (s * segment_size + l + i) >= total_inputs) break; if(l > 0) val = inputs[shift + l]; outputs[shift + l] = IsNaNOrInf((val - mean) / stdev, 0); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void AttentNormGrad(__global const float* inputs, __global float* inputs_gr, __global const float* attention, __global float* attention_gr, __global const float* means, __global const float* stdevs, __global const float* means_gr, __global const float* outputs_gr, const int total_inputs, const int segment_size 
               )
  {
   const size_t i = get_global_id(0);              // main
   const size_t loc = get_local_id(1);             // local to sum
   const size_t v = get_global_id(2);              // variable
   const size_t total_main = get_global_size(0);   // total
   const size_t total_loc = get_local_size(1);     // local dimension
   const size_t variables = get_global_size(2);    // total variables
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int total_segments = (total_inputs + segment_size - 1) / segment_size;
//--- Inputs gradient
     {
      const int s = i / segment_size;
      const int shift_in = v * total_inputs + i;
      const int shift_ms = v * total_segments + s;
      float grad = 0;
      if(loc == 0 && i < total_inputs)
        {
         Temp[0] = IsNaNOrInf(inputs[shift_in], 0);
         Temp[1] = IsNaNOrInf(means[shift_ms], 0);
         Temp[2] = IsNaNOrInf(stdevs[shift_ms], 1);
         Temp[3] = IsNaNOrInf(means_gr[shift_ms], 0);
         Temp[4] = IsNaNOrInf(attention[v * segment_size + (i - s * segment_size)], 0);
        }
      BarrierLoc
      if(i < total_inputs)
        {
         float x = Temp[0];
         float mean = Temp[1];
         float stdev = Temp[2];
         float mean_gr = Temp[3];
         float att = Temp[4];
         for(int l = 0; l < segment_size; l += total_loc)
           {
            if((l + loc) >= segment_size || (s * segment_size + loc + l) >= total_inputs)
               break;
            float out_gr = IsNaNOrInf(outputs_gr[v * total_inputs + s * segment_size + loc + l], 0);
            bool same = (i - s * segment_size) == (loc + l);
            float xl = x;
            if(!same)
               xl = IsNaNOrInf(inputs[v * total_inputs + s * segment_size + loc + l], 0);
            float dy = ((int)same - att) * (1 / stdev - (xl - mean) * att * x / (stdev * stdev * stdev));
            float dmean = (same ? IsNaNOrInf(mean_gr * att, 0) : 0);
            grad += IsNaNOrInf(dy * out_gr + dmean, 0);
           }
        }
      grad = LocalSum(grad, 1, Temp);
      if(loc == 0 && i < total_inputs)
         inputs_gr[shift_in] = grad;
      BarrierLoc
     }
//--- Attention gradient
     {
      float grad = 0;
      int shift_att = v * segment_size + i;
      if(i < segment_size)
        {
         float att = IsNaNOrInf(attention[shift_att], 0);
         for(int l = 0; l < total_inputs; l += total_loc)
           {
            if((l + loc) >= total_inputs)
               break;
            int shift_out = (l + loc) + v * total_inputs;
            int s = (l + loc) / segment_size;
            int shift_in = v * total_inputs + s * segment_size + i;
            float x = IsNaNOrInf(inputs[shift_in], 0);
            float out_gr = IsNaNOrInf(outputs_gr[shift_out], 0);
            float mean = means[v * total_segments + s];
            float stdev = stdevs[v * total_segments + s];
            float mean_gr = means_gr[v * total_segments + s];
            bool same = (s * segment_size + i) == (loc + l);
            float xl = x;
            if(!same)
               xl = IsNaNOrInf(inputs[shift_out], 0);
            float dy = -x / stdev - (xl - mean) * x * x * (1 - 2 * att) / (2 * stdev * stdev * stdev);
            float dmean = IsNaNOrInf(mean_gr * x, 0);
            grad += IsNaNOrInf(dy * out_gr + dmean, 0);
           }
        }
      grad = LocalSum(grad, 1, Temp);
      if(loc == 0 && i < segment_size)
         attention_gr[shift_att] = grad;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
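//--- Note: ChebStep below evaluates the Chebyshev recurrence on the support matrix S:
//--- T(0) = I, T(1) = S, T(k) = 2 * S * T(k-1) - T(k-2),
//--- storing each T(k) as plane k of the outputs buffer. For example, with
//--- S = {{0, 1}, {1, 0}}: T(2) = 2 * S * S - I = I.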
__kernel void ChebStep(__global const float* support, __global float* outputs, const int step )
  {
   const size_t l = get_local_id(0);
   const size_t r = get_global_id(1);
   const size_t c = get_global_id(2);
   const size_t total_l = get_local_size(0);
   const size_t total_r = get_global_size(1);
   const size_t total_c = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   if(step <= 0 || total_r != total_c)
      return;
//---
   if(step <= 3)
     {
      const float diag = (r == c ? 1.0f : 0.0f);
      if(l == 0)
         outputs[RCtoFlat(r, c, total_r, total_c, 0)] = diag;
      if(step < 2)
         return;
      if(l == 0)
        {
         const float s = IsNaNOrInf(support[RCtoFlat(r, c, total_r, total_c, 0)], 0);
         outputs[RCtoFlat(r, c, total_r, total_c, 1)] = s;
        }
      if(step < 3)
         return;
      float out = 0;
      for(int t = 0; t < total_c; t += total_l)
        {
         if((t + l) >= total_c)
            continue;
         const float s1 = IsNaNOrInf(support[RCtoFlat(r, t + l, total_r, total_c, 0)], 0);
         const float s2 = IsNaNOrInf(support[RCtoFlat(t + l, c, total_r, total_c, 0)], 0);
         out += IsNaNOrInf(s1 * s2, 0);
        }
      out = 2 * LocalSum(out, 0, Temp);
      if(l == 0)
        {
         out -= diag;
         outputs[RCtoFlat(r, c, total_r, total_c, 2)] = IsNaNOrInf(out, 0);
        }
      return;
     }
//---
   float out = 0;
   for(int t = 0; t < total_c; t += total_l)
     {
      if((t + l) >= total_c)
         continue;
      const float s1 = IsNaNOrInf(support[RCtoFlat(r, t + l, total_r, total_c, 0)], 0);
      const float s2 = IsNaNOrInf(outputs[RCtoFlat(t + l, c, total_r, total_c, step - 2)], 0);
      out += IsNaNOrInf(s1 * s2, 0);
     }
   out = 2 * LocalSum(out, 0, Temp);
   if(l == 0)
     {
      out -= IsNaNOrInf(outputs[RCtoFlat(r, c, total_r, total_c, step - 3)], 0);
      outputs[RCtoFlat(r, c, total_r, total_c, step - 1)] = IsNaNOrInf(out, 0);
     }
   return;
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ChebStepGrad(__global const float* support, __global float* support_g, __global const float* outputs, __global float* outputs_g, const int step )
  {
   const size_t l = get_local_id(0);
   const size_t r = get_global_id(1);
   const size_t c = get_global_id(2);
   const size_t total_l = get_local_size(0);
   const size_t total_r = get_global_size(1);
   const size_t total_c = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   if(step < 1 || total_r != total_c)
      return;
//---
   if(step >= 2)
     {
      float grad = IsNaNOrInf(outputs_g[RCtoFlat(r, c, total_r, total_c, step)], 0);
      if(l == 0)
         outputs_g[RCtoFlat(r, c, total_r, total_c, step - 2)] -= grad;
//--- support grad
      grad = 0;
      for(int t = 0; t < total_c; t += total_l)
        {
         if((t + l) >= total_c)
            continue;
         const float s2 = IsNaNOrInf(outputs[RCtoFlat(c, t + l, total_r, total_c, step - 2)], 0);
         grad += IsNaNOrInf(outputs_g[RCtoFlat(r, t + l, total_r, total_c, step)] * s2, 0);
        }
      grad = LocalSum(grad, 0, Temp);
      if(l == 0)
         outputs_g[RCtoFlat(r, c, total_r, total_c, 1)] += grad;
      BarrierLoc
//--- T(k-1) grad
      grad = 0;
      for(int t = 0; t < total_c; t += total_l)
        {
         if((t + l) >= total_c)
            continue;
         const float s2 = IsNaNOrInf(support[RCtoFlat(t + l, r, total_r, total_c, 0)], 0);
         grad += IsNaNOrInf(outputs_g[RCtoFlat(t + l, c, total_r, total_c, step)] * s2, 0);
        }
      grad = LocalSum(grad, 0, Temp);
      if(l == 0)
         outputs_g[RCtoFlat(r, c, total_r, total_c, step - 1)] += grad;
     }
//---
   if(step <= 2)
     {
      if(l == 0)
         support_g[RCtoFlat(r, c, total_r, total_c, 0)] = outputs_g[RCtoFlat(r, c, total_r, total_c, 1)];
      return;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
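//--- Note: SignificantNeighborsSampling ranks every work-item's candidate and random
//--- candidate by squared Euclidean distance to the "main" row, then stores the candidate
//--- index in the neighbors row at its rank; e.g. candidate distances {0.2, 0.5, 0.9}
//--- within one work-group resolve to positions {0, 1, 2}.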
__kernel void SignificantNeighborsSampling(__global const float *data, __global const float *candidates, __global const float *random_cands, __global float *neighbors, const int dimension )
  {
   const size_t main = get_global_id(0);
   const size_t slave = get_local_id(1);
   const int total_main = (int)get_global_size(0);
   const int total_slave = (int)get_local_size(1);
//---
   __local int Idx[LOCAL_ARRAY_SIZE];
   __local float Temp[LOCAL_ARRAY_SIZE];
   const int ls = min(total_slave, (int)LOCAL_ARRAY_SIZE);
//---
   const int shift_main = RCtoFlat(main, 0, total_main, dimension, 0);
   int cand = (int)candidates[slave];
   int rand_cand = (int)random_cands[slave];
//--- duplicate check
   if(rand_cand == cand)
      rand_cand = -1;
//--- Look in candidates
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Idx[slave - l] = cand;
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i >= (slave - l))
            continue;
         if(cand == Idx[i])
            cand = -1;
         if(rand_cand == Idx[i])
            rand_cand = -1;
        }
      BarrierLoc
     }
//--- Look in random candidates
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Idx[slave - l] = rand_cand;
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i >= (slave - l))
            continue;
         if(cand == Idx[i])
            cand = -1;
         if(rand_cand == Idx[i])
            rand_cand = -1;
        }
      BarrierLoc
     }
//---
   const int shift_cand = RCtoFlat(cand, 0, total_main, dimension, 0);
   const int shift_rand_cand = RCtoFlat(rand_cand, 0, total_main, dimension, 0);
//--- calc distance
   float dist_cand = 0;
   float dist_rand_cand = 0;
   for(int d = 0; d < dimension; d++)
     {
      float value = IsNaNOrInf(data[shift_main + d], 0);
      if(main != cand && cand >= 0)
        {
         float delta = value - IsNaNOrInf(data[shift_cand + d], 0);
         dist_cand += delta * delta;
        }
      if(main != rand_cand && rand_cand >= 0)
        {
         float delta = value - IsNaNOrInf(data[shift_rand_cand + d], 0);
         dist_rand_cand += delta * delta;
        }
     }
//--- calc position
   int cand_position = 0;
   int rand_position = (int)(dist_cand >= dist_rand_cand);
//--- by candidates
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Temp[slave - l] = (cand >= 0 ? IsNaNOrInf(dist_cand, -1) : -1);
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i == (slave - l))
            continue;
         if(Temp[i] < 0)
            continue;
         if(cand >= 0)
           {
            if(Temp[i] < dist_cand)
               cand_position++;
            else
               if(Temp[i] == dist_cand && i < (slave - l))
                  cand_position++;
           }
         if(rand_cand >= 0)
           {
            if(Temp[i] < dist_rand_cand)
               rand_position++;
            else
               if(Temp[i] == dist_rand_cand && i < (slave - l))
                  rand_position++;
           }
        }
      BarrierLoc
     }
//--- by random candidates
   for(int l = 0; l < total_slave; l += ls)
     {
      if(slave >= l && slave < (l + ls))
         Temp[slave - l] = (rand_cand >= 0 ? IsNaNOrInf(dist_rand_cand, -1) : -1);
      BarrierLoc
      for(int i = 0; i < ls; i++)
        {
         if(i == (slave - l))
            continue;
         if(Temp[i] < 0)
            continue;
         if(cand >= 0)
           {
            if(Temp[i] < dist_cand)
               cand_position++;
            else
               if(Temp[i] == dist_cand && i < (slave - l))
                  cand_position++;
           }
         if(rand_cand >= 0)
           {
            if(Temp[i] < dist_rand_cand)
               rand_position++;
            else
               if(Temp[i] == dist_rand_cand && i < (slave - l))
                  rand_position++;
           }
        }
      BarrierLoc
     }
//--- result
   if(cand >= 0 && cand_position < total_slave)
     {
      const int shift_dist_cand = RCtoFlat(main, cand_position, total_main, total_slave, 0);
      neighbors[shift_dist_cand] = cand;
     }
   if(rand_cand >= 0 && rand_position < total_slave)
     {
      const int shift_dist_cand = RCtoFlat(main, rand_position, total_main, total_slave, 0);
      neighbors[shift_dist_cand] = rand_cand;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SparseMHScores(__global const float* data, __global const float* indexes, __global float* scores,
                             const float sparse ///< [0.0 ..
1.0) coefficient of sparse ) { const int main = (int)get_global_id(0); const int slave = (int)get_local_id(1); const int head = (int)get_global_id(2); const int total_mains = (int)get_global_size(0); const int total_slaves = (int)get_local_size(1); const int total_heads = (int)get_global_size(2); //--- __local float Temp[LOCAL_ARRAY_SIZE]; //--- float value = IsNaNOrInf(data[RCtoFlat(main, head, total_mains, 2 * total_heads, 0)], 0); int slave_id = (int)indexes[RCtoFlat(main, slave, total_mains, total_slaves, 0)]; if(slave_id < total_mains && slave_id >= 0) value += IsNaNOrInf(data[RCtoFlat(slave_id, head + total_heads, total_mains, 2 * total_heads, 0)], 0); //--- const float max_value = LocalMax(value, 1, Temp); const float min_value = LocalMin(value, 1, Temp); const float threshold = (max_value - min_value) * sparse + min_value; value = (threshold <= value ? IsNaNOrInf(exp(value - max_value), 0) : 0); const float sum = LocalSum(value, 1, Temp); value = IsNaNOrInf(value / sum, 0); //--- scores[RCtoFlat(slave, head, total_slaves, total_heads, main)] = value; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SparseMHScoresGrad(__global float* data_gr, __global const float* indexes, __global const float* scores, __global const float* scores_gr ) { const int main = (int)get_global_id(0); const int slave = (int)get_local_id(1); const int head = (int)get_global_id(2); const int total_mains = (int)get_global_size(0); const int total_slaves = (int)get_local_size(1); const int total_heads = (int)get_global_size(2); //--- __local float Temp[LOCAL_ARRAY_SIZE]; const uint ls = min((uint)total_slaves, (uint)LOCAL_ARRAY_SIZE); //--- Calc grad by main { float value = IsNaNOrInf(scores[RCtoFlat(slave, head, total_slaves, total_heads, main)], 0); int slave_id = (int)indexes[RCtoFlat(main, slave, total_mains, total_slaves, 0)]; const float sc_gr = IsNaNOrInf(scores_gr[RCtoFlat(slave, head, total_slaves, total_heads, main)], 0); //--- float grad = 0; for(uint d = 0; d < total_slaves; d += ls) { if(slave >= d && slave < (d + ls)) Temp[slave - d] = IsNaNOrInf(sc_gr, 0); BarrierLoc for(uint l = 0; l < min(ls, (uint)(total_slaves - d)); l++) grad += IsNaNOrInf(Temp[l] * ((float)((d + l) == slave && slave_id == main) - value), 0); BarrierLoc } grad = LocalSum(grad, 1, Temp); if(slave == 0) data_gr[RCtoFlat(main, head, total_mains, 2 * total_heads, 0)] = grad; } //--- Calc grad by slave { float grad = 0; for(uint d = 0; d < total_mains; d++) { float value = IsNaNOrInf(scores[RCtoFlat(slave, head, total_slaves, total_heads, d)], 0); const float sc_gr = IsNaNOrInf(scores_gr[RCtoFlat(slave, head, total_slaves, total_heads, d)], 0); int slave_id = (int)indexes[RCtoFlat(d, slave, total_mains, total_slaves, 0)]; //--- float gr = IsNaNOrInf(sc_gr * ((float)(slave_id == d) - value), 0); gr = LocalSum(gr, 1, Temp); if(slave == 0) grad += gr; } if(slave == 0) data_gr[RCtoFlat(main, head + total_heads, total_mains, 2 * total_heads, 0)] = IsNaNOrInf(grad, 0); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SparseMatMult(__global const float *sparse_index, __global const float *sparse_data, __global const float *full, __global float *result, const int full_rows ) { const size_t sparse_row = get_global_id(0); const size_t sparse_col = get_local_id(1); const size_t full_col = get_global_id(2); const 
size_t sparse_rows = get_global_size(0);
   const size_t sparse_cols = get_local_size(1);
   const size_t full_cols = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   const int shift_sparse = RCtoFlat(sparse_row, sparse_col, sparse_rows, sparse_cols, 0);
   const int full_row = sparse_index[shift_sparse];
   const int shift_full = RCtoFlat(full_row, full_col, full_rows, full_cols, 0);
//---
   float res = (full_row >= 0 && full_row < full_rows ? IsNaNOrInf(sparse_data[shift_sparse] * full[shift_full], 0) : 0);
   res = LocalSum(res, 1, Temp);
//---
   if(sparse_col == 0)
     {
      const int shift_result = RCtoFlat(sparse_row, full_col, sparse_rows, full_cols, 0);
      result[shift_result] = res;
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SparseMatMultGrad(__global const float *sparse_index, __global const float *sparse_data, __global float *sparse_gr, __global const float *full, __global float *full_gr, __global const float *result_gr, const int sparse_rows, const int sparse_cols, const int full_rows, const int full_cols )
  {
   const size_t row_id = get_global_id(0);
   const size_t local_id = get_local_id(1);
   const size_t col_id = get_global_id(2);
   const size_t total_rows = get_global_size(0);
   const size_t total_local = get_local_size(1);
   const size_t total_cols = get_global_size(2);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//--- Calc sparse gradient
   if(row_id < sparse_rows && col_id < sparse_cols)
     {
      float grad = 0;
      int shift_sparse = 0;
      if(local_id == 0)
        {
         shift_sparse = RCtoFlat(row_id, col_id, sparse_rows, sparse_cols, 0);
         Temp[0] = sparse_index[shift_sparse];
        }
      BarrierLoc
      uint full_row = (uint)Temp[0];
      if(full_row < (uint)full_rows)
         for(int i = local_id; i < full_cols; i += total_local)
           {
            int shift_result = RCtoFlat(row_id, i, sparse_rows, full_cols, 0);
            int shift_full = RCtoFlat(full_row, i, full_rows, full_cols, 0);
            grad += IsNaNOrInf(result_gr[shift_result] * full[shift_full], 0);
           }
      grad = LocalSum(grad, 1, Temp);
      if(local_id == 0)
         sparse_gr[shift_sparse] = grad;
     }
//--- Calc full gradient
   if(row_id < full_rows && col_id < full_cols)
     {
      float grad = 0;
      for(int r = 0; r < sparse_rows; r++)
        {
         float s = 0;
         for(int c = local_id; c < sparse_cols; c += total_local)
           {
            int shift_sparse = RCtoFlat(r, c, sparse_rows, sparse_cols, 0);
            if((int)sparse_index[shift_sparse] == (int)row_id)
              {
               s = sparse_data[shift_sparse];
               break;
              }
           }
         s = LocalSum(s, 1, Temp);
         if(s != 0 && local_id == 0)
           {
            int shift_result = RCtoFlat(r, col_id, sparse_rows, full_cols, 0);
            grad += IsNaNOrInf(s * result_gr[shift_result], 0);
           }
        }
      if(local_id == 0)
        {
         int shift_full = RCtoFlat(row_id, col_id, full_rows, full_cols, 0);
         full_gr[shift_full] = grad;
        }
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
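//--- Note: RandomWalk row-normalizes an adjacency matrix into transition probabilities,
//--- norm = D^-1 * A, where the +1.0f in the denominator acts as an implicit self-loop:
//--- a row {1, 0, 1} sums to 2, so each non-zero entry becomes 1 / (2 + 1) = 1/3.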
__kernel void RandomWalk(__global const float *data, __global float *inv_diag, __global float *norm, const int total_cols )
  {
   const size_t row_id = get_global_id(0);
   const size_t local_id = get_local_id(1);
   const size_t total_rows = get_global_size(0);
   const size_t total_local = get_local_size(1);
//---
   __local float Temp[LOCAL_ARRAY_SIZE];
//---
   float d = 0;
   for(int c = local_id; c < total_cols; c += total_local)
     {
      int shift = RCtoFlat(row_id, c, total_rows, total_cols, 0);
      d += IsNaNOrInf(data[shift], 0);
     }
   d = IsNaNOrInf(1.0f / (LocalSum(d, 1, Temp) + 1.0f), 1.0f);
   if(local_id == 0)
      inv_diag[row_id] = d;
//---
   for(int c = local_id; c < total_cols; c += total_local)
     {
      int shift = RCtoFlat(row_id, c, total_rows, total_cols, 0);
      norm[shift] = IsNaNOrInf(data[shift] * d, 0);
     }
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ConcatByLabel(__global const float* data, __global const float* label, __global const float* embedding1, __global const float* embedding2, __global float *output, const int dimension_data, const int dimension_emb1, const int dimension_emb2, const int frame1, const int frame2, const int period1, const int period2 )
  {
   const size_t row_id = get_global_id(0);
   const size_t col_id = get_global_id(1);
   const size_t buffer_id = get_global_id(2);
   const size_t total_rows = get_global_size(0);
   const size_t total_cols = get_global_size(1);
   const size_t total_buffers = get_global_size(2);
//---
   __global const float *buffer;
   int dimension_in, dimension_out;
   int shift_in, shift_out;
//---
   switch(total_buffers)
     {
      case 1:
         dimension_out = dimension_data;
         break;
      case 2:
         dimension_out = dimension_data + dimension_emb1;
         break;
      case 3:
         dimension_out = dimension_data + dimension_emb1 + dimension_emb2;
         break;
      default:
         return;
     }
//---
   switch(buffer_id)
     {
      case 0:
         buffer = data;
         dimension_in = dimension_data;
         shift_in = RCtoFlat(row_id, col_id, total_rows, dimension_in, 0);
         shift_out = RCtoFlat(row_id, col_id, total_rows, dimension_out, 0);
         break;
      case 1:
         buffer = embedding1;
         dimension_in = dimension_emb1;
         shift_in = ((int)IsNaNOrInf(label[row_id] / frame1, 0)) % period1;
         shift_in = RCtoFlat(shift_in, col_id, period1, dimension_in, 0);
         shift_out = RCtoFlat(row_id, dimension_data + col_id, total_rows, dimension_out, 0);
         break;
      case 2:
         buffer = embedding2;
         dimension_in = dimension_emb2;
         shift_in = ((int)IsNaNOrInf(label[row_id] / frame2, 0)) % period2;
         shift_in = RCtoFlat(shift_in, col_id, period2, dimension_in, 0);
         shift_out = RCtoFlat(row_id, dimension_data + dimension_emb1 + col_id, total_rows, dimension_out, 0);
         break;
     }
//---
   if(col_id < dimension_in)
      output[shift_out] = IsNaNOrInf(buffer[shift_in], 0);
  }
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void ConcatByLabelGrad(__global float* data_gr, __global const float* label, __global float* embedding1_gr, __global float* embedding2_gr, __global float *output_gr, const int dimension_data, const int dimension_emb1, const int dimension_emb2, const int frame1, const int frame2, const int period1, const int period2, const int units )
  {
   const size_t row_id = get_global_id(0);
   const size_t col_id = get_global_id(1);
   const size_t buffer_id = get_global_id(2);
   const size_t total_rows = get_global_size(0);
   const size_t total_cols = get_global_size(1);
   const size_t total_buffers = get_global_size(2);
//---
   __global float *buffer;
   int dimension_in, dimension_out;
   int shift_in, shift_out, shift_col;
   int period, frame, rows;
//---
   switch(total_buffers)
     {
      case 1:
         dimension_out = dimension_data;
         break;
      case 2:
         dimension_out = dimension_data + dimension_emb1;
         break;
      case 3:
         dimension_out = dimension_data + dimension_emb1 + dimension_emb2;
         break;
      default:
         return;
     }
//---
   switch(buffer_id)
     {
      case 0:
         if(col_id < dimension_data && row_id < units)
           {
            shift_in = RCtoFlat(row_id, col_id, total_rows, dimension_data, 0);
            shift_out = RCtoFlat(row_id, col_id, total_rows, dimension_out, 0);
            data_gr[shift_in] = IsNaNOrInf(output_gr[shift_out], 0);
           }
         return;
      case 1:
         rows
= period1; buffer = embedding1_gr; dimension_in = dimension_emb1; shift_in = RCtoFlat(row_id, col_id, period1, dimension_in, 0); shift_col = dimension_data; period = period1; frame = frame1; break; case 2: rows = period2; buffer = embedding2_gr; dimension_in = dimension_emb2; shift_in = RCtoFlat(row_id, col_id, period2, dimension_in, 0); shift_col = dimension_data + dimension_emb1; period = period2; frame = frame2; break; } //--- if(row_id >= rows || col_id >= dimension_in) return; float grad = 0; for(uint r = 0; r < total_rows; r ++) { int row = ((int)IsNaNOrInf(label[r] / frame, 0)) % period; if(row != row_id) continue; shift_out = RCtoFlat(r, shift_col + col_id, total_rows, dimension_out, 0); grad += IsNaNOrInf(output_gr[shift_out], 0); } buffer[shift_in] = IsNaNOrInf(grad, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void GlobalLocalAttention(__global const float *q, __global const float2* kv, __global float *scores, __global const float* mask, __global const float* label, __global float *out, const int dimension, const int total_kv, const int total_mask ) { //--- init const int q_id = get_global_id(0); const int local_id = get_local_id(1); const int h_id = get_global_id(2); const int total_q = get_global_size(0); const int total_local = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- Score int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id); if(h_id % 2 == 0) { const int shift_kv = RCtoFlat(h_id, 0, total_heads, dimension, local_id); const int shift_s = RCtoFlat(h_id / 2, local_id, total_heads / 2, total_kv + total_mask, q_id); float score = 0; if(local_id < total_kv) { for(int d = 0; d < dimension; d++) score += IsNaNOrInf(q[shift_q + d] * kv[shift_kv + d].s0, 0); } else score = MIN_VALUE; //--- norm score score = LocalSoftMax(score, 1, temp); if(local_id < total_kv) scores[shift_s] = score; //--- out for(int d = 0; d < dimension; d++) { float val = (local_id < total_kv ? kv[shift_kv + d].s1 * score : 0); val = LocalSum(val, 1, temp); if(local_id == 0) out[shift_q + d] = val; } } else { int kv_id = -1; float score = 0; int shift_kv = -1; float m = 0; const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, q_id); if(local_id < total_mask) { const int l = RCtoFlat(q_id, local_id, total_q, total_mask, 0); kv_id = IsNaNOrInf(label[l], -1); m = IsNaNOrInf(mask[l], 0); shift_kv = RCtoFlat(h_id, 0, total_heads, dimension, kv_id); if(kv_id >= 0) for(int d = 0; d < dimension; d++) score += IsNaNOrInf(q[shift_q + d] * kv[shift_kv + d].s0, 0); else score = MIN_VALUE; } else score = MIN_VALUE; //--- norm score score = LocalSoftMax(score * m, 1, temp); if(local_id < total_mask) scores[shift_s] = score; //--- out for(int d = 0; d < dimension; d++) { float val = (kv_id >= 0 ? 
IsNaNOrInf(kv[shift_kv + d].s1, 0) * score : 0); val = LocalSum(val, 1, temp); if(local_id == 0) out[shift_q + d] = val; } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void GlobalLocalAttentionGrad(__global const float *q, __global float *q_gr, __global const float *kv, __global float *kv_gr, __global float *scores, __global const float *mask, __global float *mask_gr, __global const float *label, __global float *out_gr, const int dimension, const int total_q, const int total_kv, const int total_mask ) { //--- init const int global_id = get_global_id(0); const int local_id = get_local_id(1); const int h_id = get_global_id(2); const int total_global = get_global_size(0); const int total_local = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- if(h_id % 2 == 0) { //--- Value Gradient global_id -> v_id, local_id -> q_id for(int d = 0; d < dimension; d++) { const int shift_v = RCtoFlat(h_id, 2 * d + 1, total_heads, 2 * dimension, global_id); float grad = 0; for(int q_id = local_id; q_id < total_q; q_id += total_local) { int shift_s = RCtoFlat(h_id / 2, global_id, total_heads / 2, total_kv + total_mask, q_id); int shift_q = RCtoFlat(h_id, d, total_heads, dimension, q_id); grad += IsNaNOrInf(scores[shift_s] * out_gr[shift_q], 0); } grad = LocalSum(grad, 1, temp); kv_gr[shift_v] = grad; } //--- Query Gradient global_id -> q_id, local_id -> k_id/v_id if(global_id < total_q) { //--- 1. Score grad float grad_s = 0; const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, local_id); const int shift_s = RCtoFlat(h_id / 2, local_id, total_heads / 2, total_kv + total_mask, global_id); int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, global_id); if(local_id < total_kv) for(int d = 0; d < dimension; d++) grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0); //--- 2. SoftMax grad grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp); //--- 3. Query grad const int shift_k = shift_v - 1; for(int d = 0; d < dimension; d++) { float grad = 0; if(local_id < total_kv) grad = kv[shift_k + 2 * d] * grad_s; grad = LocalSum(grad, 1, temp); if(local_id == 0) q_gr[shift_q + d] = grad; } } //--- Key Gradient global_id -> k_id, local_id -> score_id/v_id/dimension if(global_id < total_kv) { float grad = 0; for(int q_id = 0; q_id < total_q; q_id++) { //--- 1. Score grad local_id -> score_id/v_id float grad_s = 0; const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, local_id); const int shift_s = RCtoFlat(h_id / 2, local_id, total_heads / 2, total_kv + total_mask, q_id); int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id); if(local_id < total_kv) for(int d = 0; d < dimension; d++) grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0); //--- 2. SoftMax grad grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp); BarrierLoc if(global_id == local_id) temp[0] = grad_s; BarrierLoc grad_s = temp[0]; //--- 3. 
// Key grad  local_id -> dimension
            shift_q = RCtoFlat(h_id, local_id, total_heads, dimension, q_id);
            if(local_id < dimension)
               grad += IsNaNOrInf(q[shift_q] * grad_s, 0);
           }
         const int shift_k = RCtoFlat(h_id, 2 * local_id, total_heads, 2 * dimension, global_id);
         if(local_id < dimension)
            kv_gr[shift_k] = IsNaNOrInf(grad, 0);
        }
     }
   else
     {
//--- Value Gradient  global_id -> v_id, local_id -> mask_index/dimension
      if(global_id < total_kv)
        {
         float grad = 0;
         for(int q_id = 0; q_id < total_q; q_id++)
           {
//--- 1. kv_id
            int kv_id = -1;
            float m = 0;
            const int l = RCtoFlat(q_id, local_id, total_q, total_mask, 0);
            const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, q_id);
//--- Check for use of current Value
            if(local_id < total_mask)
               kv_id = (int)label[l];
            if(local_id == 0)
               temp[0] = 0;
            BarrierLoc
            if(kv_id == global_id)
               temp[0] = scores[shift_s];
            BarrierLoc
            if(temp[0] == 0)
               continue;
//--- Value grad
            int shift_q = RCtoFlat(h_id, local_id, total_heads, dimension, q_id);
            if(local_id < dimension)
               grad += IsNaNOrInf(temp[0] * out_gr[shift_q], 0);
           }
         const int shift_v = RCtoFlat(h_id, 2 * local_id + 1, total_heads, 2 * dimension, global_id);
         if(local_id < dimension)
            kv_gr[shift_v] = IsNaNOrInf(grad, 0);
        }
//--- Query Gradient  global_id -> q_id, local_id -> mask label
      if(global_id < total_q)
        {
//--- 1. kv_id
         int kv_id = -1;
         float m = 0;
         const int l = RCtoFlat(global_id, local_id, total_q, total_mask, 0);
         if(local_id < total_mask)
           {
            kv_id = (int)IsNaNOrInf(label[l], -1);
            m = IsNaNOrInf(mask[l], 0);
           }
//--- 2. Score grad
         float grad_s = 0;
         const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, kv_id);
         const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, global_id);
         int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
         if(local_id < total_mask && kv_id >= 0)
            for(int d = 0; d < dimension; d++)
               grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0);
//--- 3. SoftMax grad
         float score = IsNaNOrInf(scores[shift_s], 0);
         grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
         if(local_id < total_mask)
            mask_gr[l] = IsNaNOrInf(grad_s * score, 0);
         grad_s *= m;
//--- 4. Query grad
         const int shift_k = shift_v - 1;
         for(int d = 0; d < dimension; d++)
           {
            float grad = 0;
            if(local_id < total_mask && kv_id >= 0)
               grad = kv[shift_k + 2 * d] * grad_s;
            grad = LocalSum(grad, 1, temp);
            if(local_id == 0)
               q_gr[shift_q + d] = grad;
           }
        }
//--- Key Gradient  global_id -> k_id, local_id -> score_id/v_id/dimension
      if(global_id < total_kv)
        {
         float grad = 0;
         for(int q_id = 0; q_id < total_q; q_id++)
           {
//--- 1. kv_id
            int kv_id = -1;
            float m = 0;
            const int l = RCtoFlat(q_id, local_id, total_q, total_mask, 0);
            if(local_id < total_mask)
              {
               kv_id = (int)label[l];
               if(kv_id == global_id)
                  m = mask[l];
              }
            m = LocalSum(m, 1, temp);
            if(m == 0)
               continue;
//--- 2. Score grad  local_id -> score_id/v_id
            float grad_s = 0;
            const int shift_v = RCtoFlat(h_id, 1, total_heads, 2 * dimension, kv_id);
            const int shift_s = RCtoFlat(h_id / 2, total_kv + local_id, total_heads / 2, total_kv + total_mask, q_id);
            int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
            if(local_id < total_mask && kv_id >= 0)
               for(int d = 0; d < dimension; d++)
                  grad_s += IsNaNOrInf(kv[shift_v + 2 * d] * out_gr[shift_q + d], 0);
//--- 3. SoftMax grad
            grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
            BarrierLoc
            if(global_id == local_id)
               temp[0] = grad_s * m;
            BarrierLoc
            grad_s = temp[0];
//--- 4.
Key grad local_id -> dimension shift_q = RCtoFlat(h_id, local_id, total_heads, dimension, q_id); if(local_id < dimension) grad += IsNaNOrInf(q[shift_q] * grad_s, 0); } const int shift_k = RCtoFlat(h_id, 2 * local_id, total_heads, 2 * dimension, global_id); if(local_id < dimension) kv_gr[shift_k] = IsNaNOrInf(grad, 0); } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SparseSoftMax(__global const float *data, __global float *outputs, __global float *indexes, const int out_dimension ) { const size_t row = get_global_id(0); const size_t col_in = get_local_id(1); const int total_rows = (int)get_global_size(0); const int total_cols_in = (int)get_local_size(1); //--- __local float Temp[LOCAL_ARRAY_SIZE]; const int ls = min(total_cols_in, (int)LOCAL_ARRAY_SIZE); //--- const int shift_in = RCtoFlat(row, col_in, total_rows, total_cols_in, 0); //--- calc position float value = IsNaNOrInf(data[shift_in], MIN_VALUE); int position = 0; for(int l = 0; l < total_cols_in; l += ls) { if(col_in >= l && col_in < (l + ls)) Temp[col_in - l] = value; BarrierLoc for(int i = 0; i < ls; i++) { if(i == (col_in - l)) continue; if(Temp[i] > value) position++; else if(Temp[i] == value && i < (col_in - l)) position++; } BarrierLoc } //--- SoftMax if(position >= out_dimension) value = MIN_VALUE; value = LocalSoftMax(value, 1, Temp); //--- result const int shift_out = RCtoFlat(row, position, total_rows, out_dimension, 0); if(position < out_dimension) { outputs[shift_out] = value; indexes[shift_out] = (float)col_in; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SparseSoftMaxGrad(__global float *data_gr, __global const float *outputs, __global const float *outputs_gr, __global const float *indexes, const int out_dimension ) { const size_t row = get_global_id(0); const size_t col_in = get_local_id(1); const int total_rows = (int)get_global_size(0); const int total_cols_in = (int)get_local_size(1); //--- __local int Ind[LOCAL_ARRAY_SIZE]; __local float Temp[LOCAL_ARRAY_SIZE]; const int ls = min(total_cols_in, (int)LOCAL_ARRAY_SIZE); //--- look position float value = 0; float grad = 0; int position = -1; int idx = -1; const int shift_idx = RCtoFlat(row, col_in, total_rows, out_dimension, 0); if(col_in < out_dimension) idx = (int)IsNaNOrInf(indexes[shift_idx], -1.0f); for(int l = 0; l < out_dimension; l += ls) { if(col_in >= l && col_in < (l + ls)) Ind[col_in - l] = idx; BarrierLoc for(int i = 0; (i < ls && position < 0); i++) { if(Ind[i] == col_in) position = l + i; } BarrierLoc } //--- SoftMax Grad if(position < out_dimension && position >= 0) { const int shift_out = RCtoFlat(row, position, total_rows, out_dimension, 0); value = IsNaNOrInf(outputs[shift_out], 0); grad = IsNaNOrInf(outputs_gr[shift_out], 0); } grad = LocalSoftMaxGrad(value, grad, 1, Temp); //--- result const int shift_in = RCtoFlat(row, col_in, total_rows, total_cols_in, 0); data_gr[shift_in] = grad; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FloatToSpike(__global float* values, __global const float* levels, __global float* outputs ) { const size_t id = get_global_id(0); float val = IsNaNOrInf(values[id], 0.0f); if(val == 0.0f) outputs[id] = 0.0f; else { const float lev = IsNaNOrInf(levels[id], 0.0f); 
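//--- below the firing level no spike is emitted; otherwise a unit spike is
//--- produced and only the sub-threshold residual is kept in values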
if(fabs(val) < lev) outputs[id] = 0.0f; else { outputs[id] = (float)sign(val); values[id] = IsNaNOrInf(sign(val) * (fabs(val) - lev), 0.0f); } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void FloatToSpikeGrad(__global const float* values, __global float* values_gr, __global float* levels_gr, __global const float* gradients ) { const size_t id = get_global_id(0); const float grad = IsNaNOrInf(gradients[id], 0.0f); values_gr[id] = grad; if(fabs(grad) > 0.0f) { float val = IsNaNOrInf(values[id], 0.0f); levels_gr[id] = (float)(-sign(val) * grad); } else levels_gr[id] = 0.0f; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SpikeMHAttention(__global const float *qkv, __global const float *diag_bias, __global float *scores, __global float *out, const int dimension, const int mask_future ) { //--- init const int q_id = get_global_id(0); const int k_id = get_local_id(1); const int h_id = get_global_id(2); const int total_q = get_global_size(0); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- Shifts const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, 3 * q_id); const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 3 * k_id + 1); const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * k_id + 2); const int shift_s = RCtoFlat(h_id, k_id, total_heads, total_q, q_id); const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, q_id); //--- Score float score = 0; if(mask_future == 0 || q_id <= k_id) { for(int d = 0; d < dimension; d++) { float q = IsNaNOrInf(qkv[shift_q + d], 0); if(q == 0) continue; float k = IsNaNOrInf(qkv[shift_k + d], 0); if(k == 0) continue; score += q * k; } } else score = MIN_VALUE; if(q_id == k_id) score += IsNaNOrInf(diag_bias[q_id], 0); //--- norm score score = LocalSoftMax(score, 1, temp); scores[shift_s] = score; //--- out for(int d = 0; d < dimension; d++) { float val = 0; if(score > 0) { float v = IsNaNOrInf(qkv[shift_v + d], 0); if(v != 0) val = v * score; } val = LocalSum(val, 1, temp); if(k_id == 0) out[shift_out + d] = val; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void SpikeMHAttentionGrad(__global const float *qkv, __global float *qkv_gr, __global const float *diag_bias, __global float *diag_bias_gr, __global const float *scores, __global const float *gradients, const int dimension, const int mask_future ) { //--- init const int global_id = get_global_id(0); const int local_id = get_local_id(1); const int h_id = get_global_id(2); const int total_global = get_global_size(0); const int total_local = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- Value Gradient global_id -> v_id, local_id -> q_id { //--- Shifts const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * global_id + 2); const int shift_s = RCtoFlat(h_id, global_id, total_heads, total_global, local_id); const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, local_id); for(int d = 0; d < dimension; d++) { float grad = 0; if(mask_future == 0 || local_id <= global_id) { float score = IsNaNOrInf(scores[shift_s], 0); if(score > 0) grad = IsNaNOrInf(score * gradients[shift_out + d], 0); } grad = LocalSum(grad, 
1, temp);
if(local_id == 0)
   qkv_gr[shift_v + d] = grad;
}
}
//--- Query Gradient global_id -> q_id, local_id -> k_id/v_id
{
//--- Shifts
const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, 3 * global_id);
const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 3 * local_id + 1);
const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * local_id + 2);
const int shift_s = RCtoFlat(h_id, local_id, total_heads, total_local, global_id);
const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
//--- 1. Score grad
float grad_s = 0;
if(mask_future == 0 || global_id <= local_id)
   for(int d = 0; d < dimension; d++)
   {
   float val = IsNaNOrInf(qkv[shift_v + d], 0);
   if(val == 0)
      continue;
   grad_s += IsNaNOrInf(val * gradients[shift_out + d], 0);
   }
//--- 2. SoftMax grad
grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
if(global_id == local_id)
   diag_bias_gr[global_id] = grad_s;
//--- 3. Query grad
for(int d = 0; d < dimension; d++)
{
float grad = 0;
if(mask_future == 0 || global_id <= local_id)
{
float key = IsNaNOrInf(qkv[shift_k + d], 0);
if(key != 0)
   grad = key * grad_s;
}
grad = LocalSum(grad, 1, temp);
if(local_id == 0)
   qkv_gr[shift_q + d] = grad;
}
}
//--- Key Gradient global_id -> k_id, local_id -> score_id/v_id/dimension
{
//--- Shifts
const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 3 * global_id + 1);
const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 3 * local_id + 2);
float grad = 0;
for(int q_id = 0; q_id < total_local; q_id++)
{
//--- 1. Score grad local_id -> score_id/v_id
float grad_s = 0;
const int shift_s = RCtoFlat(h_id, local_id, total_heads, total_local, q_id);
int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, 3 * q_id);
const int shift_g = RCtoFlat(h_id, 0, total_heads, dimension, q_id);   // output gradients are stored per query, without the QKV stride
if(mask_future == 0 || q_id <= local_id)
   for(int d = 0; d < dimension; d++)
   {
   float val = IsNaNOrInf(qkv[shift_v + d], 0);
   if(val == 0)
      continue;
   grad_s += IsNaNOrInf(val * gradients[shift_g + d], 0);
   }
//--- 2. SoftMax grad
grad_s = LocalSoftMaxGrad(scores[shift_s], grad_s, 1, temp);
BarrierLoc
if(global_id == local_id)
   temp[0] = grad_s;
BarrierLoc
grad_s = temp[0];
//--- 3. Key grad local_id -> dimension
if(local_id < dimension)
{
float query = IsNaNOrInf(qkv[shift_q + local_id], 0);
if(query != 0)
   grad += IsNaNOrInf(query * grad_s, 0);
}
}
if(local_id < dimension)
   qkv_gr[shift_k + local_id] = IsNaNOrInf(grad, 0);
}
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void STFS(__global const float* inputs,
                   __global const float* mask_time,
                   __global const float* mask_spatial,
                   __global float* outputs
                  )
{
const size_t time_id = get_global_id(0);
const size_t spat_id = get_global_id(1);
const size_t head = get_local_id(2);
const size_t total_times = get_global_size(0);
const size_t total_spats = get_global_size(1);
const size_t total_heads = get_local_size(2);
//---
__local float temp[3];
//---
const int shift_in = RCtoFlat(time_id, spat_id, total_times, total_spats, 1);
const int shift_out = RCtoFlat(time_id, spat_id, total_times, total_spats, head);
//---
switch(head)
{
case 0:
   temp[0] = IsNaNOrInf(inputs[shift_in], 0);
   break;
case 1:
   temp[1] = IsNaNOrInf(mask_time[time_id], 0);
   break;
case 2:
   temp[2] = IsNaNOrInf(mask_spatial[spat_id], 0);
   break;
}
BarrierLoc
float out = temp[0];
if(out != 0)
   switch(head)
   {
   case 1:
      out *= temp[1];
      break;
   case 2:
      out *= (1 - temp[1]);
      break;
   case 3:
      out *= temp[2];
      break;
   case 4:
      out *= (1 - temp[2]);
      break;
   }
//---
outputs[shift_out] = IsNaNOrInf(out, 0);
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void STFSGrad(__global float* inputs_gr,
                       __global const float* mask_time,
                       __global const float* mask_spatial,
                       __global const float* outputs_gr
                      )
{
const size_t time_id = get_global_id(0);
const size_t spat_id = get_global_id(1);
const size_t head = get_local_id(2);
const size_t total_times = get_global_size(0);
const size_t total_spats = get_global_size(1);
const size_t total_heads = get_local_size(2);
//---
__local float temp[5];
//---
const int shift_in = RCtoFlat(time_id, spat_id, total_times, total_spats, 1);
const int shift_out = RCtoFlat(time_id, spat_id, total_times, total_spats, head);
//---
switch(head)
{
case 0:
   temp[1] = IsNaNOrInf(mask_time[time_id], 0);
   break;
case 1:
   temp[2] = IsNaNOrInf(mask_spatial[spat_id], 0);
   break;
}
BarrierLoc
float grad = IsNaNOrInf(outputs_gr[shift_out], 0);
if(grad != 0)
   switch(head)
   {
   case 1:
      grad *= temp[1];
      break;
   case 2:
      grad *= (1 - temp[1]);
      break;
   case 3:
      grad *= temp[2];
      break;
   case 4:
      grad *= (1 - temp[2]);
      break;
   }
//---
grad = LocalSum(grad, 2, temp);
BarrierLoc
if(head == 0)
   inputs_gr[shift_in] = grad;
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void AddToStack(__global const float* inputs,
                         __global float* stack,
                         const int stack_size)
{
const size_t id = get_global_id(0);
const size_t loc_id = get_local_id(1);
const size_t var = get_global_id(2);
const size_t dimension = get_global_size(0);
const size_t total_loc = get_local_size(1);
const size_t variables = get_global_size(2);
//---
const int total = (stack_size - 1) / total_loc;
for(int i = total; i >= 0; i--)
{
float inp = 0;   // the stack holds floats, so the carrier must be float as well
if(i == 0 && loc_id == 0)
   inp = IsNaNOrInf(inputs[RCtoFlat(var, id, variables, dimension, 1)], 0);
else
   if((i * total_loc + loc_id) < stack_size)
   {
   int shift = RCtoFlat(i * total_loc + loc_id - 1, id, stack_size, dimension, var);
   inp = IsNaNOrInf(stack[shift], 0);
   }
BarrierLoc
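//--- after the barrier every work-item holds the element that was one row
//--- above; writing it one row deeper shifts the whole stack and frees
//--- row 0 for the newest input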
if((i * total_loc + loc_id) < stack_size) { int shift = RCtoFlat(i * total_loc + loc_id, id, stack_size, dimension, var); stack[shift] = inp; } } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void AggregationByTime(__global const float* inputs, __global const float* stack, __global float* outputs, const int stack_size, const int levels ) { const size_t id = get_global_id(0); const size_t var = get_global_id(1); const size_t dimension = get_global_size(0); const size_t variables = get_global_size(1); //--- float val = IsNaNOrInf(inputs[RCtoFlat(var, id, variables, dimension, 0)], 0); outputs[RCtoFlat(var, id, variables, dimension, 0)] = val; for(int l = 1; l < levels; l++) { int total = 1 << l; int start = total - 1; val /= total; for(int s = 0; s < total; s++) { if(s + start >= stack_size) continue; val += IsNaNOrInf(stack[RCtoFlat(var, id, variables * levels, dimension, start + s)] / total, 0); } outputs[RCtoFlat(var, id, variables, dimension, l)] = IsNaNOrInf(val, 0); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void AggregationByTimeGrad(__global float* inputs_gr, __global const float* outputs_gr, const int levels ) { const size_t id = get_global_id(0); const size_t var = get_global_id(1); const size_t dimension = get_global_size(0); const size_t variables = get_global_size(1); //--- float grad = 0; for(int l = 0; l < levels; l++) { int total = 1 << l; grad += IsNaNOrInf(outputs_gr[RCtoFlat(var, id, variables, dimension, l)] / total, 0); } inputs_gr[RCtoFlat(var, id, variables, dimension, 0)] = IsNaNOrInf(grad, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void GRU(__global const float* XH, __global const float* prev_state, __global float* outputs ) { const size_t id = get_global_id(0); const size_t d = get_global_id(1); const size_t units = get_global_size(0); const size_t dimension = get_global_size(1); //--- const float xz = IsNaNOrInf(XH[RCtoFlat(0, d, 6, dimension, id)], 0); const float xr = IsNaNOrInf(XH[RCtoFlat(1, d, 6, dimension, id)], 0); const float xh = IsNaNOrInf(XH[RCtoFlat(2, d, 6, dimension, id)], 0); const float hz = IsNaNOrInf(XH[RCtoFlat(3, d, 6, dimension, id)], 0); const float hr = IsNaNOrInf(XH[RCtoFlat(4, d, 6, dimension, id)], 0); const float hh = IsNaNOrInf(XH[RCtoFlat(5, d, 6, dimension, id)], 0); const float prev = IsNaNOrInf(prev_state[RCtoFlat(id, d, units, dimension, 0)], 0); //--- float r = fActivation(xr + hr, ActFunc_SIGMOID); float z = fActivation(xz + hz, ActFunc_SIGMOID); float ht = fActivation(r * hh + xh, ActFunc_TANH); float out = (1 - z) * prev + z * ht; //--- outputs[RCtoFlat(id, d, units, dimension, 0)] = IsNaNOrInf(out, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void GRU_Grad(__global const float* XH, __global float * XH_gr, __global const float* prev_state, __global const float* outputs_gr ) { const size_t id = get_global_id(0); const size_t d = get_global_id(1); const size_t units = get_global_size(0); const size_t dimension = get_global_size(1); //--- const float xz = IsNaNOrInf(XH[RCtoFlat(0, d, 6, dimension, id)], 0); const float xr = IsNaNOrInf(XH[RCtoFlat(1, d, 6, dimension, id)], 
0); const float xh = IsNaNOrInf(XH[RCtoFlat(2, d, 6, dimension, id)], 0); const float hz = IsNaNOrInf(XH[RCtoFlat(3, d, 6, dimension, id)], 0); const float hr = IsNaNOrInf(XH[RCtoFlat(4, d, 6, dimension, id)], 0); const float hh = IsNaNOrInf(XH[RCtoFlat(5, d, 6, dimension, id)], 0); const float prev = IsNaNOrInf(prev_state[RCtoFlat(id, d, units, dimension, 0)], 0); const float grad = IsNaNOrInf(outputs_gr[RCtoFlat(id, d, units, dimension, 0)], 0); //--- float r = fActivation(xr + hr, ActFunc_SIGMOID); float z = fActivation(xz + hz, ActFunc_SIGMOID); float ht = fActivation(r * hh + xh, ActFunc_TANH); //--- float ht_grad = IsNaNOrInf(grad * z, 0); float z_grad = IsNaNOrInf(grad * (ht - prev), 0); float xh_grad = Deactivation(ht_grad, ht, ActFunc_TANH); float hh_grad = IsNaNOrInf(xh_grad * r, 0); float r_grad = IsNaNOrInf(xh_grad * hh, 0); float xz_grad = Deactivation(z_grad, z, ActFunc_SIGMOID); float hz_grad = xz_grad; float xr_grad = Deactivation(r_grad, r, ActFunc_SIGMOID); float hr_grad = xr_grad; //--- XH_gr[RCtoFlat(0, d, 6, dimension, id)] = IsNaNOrInf(xz_grad, 0); XH_gr[RCtoFlat(1, d, 6, dimension, id)] = IsNaNOrInf(xr_grad, 0); XH_gr[RCtoFlat(2, d, 6, dimension, id)] = IsNaNOrInf(xh_grad, 0); XH_gr[RCtoFlat(3, d, 6, dimension, id)] = IsNaNOrInf(hz_grad, 0); XH_gr[RCtoFlat(4, d, 6, dimension, id)] = IsNaNOrInf(hr_grad, 0); XH_gr[RCtoFlat(5, d, 6, dimension, id)] = IsNaNOrInf(hh_grad, 0); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ScalarToVector(__global const float* scalar, __global const float* vector_in, __global float* vector_out ) { const size_t vec = get_global_id(0); const size_t d = get_global_id(1); const size_t vectors = get_global_size(0); const size_t dimension = get_global_size(1); //--- float sc = IsNaNOrInf(scalar[vec], 0.0f); int shift = RCtoFlat(vec, d, vectors, dimension, 0); float v = IsNaNOrInf(vector_in[shift], 0.0f); //--- vector_out[shift] = IsNaNOrInf(sc * v, 0.0f); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void ScalarToVectorGrad(__global const float* scalar, __global float* scalar_gr, __global const float* vector_in, __global float* vector_in_gr, __global float* vector_out_gr, const int dimension ) { const size_t vec = get_global_id(0); const size_t loc = get_local_id(1); const size_t vectors = get_global_size(0); const size_t total_loc = get_local_size(1); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- if(loc == 0) temp[0] = IsNaNOrInf(scalar[vec], 0.0f); BarrierLoc float sc = temp[0]; float sc_gr = 0; for(int d = loc; d < dimension; d += total_loc) { int shift = RCtoFlat(vec, d, vectors, dimension, 0); float v = IsNaNOrInf(vector_in[shift], 0.0f); float grad = IsNaNOrInf(vector_out_gr[shift], 0.0f); vector_in_gr[shift] = IsNaNOrInf(grad * sc, 0.0f); sc_gr += IsNaNOrInf(v * grad, 0.0f); } //--- sc_gr = LocalSum(sc_gr, 1, temp); if(loc == 0) scalar_gr[vec] = sc_gr; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void CalcFlow(__global const float* value, __global float* prev_value, __global float* flow ) { const size_t id = get_global_id(0); const size_t total = get_global_size(0); //--- const float v = IsNaNOrInf(value[id], 0); const float p = IsNaNOrInf(prev_value[id], 0); flow[id] = IsNaNOrInf(v - p, 0); 
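//--- remember the current value as the baseline for the next call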
prev_value[id] = v; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DilatedCorrelation(__global const float* feature, __global const int* shifts, __global float* correlations, const int dimension ) { const size_t main = get_global_id(0); const size_t loc = get_local_id(1); const size_t sh = get_global_id(2); const size_t units = get_global_size(0); const size_t total_loc = get_local_size(1); const size_t total_corr = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- const int slave = main + shifts[sh >> 1] * ((sh & 1) ? -1 : 1); if(slave < 0 || slave >= units) { if(loc == 0) correlations[RCtoFlat(main, sh, units, total_corr, 0)] = 0; return; } //--- float result = 0.0f; for(int d = loc; d < dimension; d += total_loc) { float value_main = IsNaNOrInf(feature[RCtoFlat(main, d, units, dimension, 0)], 0); float value_slave = IsNaNOrInf(feature[RCtoFlat(slave, d, units, dimension, 0)], 0); result += IsNaNOrInf(value_main * value_slave, 0); } result = LocalSum(result, 1, temp); //--- if(loc == 0) correlations[RCtoFlat(main, sh, units, total_corr, 0)] = result; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void DilatedCorrelationGrad(__global const float* feature, __global float* feature_gr, __global const int* shifts, __global const float* corr_gr, const int total_corr ) { const size_t id = get_global_id(0); const size_t loc = get_local_id(1); const size_t d = get_global_id(2); const size_t units = get_global_size(0); const size_t total_loc = get_local_size(1); const size_t dimension = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- float result = 0.0f; for(int sh = loc; sh < total_corr; sh += total_loc) { const int offset = shifts[sh >> 1]; const int sign = (sh & 1) ? 
-1 : +1;
// id -> main
int slave = id + sign * offset;
if(slave >= 0 && slave < units)
{
float g = corr_gr[RCtoFlat(id, sh, units, total_corr, 0)];
result += IsNaNOrInf(g * feature[RCtoFlat(slave, d, units, dimension, 0)], 0.0f);
}
// id -> slave
int main = id - sign * offset;
if(main >= 0 && main < units)
{
float g = corr_gr[RCtoFlat(main, sh, units, total_corr, 0)];
result += IsNaNOrInf(g * feature[RCtoFlat(main, d, units, dimension, 0)], 0.0f);
}
}
result = LocalSum(result, 1, temp);
if(loc == 0)
   feature_gr[RCtoFlat(id, d, units, dimension, 0)] = result;
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void DilatedDifference(__global const float* feature,
                                __global const int* shifts,
                                __global float* differences
                               )
{
const size_t main = get_global_id(0);
const size_t sh = get_global_id(1);
const size_t d = get_global_id(2);
const size_t units = get_global_size(0);
const size_t total_shifts = get_global_size(1);
const size_t dimension = get_global_size(2);
//---
const int slave = main + shifts[sh];
if(slave < 0 || slave >= units)
{
differences[RCtoFlat(main, d, units, dimension, sh)] = 0;
return;
}
//---
float value_main = IsNaNOrInf(feature[RCtoFlat(main, d, units, dimension, 0)], 0);
float value_slave = IsNaNOrInf(feature[RCtoFlat(slave, d, units, dimension, 0)], 0);
float result = value_main - value_slave;
//---
differences[RCtoFlat(main, d, units, dimension, sh)] = result;
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void DilatedDifferenceGrad(__global const float* feature,
                                    __global float* feature_gr,
                                    __global const int* shifts,
                                    __global const float* differences_gr,
                                    const int total_shifts
                                   )
{
const size_t id = get_global_id(0);
const size_t loc = get_local_id(1);
const size_t d = get_global_id(2);
const size_t units = get_global_size(0);
const size_t total_loc = get_local_size(1);
const size_t dimension = get_global_size(2);
//---
__local float temp[LOCAL_ARRAY_SIZE];
//---
float result = 0.0f;
for(int sh = loc; sh < total_shifts; sh += total_loc)
{
const int offset = shifts[sh];
// id -> main: differences[id, sh] = feature[id] - feature[id + offset]
int slave = id + offset;
if(slave >= 0 && slave < units)
   result += IsNaNOrInf(differences_gr[RCtoFlat(id, d, units, dimension, sh)], 0.0f);
// id -> slave: feature[id] is subtracted in differences[id - offset, sh]
int main = id - offset;
if(main >= 0 && main < units)
   result -= IsNaNOrInf(differences_gr[RCtoFlat(main, d, units, dimension, sh)], 0.0f);
}
result = LocalSum(result, 1, temp);
if(loc == 0)
   feature_gr[RCtoFlat(id, d, units, dimension, 0)] = result;
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void PerturbedMatrix(__global const float* inputs,
                              __global const float* perturb,
                              __global float* output,
                              const float perturb_mult)
{
const size_t id = get_global_id(0);
const size_t var = get_global_id(1);
const size_t total = get_global_size(0);
const size_t variables = get_global_size(1);
//---
int shift = RCtoFlat(var, id, variables, total, 0);
output[shift] = IsNaNOrInf(inputs[shift] + perturb[shift] * perturb_mult, 0.0f);
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void PerturbedMatrixGrad(__global float* inputs_gr,
                                  __global float* perturb_gr,
                                  __global const float* output_gr,
                                  const float perturb_mult)
{
const size_t id = get_global_id(0);
const size_t var =
get_global_id(1); const size_t total = get_global_size(0); const size_t variables = get_global_size(1); //--- int shift = RCtoFlat(var, id, variables, total, 0); float grad = IsNaNOrInf(output_gr[shift], 0.0f); inputs_gr[shift] = grad; perturb_gr[shift] = IsNaNOrInf(perturb_mult * grad, 0.0f); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void LinearUpsample(__global const float* data, __global float* upsample) { const size_t id_ltr = get_global_id(0); const size_t var = get_global_id(1); const size_t id_htr = get_global_id(2); const size_t total = get_global_size(0); const size_t variables = get_global_size(1); const size_t dimension_htr = get_global_size(2); //--- const float ltr = IsNaNOrInf(data[RCtoFlat(id_ltr, var, total, variables, 0)], 0.0f); const float prev_ltr = (id_ltr > 0 ? IsNaNOrInf(data[RCtoFlat(id_ltr - 1, var, total, variables, 0)], 0.0f) : 0.0f); const float htr = (id_htr < (dimension_htr - 1) ? (float)id_htr / (float)(dimension_htr - 1) * (ltr - prev_ltr) + prev_ltr : ltr); //--- upsample[RCtoFlat(id_htr, var, dimension_htr, variables, id_ltr)] = IsNaNOrInf(htr, 0.0f); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void LinearUpsampleGrad(__global float* data_gr, __global const float* upsample_gr, const int dimension_htr) { const size_t id_ltr = get_global_id(0); const size_t var = get_global_id(1); const size_t id_loc = get_local_id(2); const size_t total = get_global_size(0); const size_t variables = get_global_size(1); const size_t total_loc = get_local_size(2); float grad = 0.0f; //--- __local float temp[LOCAL_ARRAY_SIZE]; // --- main ltr { for(int id_htr = id_loc; id_htr < dimension_htr; id_htr += total_loc) { const float g = upsample_gr[RCtoFlat(id_htr, var, dimension_htr, variables, id_ltr)]; if(id_htr < dimension_htr - 1) { const float t = (float)id_htr / (float)(dimension_htr - 1); grad += g * t; } else grad += g; } } // --- prev ltr if(id_ltr + 1 < total) for(int id_htr = id_loc; id_htr < dimension_htr; id_htr += total_loc) if(id_htr < dimension_htr - 1) { const float g = upsample_gr[RCtoFlat(id_htr, var, dimension_htr, variables, id_ltr + 1)]; const float t = (float)id_htr / (float)(dimension_htr - 1); grad += g * (1.0f - t); } // --- grad = LocalSum(grad, 2, temp); if(id_loc == 0) data_gr[RCtoFlat(id_ltr, var, total, variables, 0)] = grad; } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MixExpertsPredict(__global const float4* __attribute__((aligned(16))) experts, __global float* outputs ) { const size_t id = get_global_id(0); //--- float4 expert = experts[id]; float mu = IsNaNOrInf(expert.s0, 0.0f); float alpha = fActivation(expert.s1, ActFunc_SoftPlus); float sigma = fActivation(expert.s2, ActFunc_SoftPlus); float txi = fActivation(expert.s3, ActFunc_TANH); float out = mu + alpha * sigma * txi; //--- outputs[id] = IsNaNOrInf(out, 0.0f); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MixExpertsPredictGrad(__global const float4* __attribute__((aligned(16))) experts, __global float4* __attribute__((aligned(16))) experts_gr, __global const float* outputs_gr ) { const size_t id = get_global_id(0); //--- float4 
expert = experts[id]; float grad = IsNaNOrInf(outputs_gr[id], 0.0f); float4 expert_gr = (float4)0.0f; //--- float alpha = fActivation(expert.s1, ActFunc_SoftPlus); float sigma = fActivation(expert.s2, ActFunc_SoftPlus); float txi = fActivation(expert.s3, ActFunc_TANH); //--- float mu_grad = grad; float alpha_grad = Deactivation(grad * sigma * txi, alpha, ActFunc_SoftPlus); float sigma_grad = Deactivation(grad * alpha * txi, sigma, ActFunc_SoftPlus); float txi_grad = Deactivation(grad * sigma * alpha, txi, ActFunc_TANH); //--- experts_gr[id] = (float4)(mu_grad, alpha_grad, sigma_grad, txi_grad); } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHFAT(__global const float *q, __global const float *kv, __global const float *scale, __global float *scores, __global float *out, const int dimension, const int mask_future ) { //--- init const int q_id = get_global_id(0); const int k_id = get_local_id(1); const int h_id = get_global_id(2); const int total_q = get_global_size(0); const int total_k = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- Shifts const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id); const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id); const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id + 1); const int shift_s = RCtoFlat(h_id, k_id, total_heads, total_k, q_id); const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, q_id); //--- Score float score = 0; if(mask_future == 0 || q_id <= k_id) { float sc = IsNaNOrInf(scale[shift_s], 0.0f); if(sc != 0) { for(int d = 0; d < dimension; d++) { float q_ = IsNaNOrInf(q[shift_q + d], 0.0f); if(q_ == 0) continue; float k = IsNaNOrInf(kv[shift_k + d], 0.0f); if(k == 0) continue; score += q_ * k; } score *= sc; } else score = MIN_VALUE; } else score = MIN_VALUE; //--- norm score score = LocalSoftMax(score, 1, temp); scores[shift_s] = score; //--- out for(int d = 0; d < dimension; d++) { float val = 0; if(score > 0) { float v = IsNaNOrInf(kv[shift_v + d], 0); if(v != 0) val = v * score; } val = LocalSum(val, 1, temp); if(k_id == 0) out[shift_out + d] = val; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHFATGrad(__global const float *q, __global float *q_gr, __global const float *kv, __global float *kv_gr, __global const float *scale, __global float *scale_gr, __global const float *scores, __global const float *gradients, const int dimension, const int total_k, const int mask_future ) { //--- init const int global_id = get_global_id(0); const int local_id = get_local_id(1); const int h_id = get_global_id(2); const int total_global = get_global_size(0); const int total_local = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; //--- Value Gradient global_id -> v_id, local_id -> q_id for(int d = 0; d < dimension; d++) { for(int v_id = global_id; v_id < total_k; v_id += total_global) { float grad = 0; //--- Shifts const int shift_v = RCtoFlat(h_id, d, total_heads, dimension, 2 * v_id + 1); for(int q_id = 0; q_id < total_global; q_id += total_local) { const int shift_s = RCtoFlat(h_id, v_id, total_heads, total_k, q_id + local_id); const int shift_out = RCtoFlat(h_id, d, total_heads, dimension, q_id + local_id); if((q_id + local_id) < 
total_global)
   if(mask_future == 0 || (q_id + local_id) <= v_id)
   {
   float score = IsNaNOrInf(scores[shift_s], 0.0f);
   if(score > 0)
      grad += IsNaNOrInf(score * gradients[shift_out], 0.0f);
   }
}
grad = LocalSum(grad, 1, temp);
if(local_id == 0)
   kv_gr[shift_v] = grad;
}
}
//--- Query Gradient global_id -> q_id, local_id -> k_id/v_id
for(int d_q = 0; d_q < dimension; d_q++)
{
//--- Shifts
const int shift_q = RCtoFlat(h_id, d_q, total_heads, dimension, global_id);
const int shift_out = RCtoFlat(h_id, 0, total_heads, dimension, global_id);
float grad = 0;
for(int id = 0; id < total_k; id += total_local)
{
int k_id = id + local_id;
const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id);
const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id + 1);
const int shift_s = RCtoFlat(h_id, k_id, total_heads, total_k, global_id);
//--- 1. Score grad
float grad_s = 0;
float score = 0;
float sc = 0;
if(k_id < total_k)
{
if(mask_future == 0 || global_id <= k_id)   // mask against the key index, not the local id
   for(int d = 0; d < dimension; d++)
   {
   float val = IsNaNOrInf(kv[shift_v + d], 0);
   if(val == 0.0f)
      continue;
   grad_s += IsNaNOrInf(val * gradients[shift_out + d], 0);
   }
score = scores[shift_s];
sc = IsNaNOrInf(scale[shift_s], 0.0f);
}
//--- 2. SoftMax grad
grad_s = LocalSoftMaxGrad(score, grad_s, 1, temp);
float grad_sc = LocalSum(score * grad_s, 1, temp);
if(local_id == 0 && k_id < total_k)
{
if(sc != 0.0f)
   scale_gr[shift_s] = IsNaNOrInf(grad_sc * log(scores[shift_s]) / sc, 0.0f);
else
   scale_gr[shift_s] = grad_sc;
}
grad_s *= sc;
//--- 3. Query grad
if(grad_s != 0.0f)
   if(mask_future == 0 || global_id <= k_id)
   {
   float key = IsNaNOrInf(kv[shift_k + d_q], 0.0f);
   if(key != 0.0f)
      grad += key * grad_s;
   }
}
grad = LocalSum(grad, 1, temp);
if(local_id == 0)
   q_gr[shift_q] = grad;
}
//--- Key Gradient global_id -> k_id, local_id -> score_id/v_id/dimension
for(int k_id = global_id; k_id < total_k; k_id += total_global)
{
//--- Shifts
const int shift_k = RCtoFlat(h_id, 0, total_heads, dimension, 2 * k_id);
const int shift_v = RCtoFlat(h_id, 0, total_heads, dimension, 2 * local_id + 1);
float grad = 0;
for(int q_id = 0; q_id < total_global; q_id++)
{
//--- 1. Score grad local_id -> score_id/v_id
float grad_s = 0;
const int shift_s = RCtoFlat(h_id, local_id, total_heads, total_k, q_id);
int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
float score = 0;
float sc = 0;
if(local_id < total_k)
{
if(mask_future == 0 || q_id <= local_id)
   for(int d = 0; d < dimension; d++)
   {
   float val = IsNaNOrInf(kv[shift_v + d], 0);
   if(val == 0)
      continue;
   grad_s += IsNaNOrInf(val * gradients[shift_q + d], 0);
   }
score = scores[shift_s];
sc = IsNaNOrInf(scale[shift_s], 0.0f);
}
//--- 2. SoftMax grad
grad_s = LocalSoftMaxGrad(score, grad_s, 1, temp);   // use the guarded score
grad_s *= sc;
//--- 3. Key grad local_id -> dimension
BarrierLoc
if(local_id == k_id)   // broadcast this key's score gradient to the whole work group
   temp[0] = grad_s;
BarrierLoc
grad_s = temp[0];
if(local_id < dimension)
{
float query = IsNaNOrInf(q[shift_q + local_id], 0);
if(query != 0)
   grad += IsNaNOrInf(query * grad_s, 0);
}
}
if(local_id < dimension)
   kv_gr[shift_k + local_id] = IsNaNOrInf(grad, 0);
}
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SparseConcatenate(__global const float *sparse_index,
                                __global const float *sparse_data,
                                __global const float *full,
                                __global float *result,
                                const int full_rows
                               )
{
const size_t sparse_row = get_global_id(0);
const size_t sparse_col = get_global_id(1);
const size_t full_col = get_global_id(2);
const size_t sparse_rows = get_global_size(0);
const size_t sparse_cols = get_global_size(1);
const size_t full_cols = get_global_size(2);
//---
const int shift_sparse = RCtoFlat(sparse_row, sparse_col, sparse_rows, sparse_cols, 0);
const int full_row = sparse_index[shift_sparse];
const int shift_full = RCtoFlat(full_row, full_col, full_rows, full_cols, 0);
const int shift_out = RCtoFlat(sparse_col, full_col, sparse_cols, full_cols, sparse_row);
//---
float res = (full_row >= 0 && full_row < full_rows ? IsNaNOrInf(sparse_data[shift_sparse] * full[shift_full], 0) : 0.0f);
result[shift_out] = res;
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void SparseConcatenateGrad(__global const float *sparse_index,
                                    __global const float *sparse_data,
                                    __global float *sparse_gr,
                                    __global const float *full,
                                    __global float *full_gr,
                                    __global const float *result_gr,
                                    const int sparse_rows,
                                    const int sparse_cols,
                                    const int full_rows,
                                    const int full_cols
                                   )
{
const size_t row_id = get_global_id(0);
const size_t local_id = get_local_id(1);
const size_t col_id = get_global_id(2);
const size_t total_rows = get_global_size(0);
const size_t total_local = get_local_size(1);
const size_t total_cols = get_global_size(2);
//---
__local float Temp[LOCAL_ARRAY_SIZE];
//--- Calc sparse gradient
if(row_id < sparse_rows && col_id < sparse_cols)
{
float grad = 0;
int shift_sparse = 0;
if(local_id == 0)
{
shift_sparse = RCtoFlat(row_id, col_id, sparse_rows, sparse_cols, 0);
Temp[0] = sparse_index[shift_sparse];
}
BarrierLoc
uint full_row = (uint)Temp[0];
if(full_row < (uint)full_rows)
   for(int i = local_id; i < full_cols; i += total_local)
   {
   int shift_out = RCtoFlat(col_id, i, sparse_cols, full_cols, row_id);
   int shift_full = RCtoFlat(full_row, i, full_rows, full_cols, 0);
   grad += IsNaNOrInf(result_gr[shift_out] * full[shift_full], 0.0f);
   }
grad = LocalSum(grad, 1, Temp);
if(local_id == 0)
   sparse_gr[shift_sparse] = grad;
}
//--- Calc full gradient
if(row_id < full_rows && col_id < full_cols)
{
float grad = 0;
for(int r = 0; r < sparse_rows; r++)
{
float s = 0;
for(int c = local_id; c < sparse_cols; c += total_local)
{
int shift_sparse = RCtoFlat(r, c, sparse_rows, sparse_cols, 0);
if((uint)sparse_index[shift_sparse] == (uint)row_id)
{
s = sparse_data[shift_sparse];
int shift_out = RCtoFlat(c, col_id, sparse_cols, full_cols, r);
grad += IsNaNOrInf(s * result_gr[shift_out], 0.0f);
break;
}
}
}
grad = LocalSum(grad, 1, Temp);
if(local_id == 0)
{
int shift_full = RCtoFlat(row_id, col_id, full_rows, full_cols, 0);
full_gr[shift_full] = grad;
}
}
}
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
__kernel void
MHFlashAttention(__global const float *query,
                 __global const float *key_value,
                 __global float *logsumexp,
                 __global float *output,
                 const int dimension,
                 const int total_kv,
                 const int mask_future
                )
{
//--- init
const int q_id = get_global_id(0);
const int local_id = get_local_id(1);
const int h_id = get_global_id(2);
const int total_q = get_global_size(0);
const int total_loc = get_local_size(1);
const int total_heads = get_global_size(2);
//---
__local float temp[LOCAL_ARRAY_SIZE];
__local float4 temp4[LOCAL_ARRAY_SIZE];
//---
const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id);
float prev_max = MIN_VALUE;
float sumexp = 0;
float out = 0;
for(int id = 0; id < total_kv; id += total_loc)   // the block offset starts at 0; local_id is added once below
{
int k_id = id + local_id;
const int shift_k = RCtoFlat(h_id, 0, 2 * total_heads, dimension, k_id);
const int shift_v = RCtoFlat(h_id + total_heads, 0, 2 * total_heads, dimension, k_id);
//--- Score
float score = 0;
if(k_id < total_kv && (mask_future == 0 || q_id <= k_id))
{
for(int d = 0; d < dimension; d += 4)
{
float4 q = IsNaNOrInf4((float4)(
   (d < dimension ? query[shift_q + d] : 0.0f),
   ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
   ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
   ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
), 0.0f);
float4 k = IsNaNOrInf4((float4)(
   (d < dimension ? key_value[shift_k + d] : 0.0f),
   ((d + 1) < dimension ? key_value[shift_k + d + 1] : 0.0f),
   ((d + 2) < dimension ? key_value[shift_k + d + 2] : 0.0f),
   ((d + 3) < dimension ? key_value[shift_k + d + 3] : 0.0f)
), 0.0f);
score += IsNaNOrInf(dot(q, k), 0.0f);
}
score /= sqrt((float)dimension);
}
else
   score = MIN_VALUE;
//--- norm score
float max = fmax(prev_max, LocalMax(score, 1, temp));
if(score > MIN_VALUE)
   score = exp(score - max);
else
   score = 0.0f;
if(sumexp == 0.0f)
   sumexp = LocalSum(score, 1, temp);
else
   sumexp = IsNaNOrInf(exp(prev_max - max) * sumexp + LocalSum(score, 1, temp), 0.0f);
for(int d = 0; d < dimension; d += 4)
{
float4 val = (float4)0.0f;
if(score > 0.0f && k_id < total_kv)
{
float4 v = (float4)(
   (d < dimension ? key_value[shift_v + d] : 0.0f),
   ((d + 1) < dimension ? key_value[shift_v + d + 1] : 0.0f),
   ((d + 2) < dimension ? key_value[shift_v + d + 2] : 0.0f),
   ((d + 3) < dimension ?
key_value[shift_v + d + 3] : 0.0f) ); val = IsNaNOrInf4(v * score, 0.0f); } val = LocalSum4(val, 1, temp4); int idx = local_id - d; if(idx >= 0 && idx < 4) { if(out != 0.0f) out = IsNaNOrInf(exp(prev_max - max) * out + val[idx], 0.0f); else out = val[idx]; } } prev_max = max; } if(local_id < dimension) { if(sumexp > 0.0f) output[shift_q + local_id] = IsNaNOrInf(out / sumexp, 0.0f); else output[shift_q + local_id] = 0.0f; } if(local_id == 0) { int shift_logse = RCtoFlat(q_id, h_id, total_q, total_heads, 0); if(sumexp > 0.0f) logsumexp[shift_logse] = IsNaNOrInf(prev_max + log(sumexp), 0.0f); else logsumexp[shift_logse] = 0.0f; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHFlashAttentionGrad(__global const float *query, __global float *query_gr, __global const float *key_value, __global float *key_value_gr, __global const float *logsumexp, __global const float *output, __global const float *output_gr, const int dimension, const int total_q, const int total_kv, const int mask_future ) { const int id = get_global_id(0); const int d_id = get_local_id(1); const int h_id = get_global_id(2); const int total_heads = get_global_size(2); __local float temp[LOCAL_ARRAY_SIZE]; //--- Query gradient: dQ[q, d] if(id < total_q) { const int q_id = id; const int shift_q = RCtoFlat(h_id, d_id, total_heads, dimension, q_id); const int shift_lse = RCtoFlat(q_id, h_id, total_q, total_heads, 0); const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f); const float q_d = IsNaNOrInf(query[shift_q], 0.0f); const float g_d = IsNaNOrInf(output_gr[shift_q], 0.0f); const float o_d = IsNaNOrInf(output[shift_q], 0.0f); const float D = LocalSum(IsNaNOrInf(g_d * o_d, 0.0f), 1, temp); float grad_q = 0.0f; for(int k_id = 0; k_id < total_kv; k_id++) { if(mask_future != 0 && q_id > k_id) continue; const int shift_k = RCtoFlat(h_id, d_id, 2 * total_heads, dimension, k_id); const int shift_v = RCtoFlat(h_id + total_heads, d_id, 2 * total_heads, dimension, k_id); const float k_d = IsNaNOrInf(key_value[shift_k], 0.0f); const float v_d = IsNaNOrInf(key_value[shift_v], 0.0f); const float s = LocalSum(IsNaNOrInf(q_d * k_d, 0.0f), 1, temp) / sqrt((float)dimension); const float p = IsNaNOrInf(exp(clamp(s - lse, -120.0f, 0.0f)), 0.0f); if(p == 0.0f) continue; const float dp = LocalSum(IsNaNOrInf(g_d * v_d, 0.0f), 1, temp); const float ds = IsNaNOrInf(p * (dp - D), 0.0f); grad_q += IsNaNOrInf(k_d * ds, 0.0f); } query_gr[shift_q] = IsNaNOrInf(grad_q, 0.0f); } //--- Key & Value gradients: dK[k, d], dV[k, d] if(id < total_kv) { const int k_id = id; const int shift_k = RCtoFlat(h_id, d_id, 2 * total_heads, dimension, k_id); const int shift_v = RCtoFlat(h_id + total_heads, d_id, 2 * total_heads, dimension, k_id); const float k_d = IsNaNOrInf(key_value[shift_k], 0.0f); const float v_d = IsNaNOrInf(key_value[shift_v], 0.0f); float grad_k = 0.0f; float grad_v = 0.0f; for(int q_id = 0; q_id < total_q; q_id++) { if(mask_future != 0 && q_id > k_id) continue; const int shift_q = RCtoFlat(h_id, d_id, total_heads, dimension, q_id); const int shift_lse = RCtoFlat(q_id, h_id, total_q, total_heads, 0); const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f); const float q_d = IsNaNOrInf(query[shift_q], 0.0f); const float g_d = IsNaNOrInf(output_gr[shift_q], 0.0f); const float o_d = IsNaNOrInf(output[shift_q], 0.0f); const float D = LocalSum(IsNaNOrInf(g_d * o_d, 0.0f), 1, temp); const float s = LocalSum(IsNaNOrInf(q_d * k_d, 0.0f), 1, 
temp) / sqrt((float)dimension); const float p = IsNaNOrInf(exp(clamp(s - lse, -120.0f, 0.0f)), 0.0f); if(p == 0.0f) continue; const float dp = LocalSum(IsNaNOrInf(g_d * v_d, 0.0f), 1, temp); const float ds = IsNaNOrInf(p * (dp - D), 0.0f); grad_k += IsNaNOrInf(q_d * ds, 0.0f); grad_v += IsNaNOrInf(p * g_d, 0.0f); } key_value_gr[shift_k] = IsNaNOrInf(grad_k, 0.0f); key_value_gr[shift_v] = IsNaNOrInf(grad_v, 0.0f); } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHFlashSTCA(__global const float *query, __global const float *X, __global float *logsumexp, __global float *output, const int dimension, const int total_X, const int mask_future ) { //--- init const int q_id = get_global_id(0); const int local_id = get_local_id(1); const int h_id = get_global_id(2); const int total_q = get_global_size(0); const int total_loc = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; __local float4 temp4[LOCAL_ARRAY_SIZE]; //--- const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id); float prev_max = MIN_VALUE; float sumexp = 0; float out = 0; for(int id = 0; id < total_X; id += total_loc) { int x_id = id + local_id; const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0); //--- Score float score = 0; if(x_id < total_X && (mask_future == 0 || q_id <= x_id)) { for(int d = 0; d < dimension; d += 4) { float4 q = IsNaNOrInf4((float4)( (d < dimension ? query[shift_q + d] : 0.0f), ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f), ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f), ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f) ), 0.0f); float4 k = IsNaNOrInf4((float4)( (d < dimension ? X[shift_x + d] : 0.0f), ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f), ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f), ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f) ), 0.0f); score += IsNaNOrInf(dot(q, k), 0.0f); } score /= sqrt((float)dimension); } else score = MIN_VALUE; //--- norm score float max = fmax(prev_max, LocalMax(score, 1, temp)); if(score > MIN_VALUE) score = exp(score - max); else score = 0.0f; if(sumexp == 0.0f) sumexp = LocalSum(score, 1, temp); else sumexp = IsNaNOrInf(exp(prev_max - max) * sumexp, 0.0f) + LocalSum(score, 1, temp); for(int d = 0; d < dimension; d += 4) { float4 val = (float4)0.0f; if(score > 0.0f && x_id < total_X) { float4 v = IsNaNOrInf4((float4)( (d < dimension ? X[shift_x + d] : 0.0f), ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f), ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f), ((d + 3) < dimension ? 
X[shift_x + d + 3] : 0.0f) ), 0.0f); val = IsNaNOrInf4(v * score, 0.0f); } val = LocalSum4(val, 1, temp4); float add = 0.0f; int idx = local_id - d; if(idx >= 0 && idx < 4) { if(out != 0.0f) out = IsNaNOrInf(exp(prev_max - max) * out + val[idx], 0.0f); else out = val[idx]; } } prev_max = max; } if(local_id < dimension) { if(sumexp > 0.0f) output[shift_q + local_id] = IsNaNOrInf(out / sumexp, 0.0f); else output[shift_q + local_id] = 0.0f; } if(local_id == 0) { int shift_logse = RCtoFlat(q_id, h_id, total_q, total_heads, 0); if(sumexp > 0.0f) logsumexp[shift_logse] = IsNaNOrInf(prev_max + log(sumexp), 0.0f); else logsumexp[shift_logse] = 0.0f; } } //+------------------------------------------------------------------+ //| | //+------------------------------------------------------------------+ __kernel void MHFlashSTCAGrad(__global const float *query, __global float *query_gr, __global const float *X, __global float *X_gr, __global const float *logsumexp, __global const float *output, __global const float *output_gr, const int dimension, const int total_q, const int total_X, const int mask_future ) { const int id = get_global_id(0); const int local_id = get_local_id(1); const int h_id = get_global_id(2); const int total_loc = get_local_size(1); const int total_heads = get_global_size(2); //--- __local float temp[LOCAL_ARRAY_SIZE]; __local float4 temp4[LOCAL_ARRAY_SIZE]; //--- Query gradient: dQ[q, d] if(id < total_q) { float grad_q = 0.0f; const int q_id = id; const int shift_q = RCtoFlat(h_id, 0, total_heads, dimension, q_id); const int shift_lse = RCtoFlat(q_id, h_id, total_q, total_heads, 0); const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f); float D = 0; for(int d = 0; d < dimension; d += 4) { float4 g_d = IsNaNOrInf4((float4)( (d < dimension ? output_gr[shift_q + d] : 0.0f), ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f), ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f), ((d + 3) < dimension ? output_gr[shift_q + d + 3] : 0.0f) ), 0.0f); float4 o_d = IsNaNOrInf4((float4)( (d < dimension ? output[shift_q + d] : 0.0f), ((d + 1) < dimension ? output[shift_q + d + 1] : 0.0f), ((d + 2) < dimension ? output[shift_q + d + 2] : 0.0f), ((d + 3) < dimension ? output[shift_q + d + 3] : 0.0f) ), 0.0f); D += IsNaNOrInf(dot(g_d, o_d), 0.0f); } for(int l_id = 0; l_id < total_X; l_id += total_loc) { int x_id = l_id + local_id; float ds = 0; if(x_id < total_X && (mask_future == 0 || q_id <= x_id)) { const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0); float score = 0; float dp = 0; for(int d = 0; d < dimension; d += 4) { float4 q_d = IsNaNOrInf4((float4)( (d < dimension ? query[shift_q + d] : 0.0f), ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f), ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f), ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f) ), 0.0f); float4 x_d = IsNaNOrInf4((float4)( (d < dimension ? X[shift_x + d] : 0.0f), ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f), ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f), ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f) ), 0.0f); score += IsNaNOrInf(dot(q_d, x_d), 0.0f); float4 g_d = IsNaNOrInf4((float4)( (d < dimension ? output_gr[shift_q + d] : 0.0f), ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f), ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f), ((d + 3) < dimension ? 
output_gr[shift_q + d + 3] : 0.0f)
), 0.0f);
dp += IsNaNOrInf(dot(g_d, x_d), 0.0f);
}
score /= sqrt((float)dimension);
const float p = IsNaNOrInf(exp(clamp(score - lse, -120.0f, 0.0f)), 0.0f);
ds = IsNaNOrInf(p * (dp - D), 0.0f);
}
for(int d = 0; d < dimension; d += 4)
{
float4 x_d = (float4)0;
if(x_id < total_X && (mask_future == 0 || q_id <= x_id))
{
const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0);
x_d = IsNaNOrInf4((float4)(
   (d < dimension ? X[shift_x + d] : 0.0f),
   ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
   ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
   ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
), 0.0f);
}
float4 q_dg = LocalSum4(x_d * ds, 1, temp4);
int idx = local_id - d;
if(idx >= 0 && idx < 4)
   grad_q += q_dg[idx];
}
}
if(local_id < dimension)
   query_gr[shift_q + local_id] = IsNaNOrInf(grad_q, 0.0f);
}
//--- X gradients: dX[k, d]
if(id < total_X && h_id == 0)
{
float grad_X = 0.0f;
const int x_id = id;
const int shift_x = RCtoFlat(x_id, 0, total_X, dimension, 0);
for(int l_id = 0; l_id < total_q * total_heads; l_id += total_loc)
{
int loc = l_id + local_id;
int h = loc / total_q;
int q_id = loc % total_q;
float ds = 0;
float p = 0;
if(h < total_heads && q_id < total_q && (mask_future == 0 || q_id <= x_id))
{
const int shift_lse = RCtoFlat(q_id, h, total_q, total_heads, 0);
const float lse = IsNaNOrInf(logsumexp[shift_lse], 0.0f);
const int shift_q = RCtoFlat(h, 0, total_heads, dimension, q_id);
float score = 0;
float D = 0;
float dp = 0;
for(int d = 0; d < dimension; d += 4)
{
float4 q_d = IsNaNOrInf4((float4)(
   (d < dimension ? query[shift_q + d] : 0.0f),
   ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
   ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
   ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
), 0.0f);
float4 x_d = IsNaNOrInf4((float4)(
   (d < dimension ? X[shift_x + d] : 0.0f),
   ((d + 1) < dimension ? X[shift_x + d + 1] : 0.0f),
   ((d + 2) < dimension ? X[shift_x + d + 2] : 0.0f),
   ((d + 3) < dimension ? X[shift_x + d + 3] : 0.0f)
), 0.0f);
score += IsNaNOrInf(dot(q_d, x_d), 0.0f);
float4 g_d = IsNaNOrInf4((float4)(
   (d < dimension ? output_gr[shift_q + d] : 0.0f),
   ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f),
   ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f),
   ((d + 3) < dimension ? output_gr[shift_q + d + 3] : 0.0f)
), 0.0f);
dp += IsNaNOrInf(dot(g_d, x_d), 0.0f);
float4 o_d = IsNaNOrInf4((float4)(
   (d < dimension ? output[shift_q + d] : 0.0f),
   ((d + 1) < dimension ? output[shift_q + d + 1] : 0.0f),
   ((d + 2) < dimension ? output[shift_q + d + 2] : 0.0f),
   ((d + 3) < dimension ? output[shift_q + d + 3] : 0.0f)
), 0.0f);
D += IsNaNOrInf(dot(g_d, o_d), 0.0f);
}
score /= sqrt((float)dimension);   // same scaling as the forward pass
p = IsNaNOrInf(exp(clamp(score - lse, -120.0f, 0.0f)), 0.0f);   // assign the outer p: it feeds the g_d * p value term below
if(p != 0.0f)
   ds = IsNaNOrInf(p * (dp - D), 0.0f);
}
//---
for(int d = 0; d < dimension; d += 4)
{
float4 q_d = (float4)0;
float4 g_d = (float4)0;
if(h < total_heads && q_id < total_q && (mask_future == 0 || q_id <= x_id))
{
const int shift_q = RCtoFlat(h, 0, total_heads, dimension, q_id);
q_d = IsNaNOrInf4((float4)(
   (d < dimension ? query[shift_q + d] : 0.0f),
   ((d + 1) < dimension ? query[shift_q + d + 1] : 0.0f),
   ((d + 2) < dimension ? query[shift_q + d + 2] : 0.0f),
   ((d + 3) < dimension ? query[shift_q + d + 3] : 0.0f)
), 0.0f);
g_d = IsNaNOrInf4((float4)(
   (d < dimension ? output_gr[shift_q + d] : 0.0f),
   ((d + 1) < dimension ? output_gr[shift_q + d + 1] : 0.0f),
   ((d + 2) < dimension ? output_gr[shift_q + d + 2] : 0.0f),
   ((d + 3) < dimension ?
output_gr[shift_q + d + 3] : 0.0f) ), 0.0f); } float4 x_dg = LocalSum4(q_d * ds + g_d * p, 1, temp4); int idx = local_id - d; if(idx >= 0 && idx < 4) grad_X += x_dg[idx]; } } if(local_id < dimension) X_gr[shift_x + local_id] = IsNaNOrInf(grad_X, 0.0f); } } //+------------------------------------------------------------------+
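//--- Reference note, derived from the kernels above: MHFlashAttention and
//--- MHFlashSTCA accumulate SoftMax block-wise with the running-max identity.
//--- With block maximum m_new = max(m_old, max_i(s_i)):
//---    S_new = exp(m_old - m_new) * S_old + Sum_i exp(s_i - m_new)
//---    O_new = exp(m_old - m_new) * O_old + Sum_i exp(s_i - m_new) * v_i
//--- and on exit out = O / S, logsumexp = m + log(S). The backward passes
//--- rebuild p = exp(s - logsumexp) from the stored logsumexp and apply
//--- ds = p * (dp - D), where D = Sum_d(out_gr * out).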