NeuroBook/Include/realization/opencl_program.cl

//+------------------------------------------------------------------+
//| opencl_program.cl |
//| Copyright 2021, MetaQuotes Ltd. |
//| https://www.mql5.com |
//+------------------------------------------------------------------+
//--- by default, some GPUs do not support the double type
//--- the cl_khr_fp64 extension is enabled to allow work with double values
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
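//--- NOTE: the TYPE, TYPE4 and LOCAL_SIZE identifiers are not defined in this file;
//--- they are assumed to be supplied by the host application as build-time macros
//--- when the program is compiled, for example (illustrative options only):
//---   -D TYPE=float -D TYPE4=float4 -D LOCAL_SIZE=256
//--- with cl_khr_fp64 enabled, double and double4 may be substituted instead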
//+------------------------------------------------------------------+
//| Activation functions |
//+------------------------------------------------------------------+
//| Linear activation function |
//| Parameter 'inputs' weighted sum of initial data |
//| 'a' defines the slope of the line |
//| 'b' defines the vertical offset of the line |
//+------------------------------------------------------------------+
__kernel void LineActivation(__global TYPE* inputs,
__global TYPE* outputs,
const TYPE a, const TYPE b)
{
size_t i = get_global_id(0);
outputs[i] = (a * inputs[i] + b);
}
//+------------------------------------------------------------------+
//| Sigmoid activation function |
//| Parameter 'inputs' weighted sum of initial data |
//| 'a' stretches the range of values of the function |
//| from '0' to 'a' |
//| 'b' shifts the resulting value |
//+------------------------------------------------------------------+
__kernel void SigmoidActivation(__global TYPE* inputs,
__global TYPE* outputs,
const TYPE a, const TYPE b)
{
size_t i = get_global_id(0);
outputs[i] = a / (1 + exp(-inputs[i])) - b;
}
//+------------------------------------------------------------------+
//| Derivative of Sigmoid activation function |
//| Parameter 'outputs' current point (result of feed forward) |
//| 'a' stretches the range of values of the function from |
//| '0' to 'a' |
//| 'b' shifts the resulting value |
//+------------------------------------------------------------------+
__kernel void SigmoidDerivative(__global TYPE* outputs,
__global TYPE* output_gr,
__global TYPE* input_gr,
const TYPE a, const TYPE b
)
{
size_t i = get_global_id(0);
if(a == 0)
input_gr[i] = 0;
else
{
TYPE z = clamp(outputs[i] + b, (TYPE)0, a);
input_gr[i] = z * (1 - z / a) * output_gr[i];
}
}
//+------------------------------------------------------------------+
//| TANH activation function |
//| Parameter 'inputs' weighted sum of initial data |
//+------------------------------------------------------------------+
__kernel void TanhActivation(__global TYPE* inputs,
__global TYPE* outputs)
{
size_t i = get_global_id(0);
outputs[i] = tanh(inputs[i]);
}
//+------------------------------------------------------------------+
//| Derivative of TANH activation function |
//| Parameter 'outputs' current point (result of feed forward) |
//+------------------------------------------------------------------+
__kernel void TanhDerivative(__global TYPE* outputs,
__global TYPE* output_gr,
__global TYPE* input_gr
)
{
size_t i = get_global_id(0);
input_gr[i] = (1 - pow(outputs[i], 2)) * output_gr[i];
}
//+------------------------------------------------------------------+
//| LReLU activation function |
//| Parameter 'inputs' weighted sum of initial data |
//| 'a' leak parameter |
//+------------------------------------------------------------------+
__kernel void LReLUActivation(__global TYPE* inputs,
__global TYPE* outputs,
const TYPE a)
{
size_t i = get_global_id(0);
TYPE value = inputs[i];
outputs[i] = (value > 0 ? value : a * value);
}
//+------------------------------------------------------------------+
//| Derivative of LReLU activation function |
//| Parameter 'outputs' current point (result of feed forward) |
//| 'a' leak parameter |
//+------------------------------------------------------------------+
__kernel void LReLUDerivative(__global TYPE* outputs,
__global TYPE* output_gr,
__global TYPE* input_gr,
const TYPE a)
{
size_t i = get_global_id(0);
input_gr[i] = (outputs[i] > 0 ? (TYPE)1 : a) * output_gr[i];
}
//+------------------------------------------------------------------+
//| Swish activation function |
//| Parameter 'inputs' weighted sum of initial data |
//| 'b' affects the nonlinearity of the function |
//+------------------------------------------------------------------+
__kernel void SwishActivation(__global TYPE* inputs,
__global TYPE* outputs,
const TYPE b)
{
size_t i = get_global_id(0);
TYPE value = inputs[i];
outputs[i] = value / (1 + exp(-b * value));
}
//+------------------------------------------------------------------+
//| Derivative of Swish activation function |
//| Parameter 'outputs' current point (result of feed forward) |
//| 'inputs' weighted sum of initial data |
//| 'b' affects the nonlinearity of the function |
//+------------------------------------------------------------------+
__kernel void SwishDerivative(__global TYPE* outputs,
__global TYPE* output_gr,
__global TYPE* input_gr,
const TYPE b,
__global TYPE* inputs)
{
size_t i = get_global_id(0);
TYPE by = b * outputs[i];
input_gr[i] = (by + (1 - by) / (1 + exp(-b * inputs[i]))) * output_gr[i];
}
//+------------------------------------------------------------------+
//| Transfer 4 elements of TYPE vector to TYPE4 |
//| Parameter 'array' source array of data |
//| 'start' first position to copy |
//| 'step' step between elements to copy |
//| 'size' Size of source array |
//| 'shift' Shift in source array to the 1-st copied element |
//+------------------------------------------------------------------+
TYPE4 ToVect4(__global TYPE *array, int start, int step, int size, int shift)
{
TYPE4 result = (TYPE4)(0, 0, 0, 0);
step = max(1, step);
int st = start * step + shift;
if(st < size)
{
int k = (size - shift + step - 1) / step - start; // elements remaining from position 'start'
switch(k)
{
case 0:
break;
case 1:
result = (TYPE4)(array[st], 0, 0, 0);
break;
case 2:
result = (TYPE4)(array[st], array[st + step], 0, 0);
break;
case 3:
result = (TYPE4)(array[st], array[st + step], array[st + 2 * step], 0);
break;
default:
result = (TYPE4)(array[st], array[st + step], array[st + 2 * step], array[st + 3 * step]);
break;
}
}
return result;
}
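//--- Illustrative call (example values, not from the original source): reading 4
//--- consecutive elements starting at position 8 of a 10-element vector returns
//--- only the remaining tail, i.e. ToVect4(inputs, 8, 1, 10, 0) == (inputs[8], inputs[9], 0, 0)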
//+------------------------------------------------------------------+
//| Maximum of the four components of a TYPE4 vector and a scalar value |
//+------------------------------------------------------------------+
TYPE Max4(TYPE4 vect, TYPE value)
{
TYPE result = fmax(vect.s0, value);
result = fmax(vect.s1, result);
result = fmax(vect.s2, result);
return(fmax(vect.s3, result));
}
//+------------------------------------------------------------------+
//| Transfer TYPE4 to 4 elements of TYPE vector |
//| Parameter 'array' target array of data |
//| 'value' source TYPE4 vector |
//| 'start' first position to copy in target array |
//| 'step' step between elements in target array |
//| 'size' Size of target array |
//| 'shift' Shift in target array to the 1-st copied element |
//+------------------------------------------------------------------+
void D4ToArray(__global TYPE *array, TYPE4 value, int start, int step, int size, int shift)
{
step = max(1, step);
int st = start * step + shift;
if(st < size)
{
int k = (size - shift) % step;
k = (size - shift - k) / step - start + (k > 0 ? 1 : 0);
switch(k)
{
case 3:
array[st + 2 * step] = value.s2;
case 2:
array[st + step] = value.s1;
case 1:
array[st] = value.s0;
break;
default:
array[st + 3 * step] = value.s3;
array[st + 2 * step] = value.s2;
array[st + step] = value.s1;
array[st] = value.s0;
break;
}
}
return;
}
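//--- The 'step' argument lets both helpers address interleaved buffers: for example,
//--- the batch normalization 'options' buffer below keeps 3 values per neuron,
//--- so it is read and written with step = 3 and shift = 0, 1 or 2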
//+------------------------------------------------------------------+
//| Kernel Feed Forward of perceptron |
//| Parameter 'inputs' vector of inputs data |
//| 'weights' matrix of weights |
//| 'outputs' output data |
//| 'inputs_total' size of inputs vector |
//+------------------------------------------------------------------+
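//--- Weight matrix layout used by the kernel: one row of (inputs_total + 1) values
//--- per output neuron, the last element of the row being the bias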
__kernel void PerceptronFeedForward(__global TYPE *inputs,
__global TYPE *weights,
__global TYPE *outputs,
int inputs_total)
{
const int n = get_global_id(0);
const int weights_total = get_global_size(0) * (inputs_total + 1);
int shift = n * (inputs_total + 1);
TYPE s = weights[shift + inputs_total];
for(int i = 0; i < inputs_total; i += 4)
s += dot(ToVect4(inputs, i, 1, inputs_total, 0), ToVect4(weights, i, 1, weights_total, shift));
outputs[n] = s;
}
//+------------------------------------------------------------------+
//| Kernel of calculation output gradients |
//| Parameter 'target' vector of target data |
//| 'outputs' vector of previous FF outputs data |
//| 'gradients' vector of gradients |
//| 'loss_function' type of loss function |
//+------------------------------------------------------------------+
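//--- 'loss_function' codes as implemented in the switch below:
//--- 0 / default - simple error (target - output)
//--- 1 - doubled error 2 * (target - output), the MSE derivative
//--- 2 - cross-entropy style gradient with 1e-37 added for numerical stability
//--- 3 - binary cross-entropy gradient (target - output) / (output * (output - 1))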
__kernel void CalcOutputGradient(__global TYPE *target,
__global TYPE *outputs,
__global TYPE *gradients,
int loss_function)
{
const int n = get_global_id(0);
switch(loss_function)
{
case 0:
gradients[n] = target[n] - outputs[n];
break;
case 1:
gradients[n] = 2 * (target[n] - outputs[n]);
break;
case 2:
gradients[n] = -target[n] / (outputs[n] + 1e-37f) * log(outputs[n] + 1e-37f);
break;
case 3:
gradients[n] = (target[n] - outputs[n]) / (outputs[n] * (outputs[n] - 1) + 1e-37f);
break;
default:
gradients[n] = target[n] - outputs[n];
break;
}
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation to the previous (hidden) layer |
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradient(__global TYPE *gradient_inputs,
__global TYPE *weights,
__global TYPE *gradients,
int outputs_total)
{
const int n = get_global_id(0);
const int inputs_total = get_global_size(0);
int weights_total = (inputs_total + 1) * outputs_total;
//---
TYPE grad = 0;
for(int o = 0; o < outputs_total; o += 4)
grad += dot(ToVect4(gradients, o, 1, outputs_total, 0), ToVect4(weights, o, (inputs_total + 1), weights_total, n));
gradient_inputs[n] = grad;
}
//+------------------------------------------------------------------+
//| Kernel of accumulating error gradients for the weight matrix |
//+------------------------------------------------------------------+
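//--- Accumulates gradients into 'delta_weights' over a batch; the weight update
//--- kernels below apply the accumulated sum and then reset this buffer to zero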
__kernel void CalcDeltaWeights(__global TYPE *inputs,
__global TYPE *delta_weights,
__global TYPE *gradients)
{
const int n = get_global_id(0);
const int outputs_total = get_global_size(0);
const int i = get_global_id(1);
const int inputs_total = get_global_size(1);
//---
TYPE grad = gradients[n];
int shift = n * (inputs_total + 1);
delta_weights[shift + i] = inputs[i] * grad + delta_weights[shift + i];
if(i == 0)
delta_weights[shift + inputs_total] += grad;
}
//+------------------------------------------------------------------+
//| Kernel of weight updating by stochastic gradient descent (SGD) |
//+------------------------------------------------------------------+
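//--- Update rule implemented below (per 4-element block of weights):
//--- w = w - (Lambda1 + Lambda2 * w) + (learningRate / batch_size) * delta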
__kernel void SGDUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
int total,
int batch_size,
TYPE learningRate,
TYPE Lambda1,
TYPE Lambda2
)
{
int start = 4 * get_global_id(0);
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0);
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE lr = learningRate / ((TYPE)batch_size);
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
weights4 += (TYPE4)(lr) * delta4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of weight updating by SGD with momentum |
//+------------------------------------------------------------------+
__kernel void MomentumUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentum,
int total, int batch_size,
TYPE learningRate,
TYPE beta,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
momentum4 = (TYPE4)(learningRate) * delta4 + (TYPE4)(beta) * momentum4;
weights4 += momentum4;
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentum, momentum4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of weight updating by the AdaGrad method |
//+------------------------------------------------------------------+
__kernel void AdaGradUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentum,
int total, int batch_size,
TYPE learningRate,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
//---
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
momentum4 = momentum4 + pow(delta4, 2);
weights4 += delta4 * learningRate / sqrt(momentum4 + 1.0e-37f);
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentum, momentum4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of weight updating by the RMSProp method |
//+------------------------------------------------------------------+
__kernel void RMSPropUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentum,
int total, int batch_size,
TYPE learningRate,
TYPE beta,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentum4 = ToVect4(momentum, start, 1, total, 0);
//---
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
momentum4 = beta * momentum4 + (1 - beta) * pow(delta4, 2);
weights4 += delta4 * learningRate / (sqrt(momentum4) + 1.0e-37f);
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentum, momentum4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of weight updating by the AdaDelta method |
//+------------------------------------------------------------------+
__kernel void AdaDeltaUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentumW,
__global TYPE *momentumG,
int total, int batch_size,
TYPE beta1, TYPE beta2,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentumW4 = ToVect4(momentumW, start, 1, total, 0);
TYPE4 momentumG4 = ToVect4(momentumG, start, 1, total, 0);
//---
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
momentumW4 = beta1 * momentumW4 + (1 - beta1) * pow(weights4, 2);
momentumG4 = beta2 * momentumG4 + (1 - beta2) * pow(delta4, 2);
weights4 += delta4 * sqrt(momentumW4) / (sqrt(momentumG4) + 1.0e-37f);
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentumW, momentumW4, start, 1, total, 0);
D4ToArray(momentumG, momentumG4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of weight updating by the Adam method |
//+------------------------------------------------------------------+
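//--- Update rule implemented below (simplified bias correction, as coded):
//--- m = beta1 * m + (1 - beta1) * delta
//--- v = beta2 * v + (1 - beta2) * delta^2
//--- w = w - (Lambda1 + Lambda2 * w) + learningRate * (m / (1 - beta1)) / (sqrt(v / (1 - beta2)) + 1e-37)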
__kernel void AdamUpdate(__global TYPE *delta_weights,
__global TYPE *weights,
__global TYPE *momentumM,
__global TYPE *momentumV,
int total, int batch_size,
TYPE learningRate,
TYPE beta1, TYPE beta2,
TYPE Lambda1, TYPE Lambda2)
{
int start = 4 * get_global_id(0);
//---
TYPE4 delta4 = ToVect4(delta_weights, start, 1, total, 0) / ((TYPE4)(batch_size));
TYPE4 weights4 = ToVect4(weights, start, 1, total, 0);
TYPE4 momentumM4 = ToVect4(momentumM, start, 1, total, 0);
TYPE4 momentumV4 = ToVect4(momentumV, start, 1, total, 0);
//---
momentumM4 = beta1 * momentumM4 + (1 - beta1) * delta4;
momentumV4 = beta2 * momentumV4 + (1 - beta2) * pow(delta4, 2);
TYPE4 m = momentumM4 / (1 - beta1);
TYPE4 v = momentumV4 / (1 - beta2);
weights4 -= (TYPE4)(Lambda1) + Lambda2 * weights4;
weights4 += learningRate * m / (sqrt(v) + 1.0e-37f);
D4ToArray(weights, weights4, start, 1, total, 0);
D4ToArray(momentumM, momentumM4, start, 1, total, 0);
D4ToArray(momentumV, momentumV4, start, 1, total, 0);
D4ToArray(delta_weights, (TYPE4)(0), start, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Feed-forward kernel of the convolutional layer |
//| Parameters: 'inputs' input data vector |
//| 'weights' weight matrix |
//| 'outputs' vector of results |
//| 'inputs_total' size of input data vector |
//| 'window' size of input data analysis window |
//| 'step' window moving step |
//| 'window_out' number of filters |
//+------------------------------------------------------------------+
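//--- Weight layout: (window + 1) values per filter, the last one being the bias.
//--- 'transposed_out' selects the output layout: 1 - results of all filters for one
//--- window position are stored together, 0 - results are grouped by filter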
__kernel void ConvolutionFeedForward(__global TYPE *inputs,
__global TYPE *weights,
__global TYPE *outputs,
int inputs_total,
int window,
int step,
int window_out,
int transposed_out)
{
const int n = get_global_id(0);
const int neurons = get_global_size(0);
const int weights_total = (window + 1) * window_out;
int shift = n * step;
for(int w = 0; w < window_out; w++)
{
int out = (transposed_out == 1 ? w + n * window_out : w * neurons + n);
int shift_weights = w * (window + 1) ;
if((shift_weights + window) >= weights_total)
break;
TYPE s = weights[shift_weights + window];
for(int i = 0; i < window; i += 4)
s += dot(ToVect4(inputs, i, 1, inputs_total, shift),
ToVect4(weights, i, 1, shift_weights + window, shift_weights));
outputs[out] = s;
}
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation through the convolutional layer |
//+------------------------------------------------------------------+
__kernel void ConvolutionCalcHiddenGradient(__global TYPE *gradient_inputs,
__global TYPE *weights,
__global TYPE *gradients,
int window,
int step,
int window_out,
int neurons,
int transposed_out)
{
const int n = get_global_id(0);
const int inputs_total = get_global_size(0);
int weights_total = (window + 1) * window_out;
//---
TYPE grad = 0;
int w_start = n % step;
int r_start = max((n - window + step) / step, 0);
int total = (window - w_start + step - 1) / step;
total = min((n + step) / step, total);
for(int i = 0; i < total; i ++)
{
int row = r_start + i;
if(row >= neurons)
break;
for(int wo = 0; wo < window_out; wo++)
{
int shift_g = (transposed_out == 1 ? row * window_out + wo : row + wo * neurons);
int shift_w = w_start + (total - i - 1) * step + wo * (window + 1);
grad += gradients[shift_g] * weights[shift_w];
}
}
gradient_inputs[n] = grad;
}
//+------------------------------------------------------------------+
//| Kernel of accumulating weight gradients of the convolutional layer |
//+------------------------------------------------------------------+
__kernel void ConvolutionCalcDeltaWeights(__global TYPE *inputs,
__global TYPE *delta_weights,
__global TYPE *gradients,
int inputs_total,
int step,
int neurons,
int transposed_out)
{
const int inp_w = get_global_id(0);
const int w = get_global_id(1);
const int window = get_global_size(0) - 1;
const int window_out = get_global_size(1);
//---
int shift_delt = w * (window + 1) + inp_w;
TYPE value = 0;
if(inp_w == window)
{
for(int n = 0; n < neurons; n ++)
value += gradients[transposed_out == 1 ? w + n*window_out : w * neurons + n];
}
else
for(int n = 0; n < neurons; n ++)
{
int shift_inp = n * step + inp_w;
if(shift_inp >= inputs_total)
break;
value += inputs[shift_inp] * gradients[transposed_out == 1 ? w + n*window_out : w * neurons + n];
}
delta_weights[shift_delt] += value;
}
//+------------------------------------------------------------------+
//| Feed-forward kernel of the pooling layer |
//| Parameters: 'inputs' input data vector |
//| 'outputs' vector of results |
//| 'inputs_total' size of input data vector |
//| 'input_neurons' vector size of 1st input data filter |
//| 'window' size of input data analysis window |
//| 'step' window moving step |
//| 'activation' type of activation function |
//+------------------------------------------------------------------+
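//--- 'activation' selects the pooling operation as implemented below:
//--- 0 - average pooling (dot product with 1/window), 1 - max pooling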
__kernel void ProofFeedForward(__global TYPE *inputs,
__global TYPE *outputs,
int inputs_total,
int input_neurons,
int window,
int step,
int activation)
{
const int n = get_global_id(0);
const int w = get_global_id(1);
const int neurons = get_global_size(0);
const int window_out = get_global_size(1);
int shift = n * step;
int out = w * neurons + n;
int shift_inp = w * input_neurons;
TYPE s = 0;
TYPE k = (TYPE)1 / (TYPE)window;
TYPE4 k4 = (TYPE4)(k);
for(int i = 0; i < window; i += 4)
switch(activation)
{
case 0:
s += dot(ToVect4(inputs, i, 1, min(shift_inp + input_neurons, inputs_total), shift_inp + shift),
k4);
break;
case 1:
s = Max4(ToVect4(inputs, i, 1, min(shift_inp + input_neurons, inputs_total), shift_inp + shift), s);
break;
default:
break;
}
outputs[out] = s;
}
//+------------------------------------------------------------------+
//| Backpropagation kernel of the pooling layer |
//| Parameters: 'inputs' input data vector |
//| 'gradient_inputs' previous layer gradients vector |
//| 'outputs' vector of results |
//| 'gradients' current layer gradients vector |
//| 'inputs_total' size of input data vector |
//| 'outputs_total' size of outputs vector |
//| 'window' size of input data analysis window |
//| 'step' window moving step |
//| 'neurons' vector size of 1st filter of outputs |
//| 'activation' type of activation function |
//+------------------------------------------------------------------+
__kernel void ProofCalcHiddenGradient(__global TYPE *inputs,
__global TYPE *gradient_inputs,
__global TYPE *outputs,
__global TYPE *gradients,
int inputs_total,
int outputs_total,
int window,
int step,
int neurons,
int activation)
{
const int n = get_global_id(0);
const int w = get_global_id(1);
const int input_neurons = get_global_size(0);
const int window_out = get_global_size(1);
//---
int start = max((n - window + step) / step, 0);
int stop = min((n + step - 1) / step + 1, neurons);
TYPE grad = 0;
int shift_inp = w * input_neurons + n;
if(shift_inp >= inputs_total)
return;
TYPE inp = inputs[shift_inp];
int shift_out = w * neurons;
for(int o = start; o < stop; o ++)
{
int shift_g = shift_out + o;
if(shift_g >= outputs_total)
break;
switch(activation)
{
case 0:
grad += gradients[shift_g] / (TYPE)window;
break;
case 1:
grad += (outputs[shift_g] == inp ? gradients[shift_g] : 0);
break;
default:
break;
}
}
gradient_inputs[shift_inp] = grad;
}
//+------------------------------------------------------------------+
//| LSTM block feed-forward kernel |
//| Parameters: 'forgetgate' forget gate |
//| 'inputgate' input gate |
//| 'outputgate' output gate |
//| 'newcontent' new content |
//| 'memory' memory stream |
//| 'hiddenstate' hidden state stream |
//| 'outputs_total' number of elements in data stream |
//+------------------------------------------------------------------+
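//--- Computation implemented below (element-wise over the data stream):
//--- memory = memory * forgetgate + inputgate * newcontent
//--- hiddenstate = tanh(memory) * outputgate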
__kernel void LSTMFeedForward(__global TYPE *forgetgate,
__global TYPE *inputgate,
__global TYPE *outputgate,
__global TYPE *newcontent,
__global TYPE *memory,
__global TYPE *hiddenstate,
int outputs_total)
{
const int n = get_global_id(0);
const int shift = n * 4;
TYPE4 fg = ToVect4(forgetgate, shift, 1, outputs_total, 0);
TYPE4 ig = ToVect4(inputgate, shift, 1, outputs_total, 0);
TYPE4 og = ToVect4(outputgate, shift, 1, outputs_total, 0);
TYPE4 nc = ToVect4(newcontent, shift, 1, outputs_total, 0);
TYPE4 mem = ToVect4(memory, shift, 1, outputs_total, 0);
//---
TYPE4 temp = mem * fg;
temp += ig * nc;
D4ToArray(memory, temp, shift, 1, outputs_total, 0);
temp = tanh(temp) * og;
D4ToArray(hiddenstate, temp, shift, 1, outputs_total, 0);
}
//+------------------------------------------------------------------+
//| LSTM block backpropagation kernel |
//| Parameters: 'outputs' vector of outputs |
//| 'gradients' current layer gradients vector |
//| 'inputgate' input gate |
//| 'outputgate' output gate |
//| 'newcontent' new content |
//| 'memory' memory stream |
//| 'fg_gradients' forget gate gradient |
//| 'ig_gradients' input gate gradient |
//| 'og_gradients' output gate gradient |
//| 'nc_gradients' new content gradient |
//| 'outputs_total' size of outputs vector |
//+------------------------------------------------------------------+
__kernel void LSTMCalcHiddenGradient(__global TYPE *outputs,
__global TYPE *gradients,
__global TYPE *inputgate,
__global TYPE *outputgate,
__global TYPE *newcontent,
__global TYPE *memory,
__global TYPE *fg_gradients,
__global TYPE *ig_gradients,
__global TYPE *og_gradients,
__global TYPE *nc_gradients,
int outputs_total)
{
const int n = get_global_id(0);
int shift = n * 4;
//---
TYPE4 out = ToVect4(outputs, shift, 1, outputs_total, 0);
TYPE4 grad = ToVect4(gradients, shift, 1, outputs_total, 0);
TYPE4 ig = ToVect4(inputgate, shift, 1, outputs_total, 0);
TYPE4 og = ToVect4(outputgate, shift, 1, outputs_total, 0);
TYPE4 nc = ToVect4(newcontent, shift, 1, outputs_total, 0);
TYPE4 mem = ToVect4(memory, shift, 1, outputs_total, 0);
//---
TYPE4 m = out / (og + 1.0e-37f);
//--- OutputGate gradient
TYPE4 temp = grad * m;
D4ToArray(og_gradients, temp, shift, 1, outputs_total, 0);
//--- memory gradient, adjusted for the TANH derivative
grad = grad * og * (1 - pow(m, 2));
//--- InputGate gradient
temp = grad * nc;
D4ToArray(ig_gradients, temp, shift, 1, outputs_total, 0);
//--- NewContent gradient
temp = grad * ig;
D4ToArray(nc_gradients, temp, shift, 1, outputs_total, 0);
//--- ForgetGates gradient
temp = grad * mem;
D4ToArray(fg_gradients, temp, shift, 1, outputs_total, 0);
}
//+------------------------------------------------------------------+
//| Kernel calculating Self-Attention block dependency coefficients |
//| Parameters: 'querys' tensor of queries |
//| 'keys' tensor of keys |
//| 'values' tensor of values |
//| 'scores' matrix of dependency coefficients |
//| 'mask' flag enabling causal masking of coefficients |
//| 'outputs' tensor of output data |
//| 'window' size of input data analysis window |
//| 'key_size' size of key vector for one element |
//+------------------------------------------------------------------+
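//--- Computation: Score = SoftMax(Q * K' / sqrt(key_size)) row by row, then the output
//--- is the Score-weighted sum of Values; with 'mask' > 0 the positions after the
//--- current query (s > q) are excluded (causal masking)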
__kernel void AttentionFeedForward(__global TYPE *querys,
__global TYPE *keys,
__global TYPE *scores,
__global TYPE *values,
__global TYPE *outputs,
int window,
int key_size,
int mask)
{
const int q = get_global_id(0);
const int units = get_global_size(0);
const int h = get_global_id(1);
const int heads = get_global_size(1);
int shift_query = key_size * (q * heads + h);
int shift_scores = units * (q * heads + h);
TYPE summ = 0;
for(int s = 0; s < units; s++)
{
TYPE score = 0;
if(mask > 0 && s > q)
{
scores[shift_scores + s] = score;
continue;
}
int shift_key = key_size * (s * heads + h);
for(int k = 0; k < key_size; k ++)
score += querys[shift_query + k] * keys[shift_key + k];
score = exp(score / sqrt((TYPE)key_size));
summ += score;
scores[shift_scores + s] = score;
}
for(int s = 0; s < units; s++)
scores[shift_scores + s] /= summ;
//---
shift_query = window * (q * heads + h);
for(int i = 0; i < window; i++)
{
TYPE query = 0;
for(int v = 0; v < units; v++)
query += values[window * (v * heads + h) + i] * scores[shift_scores + v];
outputs[shift_query + i] = query;
}
}
//+------------------------------------------------------------------+
//| Kernel to propagate gradient inside the Self-Attention block |
//| up to the level of dependency coefficient matrix Score |
//| Parameters: 'values' tensor of values |
//| 'values_grad' tensor of gradients at level of values |
//| 'scores' matrix of dependency coefficients |
//| 'scores_grad' gradients at the dependency coefficient level |
//| 'scores_temp' buffer of intermediate Score gradients |
//| 'outputs_grad' tensor of gradients at output level |
//| 'window' size of input data analysis window |
//+------------------------------------------------------------------+
__kernel void AttentionCalcScoreGradient(__global TYPE *scores,
__global TYPE *scores_grad,
__global TYPE *values,
__global TYPE *values_grad,
__global TYPE *outputs_grad,
__global TYPE *scores_temp,
int window)
{
const int q = get_global_id(0);
const int units = get_global_size(0);
const int h = get_global_id(1);
const int heads = get_global_size(1);
int shift_value = window * (q * heads + h);
int shift_score = units * (q * heads + h);
//--- Gradient propagation to Values
for(int i = 0; i < window; i ++)
{
TYPE grad = 0;
for(int g = 0; g < units; g++)
grad += scores[units * (g * heads + h) + q] * outputs_grad[window * (g * heads + h) + i];
values_grad[shift_value + i] = grad;
}
//--- Gradient propagation to Score
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
for(int i = 0; i < window; i++)
grad += outputs_grad[shift_value + i] * values[window * (k * heads + h) + i];
scores_temp[shift_score + k] = grad;
}
//--- Adjust by Softmax derivative
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
TYPE score = scores[shift_score + k];
for(int i = 0; i < units; i++)
grad += scores[shift_score + i] * ((int)(i == k) - score) * scores_temp[shift_score + i];
scores_grad[shift_score + k] = grad;
}
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation from Score level to Querys and Keys |
//+------------------------------------------------------------------+
__kernel void AttentionCalcHiddenGradient(__global TYPE *querys,
__global TYPE *querys_grad,
__global TYPE *keys,
__global TYPE *keys_grad,
__global TYPE *scores_grad,
int key_size)
{
const int q = get_global_id(0);
const int units = get_global_size(0);
const int h = get_global_id(1);
const int heads = get_global_size(1);
int shift_query = key_size * (q * heads + h);
int shift_score = units * (q * heads + h);
//--- Gradient propagation to Querys and Keys
const TYPE k = 1 / sqrt((TYPE)key_size);
//---
for(int i = 0; i < key_size; i++)
{
TYPE grad_q = 0;
TYPE grad_k = 0;
for(int s = 0; s < units; s++)
{
grad_q += keys[key_size * (s * heads + h) + i] * scores_grad[shift_score + s];
grad_k += querys[key_size * (s * heads + h) + i] * scores_grad[units * (s * heads + h) + q];
}
querys_grad[shift_query + i] = grad_q * k;
keys_grad[shift_query + i] = grad_k * k;
}
}
//+------------------------------------------------------------------+
//| Feed-forward kernel of the GPT block |
//+------------------------------------------------------------------+
__kernel void GPTFeedForward(__global TYPE *querys,
__global TYPE *keys,
__global TYPE *scores,
__global TYPE *values,
__global TYPE *outputs,
int key_size,
int units,
int current)
{
const int h = get_global_id(0);
const int heads = get_global_size(0);
int shift_query = key_size * h;
int shift_scores = units * h;
TYPE summ = 0;
for(int s = 0; s < units; s++)
{
TYPE score = 0;
int shift_key = key_size * (s * heads + h);
for(int k = 0; k < key_size; k ++)
{
if(s == current)
keys[shift_key + k] = querys[shift_query + k + heads * key_size];
score += querys[shift_query + k] * keys[shift_key + k];
}
score = exp(score / sqrt((TYPE)key_size));
summ += score;
scores[shift_scores + s] = score;
}
for(int s = 0; s < units; s++)
scores[shift_scores + s] /= summ;
//---
shift_query = key_size * h;
for(int i = 0; i < key_size; i++)
{
TYPE query = 0;
for(int v = 0; v < units; v++)
{
if(v == current)
values[key_size * (v * heads + h) + i] = querys[(2 * heads + h) * key_size + i];
query += values[key_size * (v * heads + h) + i] * scores[shift_scores + v];
}
outputs[shift_query + i] = query;
}
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation in the GPT block to the Score level |
//+------------------------------------------------------------------+
__kernel void GPTCalcScoreGradient(__global TYPE *scores,
__global TYPE *scores_grad,
__global TYPE *values,
__global TYPE *values_grad,
__global TYPE *outputs_grad,
__global TYPE *scores_temp,
int window,
int units,
int current)
{
const int h = get_global_id(0);
const int heads = get_global_size(0);
int shift_value = window * (2 * heads + h);
int shift_score = units * h;
//--- Gradient propagation to Values
for(int i = 0; i < window; i ++)
values_grad[shift_value + i] = scores[units * h + current] * outputs_grad[window * h + i];
//--- Gradient propagation to Score
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
for(int i = 0; i < window; i++)
grad += outputs_grad[shift_value + i] * values[window * (k * heads + h) + i];
scores_temp[shift_score + k] = grad;
}
//--- Adjust by Softmax derivative
for(int k = 0; k < units; k++)
{
TYPE grad = 0;
TYPE score = scores[shift_score + k];
for(int i = 0; i < units; i++)
grad += scores[shift_score + i] * ((int)(i == k) - score) * scores_temp[shift_score + i];
scores_grad[shift_score + k] = grad;
}
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation in the GPT block to Querys and Keys |
//+------------------------------------------------------------------+
__kernel void GPTCalcHiddenGradient(__global TYPE *querys,
__global TYPE *querys_grad,
__global TYPE *keys,
__global TYPE *scores_grad,
int key_size,
int units,
int current)
{
const int h = get_global_id(0);
const int heads = get_global_size(0);
int shift_query = key_size * h;
int shift_key = key_size * (heads + h);
int shift_score = units * h;
//--- Gradient propagation to Querys and Keys
const TYPE k = 1 / sqrt((TYPE)key_size);
//---
for(int i = 0; i < key_size; i++)
{
TYPE grad_q = 0;
TYPE grad_k = 0;
for(int s = 0; s < units; s++)
{
grad_q += keys[key_size * (s * heads + h) + i] * scores_grad[shift_score + s];
if(s == current)
grad_k += querys[key_size * h + i] * scores_grad[units * h + current];
}
querys_grad[shift_query + i] = grad_q * k;
querys_grad[shift_key + i] = grad_k * k;
}
}
//+------------------------------------------------------------------+
//| Feed-forward kernel of the batch normalization layer |
//+------------------------------------------------------------------+
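//--- Buffer layout used below: 'options' keeps 3 values per neuron (mean, variance,
//--- normalized value), 'weights' keeps 2 values per neuron (scale and shift)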
__kernel void BatchNormFeedForward(__global TYPE *inputs,
__global TYPE *options,
__global TYPE *weights,
__global TYPE *output,
int batch,
int total)
{
int n = get_global_id(0);
if(batch <= 1)
{
D4ToArray(output, ToVect4(inputs, n * 4, 1, total, 0), n * 4, 1, total, 0);
return;
}
int shift = n * 4;
int shift_options = n * 3 * 4;
int shift_weights = n * 2 * 4;
//---
TYPE4 inp = ToVect4(inputs, shift, 1, total, 0);
TYPE4 mean = ToVect4(options, shift, 3, total * 3, 0) * ((TYPE)batch - 1) + inp ;
if(options[shift_options ] > 0 && options[shift_options + 1] > 0)
mean /= (TYPE4)batch;
TYPE4 delt = inp - mean;
TYPE4 variance = ToVect4(options, shift, 3, total * 3, 1) * ((TYPE)batch - 1) + pow(delt, 2);
if(options[shift_options + 1] > 0)
variance /= (TYPE4)batch;
TYPE4 nx = delt / sqrt(variance + 1e-37f);
//---
if(weights[shift_weights] == 0)
D4ToArray(weights, (TYPE4)1, shift, 2, total * 2, 0);
//---
TYPE4 res = ToVect4(weights, shift, 2, total * 2, 0) * nx + ToVect4(weights, shift, 2, total * 2, 1);
//---
D4ToArray(options, mean, shift, 3, total * 3, 0);
D4ToArray(options, variance, shift, 3, total * 3, 1);
D4ToArray(options, nx, shift, 3, total * 3, 2);
D4ToArray(output, res, shift, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation through the batch normalization layer |
//+------------------------------------------------------------------+
__kernel void BatchNormCalcHiddenGradient(__global TYPE *options,
__global TYPE *gradient,
__global TYPE *inputs,
__global TYPE *gradient_inputs,
__global TYPE *weights,
int batch,
int total
)
{
int n = get_global_id(0);
int shift = n * 4;
if(batch <= 1)
{
D4ToArray(gradient_inputs, ToVect4(gradient, shift, 1, total, 0), shift, 1, total, 0);
return;
}
//---
TYPE4 inp = ToVect4(inputs, shift, 1, total, 0);
TYPE4 gnx = ToVect4(gradient, shift, 1, total, 0) * ToVect4(weights, shift, 2, total * 2, 0);
TYPE4 temp = 1 / sqrt(ToVect4(options, shift, 3, total * 3, 1) + 1e-37f);
TYPE4 delt = inp - ToVect4(options, shift, 3, total * 3, 0);
TYPE4 gvar = delt / (-2 * pow(ToVect4(options, shift, 3, total * 3, 1) + 1.0e-37f, 3.0f / 2.0f)) * gnx;
TYPE4 gmu = (-temp) * gnx - gvar * 2 * delt / (TYPE4)batch;
TYPE4 gx = temp * gnx + gmu / (TYPE4)batch + gvar * 2 * delt / (TYPE4)batch;
//---
D4ToArray(gradient_inputs, gx, shift, 1, total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of accumulating weight gradients of batch normalization |
//+------------------------------------------------------------------+
__kernel void BatchNormCalcDeltaWeights(__global TYPE *options,
__global TYPE *delta_weights,
__global TYPE *gradients)
{
const int n = get_global_id(0);
int shift_options = n * 3;
int shift_weights = n * 2;
//---
TYPE grad = gradients[n];
delta_weights[shift_weights] += grad * options[shift_options + 2];
delta_weights[shift_weights + 1] += grad;
}
//+------------------------------------------------------------------+
//| Kernel of element-wise multiplication of a buffer by a mask |
//+------------------------------------------------------------------+
__kernel void MaskMult(__global TYPE *inputs,
__global TYPE *mask,
__global TYPE *outputs,
int outputs_total)
{
const int n = get_global_id(0) * 4;
//---
TYPE4 out = ToVect4(inputs, n, 1, outputs_total, 0) * ToVect4(mask, n, 1, outputs_total, 0);
D4ToArray(outputs, out, n, 1, outputs_total, 0);
}
//+------------------------------------------------------------------+
//| Kernel of element-wise addition of two buffers |
//+------------------------------------------------------------------+
__kernel void Sum(__global TYPE *inputs1,
__global TYPE *inputs2,
__global TYPE *outputs)
{
const int n = get_global_id(0);
//---
outputs[n] = inputs1[n] + inputs2[n];
}
//+------------------------------------------------------------------+
//| Feed-forward kernel of layer normalization |
//+------------------------------------------------------------------+
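//--- One work-group normalizes the whole vector: the mean and the standard deviation
//--- are computed through reductions in local memory, and the standard deviation is
//--- stored in 'stds' for the backward pass; the work-group size must not exceed LOCAL_SIZE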
__kernel void LayerNormalize(__global TYPE* inputs,
__global TYPE* outputs,
__global TYPE* stds,
const int total,
const int std_shift)
{
uint i = (uint)get_global_id(0);
uint l = (uint)get_local_id(0);
uint ls = min((uint)get_local_size(0), (uint)LOCAL_SIZE);
__local TYPE temp[LOCAL_SIZE];
//---
uint count = 0;
do
{
uint shift = count * ls + l;
temp[l] = (count > 0 ? temp[l] : 0) + (shift < total ? inputs[shift] : 0);
count++;
}
while((count * ls + l) < total);
temp[l] /= (TYPE)total;
barrier(CLK_LOCAL_MEM_FENCE);
count = ls;
do
{
count = (count + 1) / 2;
temp[l] += (l < count ? temp[l + count] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
}
while(count > 1);
//---
TYPE mean = (TYPE) temp[0];
//---
count = 0;
do
{
uint shift = count * ls + l;
temp[l] = (count > 0 ? temp[l] : 0) + (shift < total ? (TYPE)pow(inputs[shift] - mean, 2) : 0);
count++;
}
while((count * ls + l) < total);
temp[l] /= (TYPE)total;
barrier(CLK_LOCAL_MEM_FENCE);
count = ls;
do
{
count = (count + 1) / 2;
temp[l] += (l < count ? temp[l + count] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
}
while(count > 1);
//---
TYPE std = (TYPE)sqrt(temp[0]);
if(l == 0)
stds[std_shift] = std;
count = 0;
while((count * ls + l) < total)
{
uint shift = count * ls + l;
outputs[shift] = (inputs[shift] - mean) / (std + 1e-37f);
count++;
}
}
//+------------------------------------------------------------------+
//| Kernel of gradient propagation through layer normalization |
//+------------------------------------------------------------------+
__kernel void LayerNormalizeGradient(__global TYPE* outputs,
__global TYPE* out_gradient,
__global TYPE* inp_gradient,
__global TYPE* stds,
const int total,
const int std_shift)
{
uint i = (uint)get_global_id(0);
uint l = (uint)get_local_id(0);
//---
uint ls = min((uint)get_local_size(0), (uint)LOCAL_SIZE);
__local TYPE dSTD[LOCAL_SIZE];
__local TYPE dMean1[LOCAL_SIZE];
__local TYPE dMean2[LOCAL_SIZE];
uint count = 0;
do
{
uint shift = count * ls + l;
dSTD[l] = (count > 0 ? dSTD[l] : 0) - (shift < total ? out_gradient[shift] * outputs[shift] / (2 * (pow(stds[std_shift], (TYPE)2) + 1e-37f)) : 0);
dMean1[l] = (count > 0 ? dMean1[l] : 0) - (shift < total ? out_gradient[shift] / (stds[std_shift] + 1e-37f) : 0);
dMean2[l] = (count > 0 ? dMean2[l] : 0) - (shift < total ? 2 * outputs[shift] * stds[std_shift] / (TYPE)total : 0);
count++;
}
while((count * ls + l) < total);
barrier(CLK_LOCAL_MEM_FENCE);
count = ls;
do
{
count = (count + 1) / 2;
dSTD[l] += (l < count ? dSTD[l + count] : 0);
dMean1[l] += (l < count ? dMean1[l + count] : 0);
dMean2[l] += (l < count ? dMean2[l + count] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
}
while(count > 1);
//---
TYPE dstd = dSTD[0];
TYPE dmean = dMean1[0] + dstd * dMean2[0];
//---
count = 0;
while((count * ls + l) < total)
{
uint shift = count * ls + l;
inp_gradient[shift] = out_gradient[shift] / (stds[std_shift] + 1e-32f) + (2 * dstd * outputs[shift] * stds[std_shift] + dmean) / total;
count++;
}
}
//+------------------------------------------------------------------+
//| SoftMax activation function |
//+------------------------------------------------------------------+
__kernel void SoftMaxActivation(__global TYPE* inputs,
__global TYPE* outputs,
const ulong total)
{
uint i = (uint)get_global_id(0);
uint l = (uint)get_local_id(0);
uint h = (uint)get_global_id(1);
uint ls = min((uint)get_local_size(0), (uint)LOCAL_SIZE);
//---
__local TYPE temp[LOCAL_SIZE];
uint count = 0;
for(count = l; (count < total && l < ls); count += ls)
{
uint shift = h * total + count;
temp[l] = (count > l ? temp[l] : 0) + exp(inputs[shift]);
}
barrier(CLK_LOCAL_MEM_FENCE);
count = ls;
do
{
count = (count + 1) / 2;
temp[l] += (l < count && (l + count) < ls ? temp[l + count] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
}
while(count > 1);
//---
TYPE sum = temp[0];
for(count = l; count < total; count += ls)
{
uint shift = h * total + count;
outputs[shift] = exp(inputs[shift]) / (sum + 1e-37f);
}
}
//+------------------------------------------------------------------+
//| Derivative of SoftMax activation function |
//| Parameter 'outputs' vector of previous FF outputs data |
//| 'output_gr' vector of gradients at the output level |
//| 'input_gr' vector of resulting gradients |
//+------------------------------------------------------------------+
__kernel void SoftMaxDerivative(__global TYPE* outputs,
__global TYPE* output_gr,
__global TYPE* input_gr)
{
size_t i = get_global_id(0);
size_t outputs_total = get_global_size(0);
size_t shift = get_global_id(1) * outputs_total;
TYPE output = outputs[shift + i];
TYPE result = 0;
for(int j = 0; j < outputs_total; j++)
result += output * (i == j ? 1 - output : -outputs[shift + j]) * output_gr[shift + j];
input_gr[shift + i] = result;
}
//+------------------------------------------------------------------+
//| Kernel of splitting one data buffer into two |
//+------------------------------------------------------------------+
__kernel void Split(__global TYPE* source,
__global TYPE* target1,
__global TYPE* target2,
const int total_source,
const int total_target1,
const int total_target2
)
{
int n = get_global_id(0);
int total = get_global_size(0);
for(int i = n; i < total_source; i += total)
{
if(i < total_target1)
target1[i] = source[i];
else
{
int t2 = i - total_target1;
if(t2 < total_target2)
target2[t2] = source[i];
}
}
}
//+------------------------------------------------------------------+
//| Kernel of concatenating two data buffers into one |
//+------------------------------------------------------------------+
__kernel void Concatenate(__global TYPE* source1,
__global TYPE* source2,
__global TYPE* target,
const int total_source1,
const int total_source2,
const int total_target
)
{
int n = get_global_id(0);
int total = get_global_size(0);
for(int i = n; i < total_target; i += total)
{
if(i < total_source1)
target[i] = source1[i];
else
{
int t2 = i - total_source1;
if(t2 < total_source2)
target[i] = source2[t2];
}
}
}
//+------------------------------------------------------------------+