oslib/tst/PST/P2_MQL5SVM_7603/MQL5/Include/CSOM/CSOM.mqh

//+------------------------------------------------------------------+
//|                                                         CSOM.mqh |
//|                               Copyright (c) 2018-2019, Marketeer |
//|                          https://www.mql5.com/en/users/marketeer |
//|                           https://www.mql5.com/ru/articles/5472/ |
//|                           https://www.mql5.com/ru/articles/5473/ |
//+------------------------------------------------------------------+
#include <CSOM/CSOMNode.mqh>
#include <Math/Alglib/dataanalysis.mqh>
#define EXTRA_DIMENSIONS 5
#define DIM_HITCOUNT (m_dimension + 0)
#define DIM_UMATRIX (m_dimension + 1)
#define DIM_NODEMSE (m_dimension + 2) // quantization errors per node: average variance (square of standard deviation)
#define DIM_CLUSTERS (m_dimension + 3)
#define DIM_OUTPUT (m_dimension + 4)
#define NBH_SQUARE_SIZE 4
#define NBH_HEXAGONAL_SIZE 6
#define KMEANS_RETRY_NUMBER 10
#define FILE_EXT_SOM ".som"
#define FILE_EXT_CSV ".csv"
const string extras[EXTRA_DIMENSIONS] = {"HitCount", "U-matrix", "Error", "Clusters", "Output"};
class CSOM
{
protected:
// data structure
int m_xcells; // number of map cells (nodes) by x
int m_ycells; // and by y
CSOMNode m_node[]; // array of Kohonen map nodes
int m_nSet; // number of data records
double m_set[]; // array of data records
double m_max[]; // max values in every dimension
double m_min[]; // min values in every dimension
double m_mean[];
double m_sigma[];
int m_dimension; // number of elements (columns) in every record = number of planes in the map
// special additional dimensions are added automatically, such as:
// Population (hit count), U-matrix (distances), Clusters, etc
double m_dataMSE;
bool m_initDone;
bool m_allocated;
double m_map_radius;
double m_time_constant;
double m_learning_rate;
int m_iterations;
string m_titles[];
string m_set_titles[];
bool m_hexCells; // hexagonal cells
string m_sID; // object name prefix - TODO: optional setter needed
double m_clusters[];
string m_labels[];
int m_validationOffset;
ulong m_featureMask;
int m_featureMaskSize;
bool m_reframing;
protected:
virtual bool ReadCSVData(const int h);
virtual int GetBestMatchingIndexNormalized(const double &vector[]) const;
virtual int GetBestMatchingIndex(double &vector[]) const;
virtual void InitNormalization(const bool normalization = true);
virtual bool RemoveOutliers();
virtual void Normalize(double &vector[]) const;
virtual void Denormalize(double &vector[]) const;
virtual double AddPatternStats(const double &vector[], const bool complete = true);
virtual void AnalizePatternStats();
virtual double CalculateStats(const bool complete = true);
virtual bool ResetNodes();
virtual void CalculateDataMSE(const bool complete = true);
public:
CSOM();
~CSOM();
virtual bool Init(const int xc, const int yc, const bool bhex = true);
virtual void Reset();
// Data loading from csv (first line is a header, first column contains records' labels)
bool LoadPatterns(const string filename);
void Shuffle(); // random shuffling of patterns is important when validation is enabled
void AddPattern(const double &vector[], const string title);
bool GetPattern(const int index, double &vector[]) const;
void AssignFeatureTitles(const string &titles[]);
int GetDataCount() const { return m_nSet; }
int GetFeatureCount() const { return m_dimension; }
int GetWidth() const { return m_xcells; }
int GetHeight() const { return m_ycells; }
string GetFeatureTitle(const uint index) const { return index < (uint)ArraySize(m_titles) ? m_titles[index] : NULL; }
string GetPatternTitle(const uint index) const { return index < (uint)ArraySize(m_set_titles) ? m_set_titles[index] : NULL; };
int FindFeature(const string text) const;
void GetFeatureBounds(const uint index, double &min, double &max) const { if(index < (uint)ArraySize(m_max)) { min = m_min[index]; max = m_max[index]; } else { min = max = 0; } }
// Learning
void SetValidationSection(const int splitOffset = 0);
bool SetFeatureMask(const int dim, const ulong bitmask);
double Train(const int epochs, const bool UseNormalization = true, const bool bShowProgress = false);
double TrainAndReframe(const int epochs, const bool bUseNormalization = true, const bool bShowProgress = false, const int maxReframes = 10, const int xincrement = 1, const int yincrement = 1);
virtual void Reframe(const int xincrement, const int yincrement); // growing SOM support stub
virtual void CalculateDistances();
virtual void Clusterize(const int clusterNumber);
virtual int Clusterize();
virtual int GetClusterCount() const;
virtual void GetCluster(const int clusterNumber, double &center[]);
int GetSize() const { return ArraySize(m_node); };
CSOMNode *GetNode(const int index) const { return &m_node[index]; };
CSOMNode *GetBestMatchingNode(const double &vector[]) const;
bool GetBestMatchingFeatures(const int node, double &result[]) const;
CSOMNode *GetBestMatchingFeatures(const double &vector[], double &result[]) const;
virtual void CalculateOutput(const double &vector[], const bool normalize = false);
virtual void SetLabel(const int cluster, const string label);
bool Save(const string filename) const;
bool Load(const string filename);
string GetID() const { return m_sID; }
virtual void ProgressUpdate() {};
static string canonic(const string filename, const string ext);
static string timestamp();
};
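// --- Usage sketch (a minimal example, not part of the library). ---------------
// The macro name CSOM_SELFTEST, the function name and the file names below are
// illustrative only; define the macro before including this header to compile it.
#ifdef CSOM_SELFTEST
void CSOMSelfTest()
  {
   CSOM som;
   if(!som.LoadPatterns("demo.csv")) return;                  // CSV: header line + labelled records
   if(!som.Init(10, 10)) return;                              // 10x10 hexagonal map
   som.SetValidationSection((int)(som.GetDataCount() * 0.8)); // hold out the last 20% for validation
   double nmse = som.Train(500, true, true);                  // 500 epochs, normalization and progress logs on
   som.CalculateDistances();                                  // fill the U-Matrix plane
   int clusters = som.Clusterize();                           // automatic clustering
   Print("NMSE=", nmse, ", clusters=", clusters);
   som.Save("demo");                                          // written as demo.som
  }
#endif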
void CSOM::CSOM()
{
Reset();
}
void CSOM::~CSOM()
{
Reset();
}
void CSOM::Reset()
{
m_initDone = false;
m_allocated = false;
m_sID = NULL;
m_iterations = 100;
m_learning_rate = 0.1;
m_nSet = 0;
m_dimension = 0;
m_xcells = 0;
m_ycells = 0;
m_validationOffset = 0;
m_featureMask = 0;
m_featureMaskSize = 0;
m_reframing = false;
ArrayResize(m_set, 0);
ArrayResize(m_titles, 0);
ArrayResize(m_set_titles, 0);
ArrayResize(m_node, 0);
ArrayResize(m_clusters, 0);
ArrayResize(m_labels, 0);
CSOMNode::SetFeatureMask(0, 0);
}
static string CSOM::canonic(const string filename, const string ext)
{
if(StringFind(filename, ext) != StringLen(filename) - StringLen(ext))
{
return filename + ext;
}
return filename;
}
static string CSOM::timestamp()
{
MqlDateTime mdt;
TimeLocal(mdt);
return StringFormat("-%04d%02d%02d-%02d%02d%02d", mdt.year, mdt.mon, mdt.day, mdt.hour, mdt.min, mdt.sec);
}
const string SIGNATURE = "MT5SOM.1";
bool CSOM::Save(const string filename) const
{
int h = FileOpen(canonic(filename, FILE_EXT_SOM), FILE_BIN|FILE_WRITE);
if(h != INVALID_HANDLE)
{
FileWriteString(h, SIGNATURE, StringLen(SIGNATURE) + 1);
FileWriteInteger(h, StringLen(m_sID));
FileWriteString(h, m_sID, StringLen(m_sID) + 1);
FileWriteInteger(h, m_xcells);
FileWriteInteger(h, m_ycells);
FileWriteInteger(h, m_hexCells);
FileWriteInteger(h, m_dimension);
for(int i = 0; i < m_dimension; i++)
{
FileWriteInteger(h, StringLen(m_titles[i]));
FileWriteString(h, m_titles[i], StringLen(m_titles[i]) + 1);
}
FileWriteArray(h, m_max);
FileWriteArray(h, m_min);
FileWriteArray(h, m_mean);
FileWriteArray(h, m_sigma);
FileWriteInteger(h, ArraySize(m_clusters));
FileWriteArray(h, m_clusters);
for(int i = 0; i < m_xcells * m_ycells; i++)
{
m_node[i].Save(h);
}
FileClose(h);
Print("Map file ", canonic(filename, FILE_EXT_SOM), " saved");
return true;
}
else
{
Print("FileOpen write failed ", GetLastError());
}
return false;
}
bool CSOM::Load(const string filename)
{
bool result = false;
int h = FileOpen(canonic(filename, FILE_EXT_SOM), FILE_BIN|FILE_READ|FILE_SHARE_READ|FILE_SHARE_WRITE);
if(h != INVALID_HANDLE)
{
string t = FileReadString(h, StringLen(SIGNATURE) + 1);
if(t == SIGNATURE)
{
int n = FileReadInteger(h);
m_sID = FileReadString(h, n + 1);
m_xcells = FileReadInteger(h);
m_ycells = FileReadInteger(h);
m_hexCells = FileReadInteger(h);
m_dimension = FileReadInteger(h);
ArrayResize(m_titles, m_dimension + EXTRA_DIMENSIONS);
for(int i = m_dimension; i < m_dimension + EXTRA_DIMENSIONS; i++) m_titles[i] = extras[i - m_dimension];
for(int i = 0; i < m_dimension; i++)
{
int len = FileReadInteger(h);
m_titles[i] = FileReadString(h, len + 1);
}
FileReadArray(h, m_max, 0, m_dimension + EXTRA_DIMENSIONS);
FileReadArray(h, m_min, 0, m_dimension + EXTRA_DIMENSIONS);
FileReadArray(h, m_mean, 0, m_dimension);
FileReadArray(h, m_sigma, 0, m_dimension);
int nc = FileReadInteger(h);
FileReadArray(h, m_clusters, 0, nc);
CSOMNode::FactoryInit(m_dimension);
ResetNodes();
for(int i = 0; i < m_xcells * m_ycells; i++)
{
m_node[i].Load(h);
}
result = true;
Print("Map file ", canonic(filename, FILE_EXT_SOM), " loaded");
m_initDone = result;
m_allocated = result;
}
else
{
Print("Unsupported file format");
}
FileClose(h);
}
else
{
Print("FileOpen read failed: ", canonic(filename, FILE_EXT_SOM), " ", GetLastError());
}
return result;
}
bool CSOM::Init(const int xc, const int yc, const bool bhex = true)
{
if(m_initDone)
{
Print("Warning: The net is already initialized, Init skipped");
return true;
}
m_hexCells = bhex;
m_xcells = xc;
m_ycells = yc;
if(m_sID == NULL) m_sID = "SOM" + timestamp();
m_initDone = true;
CSOMNode::FactoryInit(m_dimension);
return ResetNodes();
}
bool CSOM::ResetNodes()
{
// make sure old objects (if any) are destroyed (invokes destructors)
ArrayResize(m_node, 0);
// allocate node array (every one with the given dimension) (invokes constructors)
if(ArrayResize(m_node, m_xcells * m_ycells) == -1)
{
Print("ArrayResize failed: ", GetLastError());
return false;
}
int ind = 0;
for(int i = 0; i < m_xcells; i++)
{
for(int j = 0; j < m_ycells; j++)
{
m_node[ind++].InitNode(i, j);
}
}
return true;
}
bool CSOM::LoadPatterns(const string filename)
{
string fullname = canonic(filename, FILE_EXT_CSV);
ResetLastError();
int h = FileOpen(fullname, FILE_READ | FILE_ANSI);
if(h == INVALID_HANDLE)
{
Print("FileOpen error ", fullname, " : ", GetLastError());
return(false);
}
Print("FileOpen OK: ", fullname);
bool rez = ReadCSVData(h);
FileClose(h);
m_sID = StringSubstr(fullname, 0, StringLen(fullname) - StringLen(FILE_EXT_CSV)); // use name as prefix for object IDs
return rez;
}
bool CSOM::ReadCSVData(const int h)
{
string line[];
int n = 0;
while(!FileIsEnding(h))
{
string s = FileReadString(h);
if(StringLen(s) <= 0) continue;
n++;
if(n > 1)
{
StringSplit(s, ';', line);
double data[];
int dim = ArraySize(line) - 1;
if(m_dimension != dim)
{
Print("Dimension error in ", n, " line");
return false;
}
ArrayResize(data, dim);
for(int i = 0; i < dim; i++) data[i] = StringToDouble(line[i + 1]);
AddPattern(data, line[0]); // 0-th column is a label
}
else
{
// column names
StringSplit(s, ';', line);
int dim = ArraySize(line) - 1;
if(dim == 0)
{
Print("The format of this CSV-file is not supported, expecting ';' as separator");
return false;
}
if(m_initDone)
{
if(m_dimension > 0 && m_dimension != dim)
{
Print("Dimensions of initilized net and input data do not match each other: ", m_dimension, " vs ", dim);
return false;
}
}
m_dimension = dim;
Print("HEADER: (", m_dimension + 1, ") ", s);
ArrayResize(m_titles, m_dimension + EXTRA_DIMENSIONS);
for(int i = 0; i < m_dimension; i++) m_titles[i] = line[i + 1];
for(int i = m_dimension; i < m_dimension + EXTRA_DIMENSIONS; i++) m_titles[i] = extras[i - m_dimension];
}
}
return true;
}
void CSOM::AddPattern(const double &vector[], const string title)
{
m_nSet++;
ArrayResize(m_set, m_dimension * m_nSet);
ArrayResize(m_set_titles, m_nSet);
m_set_titles[m_nSet - 1] = title;
for(int i = 0; i < m_dimension; i++)
{
m_set[m_dimension * (m_nSet - 1) + i] = vector[i];
}
}
bool CSOM::GetPattern(const int index, double &vector[]) const
{
if(index >= 0 && index < m_nSet)
{
ArrayCopy(vector, m_set, 0, index * m_dimension, m_dimension);
return true;
}
return false;
}
void CSOM::AssignFeatureTitles(const string &titles[])
{
m_dimension = ArraySize(titles);
ArrayResize(m_titles, m_dimension + EXTRA_DIMENSIONS);
for(int i = 0; i < m_dimension; i++)
{
m_titles[i] = titles[i];
}
for(int i = m_dimension; i < m_dimension + EXTRA_DIMENSIONS; i++)
{
m_titles[i] = extras[i - m_dimension];
}
}
int CSOM::FindFeature(const string text) const
{
for(int i = 0; i < m_dimension; i++)
{
if(m_titles[i] == text) return i;
}
return -1;
}
bool CSOM::RemoveOutliers()
{
int removed = 0;
int size = ArraySize(m_set);
for(int i = m_nSet - 1; i >= 0; i--)
{
for(int j = 0; j < m_dimension; j++)
{
double v = m_set[m_dimension * i + j];
if(v < m_mean[j] - 3 * m_sigma[j]
|| v > m_mean[j] + 3 * m_sigma[j])
{
#ifdef SOM_VERBOSE
Print("Oulier ", i, " by ", m_titles[j], " removed: ", m_mean[j], ShortToString(0x00B1), m_sigma[j], " ", v);
#endif
#ifdef SOM_OUTLIERS_SOFT
if(v < m_mean[j] - 3 * m_sigma[j])
{
m_set[m_dimension * i + j] = m_mean[j] - 3 * m_sigma[j];
}
else
{
m_set[m_dimension * i + j] = m_mean[j] + 3 * m_sigma[j];
}
removed++;
#else
if(i < m_nSet - 1)
{
int tocopy = m_dimension * (m_nSet - i - 1 - removed);
int copied = ArrayCopy(m_set, m_set, m_dimension * i, m_dimension * (i + 1), tocopy);
if(copied != tocopy)
{
Print("ArrayCopy failed, copied elements: ", copied, " of ", tocopy);
}
// keep the record labels aligned with the shifted data rows
for(int t = i; t < m_nSet - 1 - removed; t++) m_set_titles[t] = m_set_titles[t + 1];
}
removed++;
break;
#endif
}
}
}
if(removed > 0)
{
#ifdef SOM_OUTLIERS_SOFT
Print("Outliers edited to 3 sigma: ", removed);
#else
if(m_validationOffset > 0)
{
double ratio = m_validationOffset * 1.0 / m_nSet;
m_validationOffset = (int)((m_nSet - removed) * ratio);
}
ArrayResize(m_set, size - removed * m_dimension);
m_nSet -= removed;
ArrayResize(m_set_titles, m_nSet);
Print("Outliers removed: ", removed, ", work vectors left: ", m_nSet, m_validationOffset > 0 ? (", new validation offset: " + (string)m_validationOffset): "");
#endif
}
else
{
Print("No outliers");
}
return removed > 0;
}
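// Collect per-feature statistics of the loaded data set: min/max are always
// gathered, and when normalization is enabled the mean and sigma are computed
// as well; they are later used by Normalize()/Denormalize() (z-score) and by
// RemoveOutliers() (3-sigma rule).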
void CSOM::InitNormalization(const bool normalization = true)
{
ArrayResize(m_max, m_dimension + EXTRA_DIMENSIONS);
ArrayResize(m_min, m_dimension + EXTRA_DIMENSIONS);
ArrayInitialize(m_max, 0);
ArrayInitialize(m_min, 0);
ArrayResize(m_mean, m_dimension);
ArrayResize(m_sigma, m_dimension);
m_allocated = true;
for(int j = 0; j < m_dimension; j++)
{
double maxv = -DBL_MAX;
double minv = +DBL_MAX;
if(normalization)
{
m_mean[j] = 0;
m_sigma[j] = 0;
}
for(int i = 0; i < m_nSet; i++)
{
double v = m_set[m_dimension * i + j];
if(v > maxv) maxv = v;
if(v < minv) minv = v;
if(normalization)
{
m_mean[j] += v;
m_sigma[j] += v * v;
}
}
m_max[j] = maxv;
m_min[j] = minv;
if(normalization && m_nSet > 0)
{
m_mean[j] /= m_nSet;
m_sigma[j] = MathSqrt(m_sigma[j] / m_nSet - m_mean[j] * m_mean[j]);
}
else
{
m_mean[j] = 0;
m_sigma[j] = 1;
}
#ifdef SOM_VERBOSE
Print(j, " ", m_titles[j], " min=", m_min[j], " max=", m_max[j], " mean=", m_mean[j], " sigma=", m_sigma[j]);
#endif
}
}
bool CSOM::SetFeatureMask(const int dim, const ulong bitmask)
{
if(dim < 0 || dim > m_dimension) return false;
m_featureMask = 0;
m_featureMaskSize = 0;
if(bitmask != 0)
{
m_featureMask = bitmask;
Print("Feature mask enabled:");
for(int i = 0; i < m_dimension; i++)
{
if((bitmask & ((ulong)1 << i)) != 0)
{
m_featureMaskSize++;
Print(m_titles[i]);
}
}
}
else
{
for(int i = 0; i < dim; i++)
{
m_featureMask |= ((ulong)1 << i);
}
m_featureMaskSize = dim;
}
CSOMNode::SetFeatureMask(dim == 0 ? m_dimension : dim, bitmask);
return true;
}
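// Usage sketch (the bit values are illustrative):
//   som.SetFeatureMask(3, 0);                                 // use only the first 3 features for distances
//   som.SetFeatureMask(0, ((ulong)1 << 0) | ((ulong)1 << 2)); // or pick features 0 and 2 via a bitmask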
void CSOM::SetValidationSection(const int splitOffset = 0)
{
if(splitOffset < 0 || splitOffset >= m_nSet) return;
m_validationOffset = splitOffset;
};
void CSOM::Shuffle()
{
double temp[];
ArrayResize(temp, m_dimension);
string title;
for(int i = 0; i < m_nSet; i++)
{
int ind1 = (int)(1.0 * m_nSet * rand() / 32768);
int ind2 = (int)(1.0 * m_nSet * rand() / 32768);
if(ind1 == ind2) continue;
ArrayCopy(temp, m_set, 0, m_dimension * ind1, m_dimension);
ArrayCopy(m_set, m_set, m_dimension * ind1, m_dimension * ind2, m_dimension);
ArrayCopy(m_set, temp, m_dimension * ind2, 0, m_dimension);
title = m_set_titles[ind1];
m_set_titles[ind1] = m_set_titles[ind2];
m_set_titles[ind2] = title;
}
}
void CSOM::Reframe(const int xincrement, const int yincrement)
{
m_xcells += xincrement;
m_ycells += yincrement;
m_reframing = true;
}
double CSOM::TrainAndReframe(const int epochs, const bool bUseNormalization = true, const bool bShowProgress = false, const int maxReframes = 10, const int xincrement = 1, const int yincrement = 1)
{
double nmse = 0;
double nextnmse = DBL_MAX;
m_reframing = false;
for(int i = 0; i < maxReframes; i++)
{
nmse = nextnmse;
ResetNodes();
nextnmse = Train(epochs, bUseNormalization, bShowProgress);
if(nextnmse < nmse)
{
if(i < maxReframes - 1)
{
Reframe(xincrement, yincrement);
}
else
{
Print("Maximum reframe number reached ", maxReframes);
}
}
else
{
Print("Exit map size increments due to increased MSE");
break;
}
}
return nextnmse;
}
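// Train the map for the requested number of epochs. Both the neighbourhood
// radius and the learning rate decay exponentially with the epoch number.
// If a validation section is set, training continues past the requested epochs
// until the validation NMSE stops improving twice in a row.
// Returns the overall NMSE (map quantization error divided by the data MSE).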
double CSOM::Train(const int epochs, const bool bUseNormalization = true, const bool bShowProgress = false)
{
if(bShowProgress) Print("Training ", m_xcells, "*", m_ycells, " ", (m_hexCells ? "hex" : "sqr"), " net starts");
m_iterations = epochs;
int iter = 0; // epoch number
double data[];
ArrayResize(data, m_dimension);
// calculate initial learning radius
m_map_radius = MathMax(m_xcells, m_ycells) / 2.0;
m_time_constant = 1.0 * m_iterations / MathLog(m_map_radius + 1);
#ifdef SOM_VERBOSE
Print("m_time_constant=", m_time_constant);
#endif
InitNormalization(bUseNormalization);
if(bUseNormalization && !m_reframing/* && (m_validationOffset == 0)*/)
{
if(RemoveOutliers())
{
InitNormalization(bUseNormalization); // redo normalization if outliers were removed
}
}
CalculateDataMSE(false); // this is a constant denominator for NMSE
int trainingCount = m_validationOffset > 0 ? m_validationOffset : m_nSet;
if(trainingCount <= 0)
{
Print("No data - no training");
return 0;
}
int total_nodes = ArraySize(m_node);
double nmse = 0;
double nextnmse = 0;
if(m_validationOffset > 0)
{
if(!m_reframing) Shuffle();
nmse = CalculateStats(false);
}
static uint lastTick = GetTickCount();
int increaseCount = 0; // consecutive validation NMSE increases (used for early stopping)
do
{
double neighbourhood_radius = m_map_radius * MathExp(-1.0 * iter / m_time_constant);
double WS = neighbourhood_radius * neighbourhood_radius;
double learning_rate = m_learning_rate * MathExp(-1.0 * iter / m_iterations); // decrease learning rate exponentially
// one epoch means training on all patterns selected in random order
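// every node is pulled towards the selected pattern with the Gaussian weight
// exp(-d^2 / (2 * WS)), where d is the grid distance to the winning node and
// WS = neighbourhood_radius^2, so the neighbourhood shrinks every epoch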
for(int k = 0; k < trainingCount && !IsStopped(); k++)
{
int ind = (int)(1.0 * trainingCount * rand() / 32768); // choose a record from data set randomly
ArrayCopy(data, m_set, 0, m_dimension * ind, m_dimension);
int winningnode = GetBestMatchingIndex(data); // find a node closest to the record, data is normalized inplace inside
if(winningnode == -1)
{
Print("bad node ", iter, " ", k);
ArrayPrint(data);
}
bool odd = ((winningnode % m_ycells) % 2) == 1;
for(int i = 0; i < total_nodes; i++)
{
bool odd_i = ((i % m_ycells) % 2) == 1;
double shiftx = 0;
if(m_hexCells && odd != odd_i)
{
if(odd && !odd_i)
{
shiftx = +0.5;
}
else // vice versa (!odd && odd_i)
{
shiftx = -0.5;
}
}
// distance from the winner to i-th node
double DistToNodeSqr = (m_node[winningnode].GetX() - (m_node[i].GetX() + shiftx)) * (m_node[winningnode].GetX() - (m_node[i].GetX() + shiftx))
+ (m_node[winningnode].GetY() - m_node[i].GetY()) * (m_node[winningnode].GetY() - m_node[i].GetY());
// uncommenting the following check speeds up the calculation at the expense
// of greater granularity (artifacts) in the spatial distribution of features
// if(DistToNodeSqr < 9 * WS) // it was 1 * WS, which is inappropriate for a hexagonal grid
{
double influence = MathExp(-DistToNodeSqr / (2 * WS));
m_node[i].AdjustWeights(data, learning_rate, influence);
}
}
}
if(m_validationOffset > 0 && iter >= m_iterations)
{
nextnmse = CalculateStats(false);
if(nextnmse < nmse)
{
nmse = nextnmse;
increaseCount = 0;
}
else
{
increaseCount++;
if(increaseCount > 1)
{
Print("Exit by validation error at iteration ", iter, "; NMSE[old]=", nmse, ", NMSE[new]=", nextnmse, ", set=", (m_nSet - m_validationOffset));
break;
}
}
}
if(GetTickCount() - lastTick > 1000)
{
lastTick = GetTickCount();
string comment;
if(bShowProgress)
{
StringConcatenate(comment, "Pass ", iter, " from ", m_iterations, " ", (int)(iter * 100.0 / m_iterations), "%", (m_validationOffset > 0 && iter >= m_iterations ? " NMSE=" + (string)nextnmse : ""));
Print(comment);
Comment(comment);
#ifdef SOM_VERBOSE
Print("L=", (float)learning_rate, " R=", (float)neighbourhood_radius);
#endif
ProgressUpdate();
}
}
iter++;
}
while((iter < m_iterations || m_validationOffset > 0) && !IsStopped());
nmse = CalculateStats();
if(bShowProgress)
{
Print("Overall NMSE=", nmse);
string comment;
StringConcatenate(comment, (IsStopped() ? "Training cancelled" : ((m_validationOffset > 0 && iter >= m_iterations) ? "Training stopped by MSE" : "Training completed")), " at pass ", iter, ", NMSE=", nmse);
Comment(comment);
Print(comment);
}
// prepare for visualization
AnalizePatternStats();
return nmse;
}
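// Z-score normalization of a single pattern: every component is transformed
// as x' = (x - mean) / sigma; components with zero sigma (constant features)
// are left unchanged. Denormalize() below applies the inverse transform.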
void CSOM::Normalize(double &vector[]) const
{
for(int k = 0; k < m_dimension; k++)
{
if(m_sigma[k] == 0)
{
/*
static int component = -1;
if(component != k)
{
Print("Sigma is 0 for component ", k, " of ", m_dimension, ", mean=", m_mean[k]);
component = k;
}
*/
}
else
{
vector[k] = (vector[k] - m_mean[k]) / m_sigma[k];
}
}
}
void CSOM::Denormalize(double &vector[]) const
{
for(int k = 0; k < m_dimension; k++)
{
vector[k] = vector[k] * m_sigma[k] + m_mean[k];
}
}
int CSOM::GetBestMatchingIndex(double &vector[]) const
{
Normalize(vector); // vector is mutated due to (optional) normalization
return GetBestMatchingIndexNormalized(vector);
}
CSOMNode *CSOM::GetBestMatchingNode(const double &vector[]) const
{
double data[];
ArrayCopy(data, vector);
int index = GetBestMatchingIndex(data);
if(index > -1)
{
return &m_node[index];
}
return NULL;
};
bool CSOM::GetBestMatchingFeatures(const int node, double &result[]) const
{
if(node < 0 || node >= ArraySize(m_node)) return false;
m_node[node].GetCodeVector(result);
Denormalize(result);
return true;
}
CSOMNode *CSOM::GetBestMatchingFeatures(const double &vector[], double &result[]) const
{
CSOMNode *node = GetBestMatchingNode(vector);
if(node != NULL)
{
node.GetCodeVector(result);
Denormalize(result);
}
return node;
}
int CSOM::GetBestMatchingIndexNormalized(const double &vector[]) const
{
int min_ind = -1;
double min_dist = DBL_MAX;
int total_nodes = ArraySize(m_node);
for(int i = 0; i < total_nodes; i++)
{
double d = m_node[i].CalculateDistance(vector);
if(d < min_dist)
{
min_dist = d;
min_ind = i;
}
}
return min_ind;
}
double CSOM::AddPatternStats(const double &data[], const bool complete = true)
{
static double vector[];
ArrayCopy(vector, data);
int ind = GetBestMatchingIndex(vector);
// hits will allow us to calculate average (m) and sigma (s) for every cell in every plane/dimension
// from n training patterns mapped to the cell,
// then ShowPattern can display m, s, n in text marks
// NB. when a cell W is winning, all neighbouring cells N with averages W(N)
// laying inside W(m) +/- W(s) are candidates as well
if(complete) m_node[ind].RegisterPatternHit(vector);
double code[];
m_node[ind].GetCodeVector(code);
Denormalize(code);
double mse = 0;
int dimension = m_featureMask != 0 ? m_featureMaskSize : m_dimension;
for(int i = 0; i < m_dimension; i++)
{
if(m_featureMask == 0 || ((m_featureMask & ((ulong)1 << i)) != 0))
{
mse += (data[i] - code[i]) * (data[i] - code[i]);
}
}
mse /= dimension;
return mse;
}
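// Helper template: visits the immediate grid neighbours of a node
// (4 for square cells, 6 for hexagonal cells, with edge rows handled explicitly)
// and accumulates a result; specialized below for U-Matrix distances
// and for cluster propagation.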
template<typename T>
class Neighbourhood
{
protected:
int neighbours[];
int nbhsize;
bool hex;
int m_ycells;
public:
Neighbourhood(const bool _hex, const int ysize)
{
hex = _hex;
m_ycells = ysize;
if(hex)
{
nbhsize = NBH_HEXAGONAL_SIZE;
ArrayResize(neighbours, NBH_HEXAGONAL_SIZE);
neighbours[0] = -1; // up (visually)
neighbours[1] = +1; // down (visually)
neighbours[2] = -m_ycells; // left
neighbours[3] = +m_ycells; // right
/* 4 & 5, applied dynamically in the loop below
// odd row
neighbours[4] = -m_ycells - 1; // left-up
neighbours[5] = -m_ycells + 1; // left-down
// even row
neighbours[4] = +m_ycells - 1; // right-up
neighbours[5] = +m_ycells + 1; // right-down
*/
}
else
{
nbhsize = NBH_SQUARE_SIZE;
ArrayResize(neighbours, NBH_SQUARE_SIZE);
neighbours[0] = -1; // up (visually)
neighbours[1] = +1; // down (visually)
neighbours[2] = -m_ycells; // left
neighbours[3] = +m_ycells; // right
}
}
~Neighbourhood()
{
ArrayResize(neighbours, 0);
}
T loop(const int ind, const CSOMNode &p_node[])
{
int nodes = ArraySize(p_node);
int j = ind % m_ycells;
if(hex)
{
int oddy = ((j % 2) == 1) ? -1 : +1;
neighbours[4] = oddy * m_ycells - 1;
neighbours[5] = oddy * m_ycells + 1;
}
reset();
for(int k = 0; k < nbhsize; k++)
{
if(ind + neighbours[k] >= 0 && ind + neighbours[k] < nodes)
{
// skip wrapping edges
if(j == 0) // upper row
{
if(k == 0 || k == 4) continue;
}
else if(j == m_ycells - 1) // bottom row
{
if(k == 1 || k == 5) continue;
}
iterate(p_node[ind], p_node[ind + neighbours[k]]);
}
}
return getResult();
}
virtual void reset() = 0;
virtual void iterate(const CSOMNode &node1, const CSOMNode &node2) = 0;
virtual T getResult() const = 0;
};
class UMatrixNeighbourhood: public Neighbourhood<double>
{
private:
int n;
double d;
public:
UMatrixNeighbourhood(const bool _hex, const int ysize): Neighbourhood(_hex, ysize)
{
}
virtual void reset() override
{
n = 0;
d = 0.0;
}
virtual void iterate(const CSOMNode &node1, const CSOMNode &node2) override
{
d += node1.CalculateDistance(&node2);
n++;
}
virtual double getResult() const override
{
return d / n;
}
};
class ClusterNeighbourhood: public Neighbourhood<int>
{
private:
int cluster;
double ridge;
public:
ClusterNeighbourhood(const bool _hex, const int ysize): Neighbourhood(_hex, ysize)
{
}
virtual void reset() override
{
cluster = -1;
ridge = DBL_MAX;
}
virtual void iterate(const CSOMNode &node1, const CSOMNode &node2) override
{
int x = node2.GetCluster();
if(x > -1)
{
double y = node1.CalculateDistance(&node2);
if(cluster == -1 || ((x < cluster) && (y < ridge || GlobalVariableCheck("SOM_NO_RIDGE"))))
{
cluster = x;
ridge = y;
}
}
}
virtual int getResult() const override
{
return cluster;
}
};
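// Fill the U-Matrix plane: for every node, store the average distance between
// its code vector and the code vectors of its immediate neighbours.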
void CSOM::CalculateDistances()
{
if(!m_allocated) return;
UMatrixNeighbourhood umnh(m_hexCells, m_ycells);
for(int i = 0; i < m_xcells * m_ycells; i++)
{
double d = umnh.loop(i, m_node);
if(d > m_max[DIM_UMATRIX])
{
m_max[DIM_UMATRIX] = d;
}
m_node[i].SetDistance(d);
}
}
void CSOM::CalculateOutput(const double &vector[], const bool normalize = false)
{
if(!m_allocated) return;
double temp[];
ArrayCopy(temp, vector);
if(normalize) Normalize(temp);
m_min[DIM_OUTPUT] = DBL_MAX;
m_max[DIM_OUTPUT] = -DBL_MAX;
for(int i = 0; i < ArraySize(m_node); i++)
{
double x = m_node[i].CalculateOutput(temp);
if(x < m_min[DIM_OUTPUT]) m_min[DIM_OUTPUT] = x;
if(x > m_max[DIM_OUTPUT]) m_max[DIM_OUTPUT] = x;
}
}
int CSOM::GetClusterCount() const
{
return ArraySize(m_clusters) / m_dimension;
}
void CSOM::SetLabel(const int cluster, const string label)
{
int nclusters = ArraySize(m_clusters) / m_dimension;
if(ArraySize(m_labels) != nclusters) ArrayResize(m_labels, nclusters);
if(cluster < nclusters)
{
m_labels[cluster] = label;
}
}
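// Automatic clustering (no preset cluster count): nodes are sorted by the
// quality measure distance * sqrt(MSE) in ascending order, and each node either
// joins a cluster already present among its neighbours or starts a new one.
// Returns the number of clusters found.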
int CSOM::Clusterize()
{
double array[][2];
int n = m_xcells * m_ycells;
ArrayResize(array, n);
for(int i = 0; i < n; i++)
{
if(m_node[i].GetHitsCount() > 0)
{
array[i][0] = m_node[i].GetDistance() * MathSqrt(m_node[i].GetMSE());
}
else
{
array[i][0] = DBL_MAX;
}
array[i][1] = i;
m_node[i].SetCluster(-1);
}
ArraySort(array);
ClusterNeighbourhood clnh(m_hexCells, m_ycells);
int count = 0; // number of clusters
ArrayResize(m_clusters, 0);
for(int i = 0; i < n; i++)
{
// skip if already assigned
if(m_node[(int)array[i][1]].GetCluster() > -1) continue;
// check if the current node is adjacent to any existing cluster
int r = clnh.loop((int)array[i][1], m_node);
if(r > -1) // a neighbour belongs to a cluster already
{
m_node[(int)array[i][1]].SetCluster(r);
}
else // we need a new cluster
{
ArrayResize(m_clusters, (count + 1) * m_dimension);
double vector[];
m_node[(int)array[i][1]].GetCodeVector(vector);
ArrayCopy(m_clusters, vector, count * m_dimension, 0, m_dimension);
m_node[(int)array[i][1]].SetCluster(count++);
}
}
return count;
}
void CSOM::GetCluster(const int clusterNumber, double &center[])
{
ArrayCopy(center, m_clusters, 0, clusterNumber * m_dimension, m_dimension);
Denormalize(center);
}
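// Clustering into a fixed number of clusters: K-Means (ALGLIB CKMeans) applied
// to the nodes' code vectors, with KMEANS_RETRY_NUMBER random restarts.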
void CSOM::Clusterize(const int clusterNumber)
{
int count = m_xcells * m_ycells;
CMatrixDouble xy(count, m_dimension);
int info;
CMatrixDouble clusters;
int membership[];
double weights[];
for(int i = 0; i < count; i++)
{
m_node[i].GetCodeVector(weights);
xy[i] = weights;
}
CKMeans::KMeansGenerate(xy, count, m_dimension, clusterNumber, KMEANS_RETRY_NUMBER, info, clusters, membership);
Print("KMeans result: ", info);
if(info == 1) // ok
{
for(int i = 0; i < m_xcells * m_ycells; i++)
{
m_node[i].SetCluster(membership[i]);
}
#ifdef SOM_VERBOSE
Print("Clusters:");
#endif
ArrayResize(m_clusters, clusterNumber * m_dimension);
for(int j = 0; j < clusterNumber; j++)
{
for(int i = 0; i < m_dimension; i++)
{
m_clusters[j * m_dimension + i] = clusters[i][j];
}
#ifdef SOM_VERBOSE
ArrayPrint(m_clusters, _Digits, ",", j * m_dimension, m_dimension);
#endif
}
}
}
void CSOM::AnalizePatternStats()
{
for(int i = 0; i < EXTRA_DIMENSIONS; i++)
{
m_min[m_dimension + i] = 0;
m_max[m_dimension + i] = 0;
}
for(int i = 0; i < m_xcells * m_ycells; i++)
{
int n = m_node[i].GetHitsCount();
if(n > m_max[DIM_HITCOUNT])
{
m_max[DIM_HITCOUNT] = n;
}
double u = m_node[i].GetMSE();
if(n > 0 && u > 0)
{
if(u > m_max[DIM_NODEMSE])
{
m_max[DIM_NODEMSE] = (double)u;
}
if(u < m_min[DIM_NODEMSE] || m_min[DIM_NODEMSE] == 0)
{
m_min[DIM_NODEMSE] = (double)u;
}
}
u = m_node[i].GetOutput();
if(u > m_max[DIM_OUTPUT])
{
m_max[DIM_OUTPUT] = u;
}
if(u < m_min[DIM_OUTPUT] || m_min[DIM_OUTPUT] == 0)
{
m_min[DIM_OUTPUT] = (double)u;
}
}
}
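// Baseline error of the data set: per-pattern mean squared deviation from the
// per-feature means (respecting the feature mask), summed over the patterns
// (the full set or the validation part, depending on the flag).
// Used as the denominator of NMSE.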
void CSOM::CalculateDataMSE(const bool complete = true)
{
double data[];
int dimension = m_featureMask != 0 ? m_featureMaskSize : m_dimension;
m_dataMSE = 0.0;
for(int i = complete ? 0 : m_validationOffset; i < m_nSet; i++)
{
ArrayCopy(data, m_set, 0, m_dimension * i, m_dimension);
double mse = 0;
for(int k = 0; k < m_dimension; k++)
{
if(m_featureMask == 0 || ((m_featureMask & ((ulong)1 << k)) != 0))
{
mse += (data[k] - m_mean[k]) * (data[k] - m_mean[k]);
}
}
mse /= dimension;
m_dataMSE += mse;
}
}
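// Quantization error of the map: the per-pattern MSE against the best matching
// node, summed over the data set (or over the validation part only, when
// complete is false) and divided by the data MSE, i.e. NMSE.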
double CSOM::CalculateStats(const bool complete = true)
{
double data[];
ArrayResize(data, m_dimension);
double trainedMSE = 0.0;
for(int i = complete ? 0 : m_validationOffset; i < m_nSet; i++)
{
ArrayCopy(data, m_set, 0, m_dimension * i, m_dimension);
trainedMSE += AddPatternStats(data, complete);
}
if(complete && (m_validationOffset > 0)) CalculateDataMSE(); // update m_dataMSE
const double nmse = trainedMSE / m_dataMSE;
return nmse;
}