oslib/tst/PST/P2_MQL5SVM_7603/MQL5/Include/CSOM/CSOM.mqh

//+------------------------------------------------------------------+
//|                                                         CSOM.mqh |
//|                               Copyright (c) 2018-2019, Marketeer |
//|                          https://www.mql5.com/en/users/marketeer |
//|                           https://www.mql5.com/ru/articles/5472/ |
//|                           https://www.mql5.com/ru/articles/5473/ |
//+------------------------------------------------------------------+
#include <CSOM/CSOMNode.mqh>
#include <Math/Alglib/dataanalysis.mqh>
#define EXTRA_DIMENSIONS 5
#define DIM_HITCOUNT (m_dimension + 0)
#define DIM_UMATRIX (m_dimension + 1)
#define DIM_NODEMSE (m_dimension + 2) // quantization errors per node: average variance (square of standard deviation)
#define DIM_CLUSTERS (m_dimension + 3)
#define DIM_OUTPUT (m_dimension + 4)
#define NBH_SQUARE_SIZE 4
#define NBH_HEXAGONAL_SIZE 6
#define KMEANS_RETRY_NUMBER 10
#define FILE_EXT_SOM ".som"
#define FILE_EXT_CSV ".csv"
const string extras[EXTRA_DIMENSIONS] = {"HitCount", "U-matrix", "Error", "Clusters", "Output"};
class CSOM
{
protected:
// data structure
int m_xcells; // number of map cells (nodes) by x
int m_ycells; // and by y
CSOMNode m_node[]; // array of Kohonen map nodes
int m_nSet; // number of data records
double m_set[]; // array of data records
double m_max[]; // max values in every dimension
double m_min[]; // min values in every dimension
double m_mean[];
double m_sigma[];
int m_dimension; // number of elements (columns) in every record = number of planes in the map
// special additional dimensions are added automatically, such as:
// Population (hit count), U-matrix (distances), Clusters, etc
double m_dataMSE;
bool m_initDone;
bool m_allocated;
double m_map_radius;
double m_time_constant;
double m_learning_rate;
int m_iterations;
string m_titles[];
string m_set_titles[];
bool m_hexCells; // hexagonal cells
string m_sID; // object name prefix - TODO: optional setter needed
double m_clusters[];
string m_labels[];
int m_validationOffset;
ulong m_featureMask;
int m_featureMaskSize;
bool m_reframing;
protected:
virtual bool ReadCSVData(const int h);
virtual int GetBestMatchingIndexNormalized(const double &vector[]) const;
virtual int GetBestMatchingIndex(double &vector[]) const;
virtual void InitNormalization(const bool normalization = true);
virtual bool RemoveOutliers();
virtual void Normalize(double &vector[]) const;
virtual void Denormalize(double &vector[]) const;
virtual double AddPatternStats(const double &vector[], const bool complete = true);
virtual void AnalizePatternStats();
virtual double CalculateStats(const bool complete = true);
virtual bool ResetNodes();
virtual void CalculateDataMSE(const bool complete = true);
public:
CSOM();
~CSOM();
virtual bool Init(const int xc, const int yc, const bool bhex = true);
virtual void Reset();
// Data loading from csv (first line is a header, first column contains records' labels)
bool LoadPatterns(const string filename);
void Shuffle(); // random shuffling of patterns is important when validation is enabled
void AddPattern(const double &vector[], const string title);
bool GetPattern(const int index, double &vector[]) const;
void AssignFeatureTitles(const string &titles[]);
int GetDataCount() const { return m_nSet; }
int GetFeatureCount() const { return m_dimension; }
int GetWidth() const { return m_xcells; }
int GetHeight() const { return m_ycells; }
string GetFeatureTitle(const uint index) const { return index < (uint)ArraySize(m_titles) ? m_titles[index] : NULL; }
string GetPatternTitle(const uint index) const { return index < (uint)ArraySize(m_set_titles) ? m_set_titles[index] : NULL; };
int FindFeature(const string text) const;
void GetFeatureBounds(const uint index, double &min, double &max) const { if(index < (uint)ArraySize(m_max)) { min = m_min[index]; max = m_max[index]; } else { min = max = 0; } }
// Learning
void SetValidationSection(const int splitOffset = 0);
bool SetFeatureMask(const int dim, const ulong bitmask);
double Train(const int epochs, const bool UseNormalization = true, const bool bShowProgress = false);
double TrainAndReframe(const int epochs, const bool bUseNormalization = true, const bool bShowProgress = false, const int maxReframes = 10, const int xincrement = 1, const int yincrement = 1);
virtual void Reframe(const int xincrement, const int yincrement); // growing SOM support stub
virtual void CalculateDistances();
virtual void Clusterize(const int clusterNumber);
virtual int Clusterize();
virtual int GetClusterCount() const;
virtual void GetCluster(const int clusterNumber, double &center[]);
int GetSize() const { return ArraySize(m_node); };
CSOMNode *GetNode(const int index) const { return &m_node[index]; };
CSOMNode *GetBestMatchingNode(const double &vector[]) const;
bool GetBestMatchingFeatures(const int node, double &result[]) const;
CSOMNode *GetBestMatchingFeatures(const double &vector[], double &result[]) const;
virtual void CalculateOutput(const double &vector[], const bool normalize = false);
virtual void SetLabel(const int cluster, const string label);
bool Save(const string filename) const;
bool Load(const string filename);
string GetID() const { return m_sID; }
virtual void ProgressUpdate() {};
static string canonic(const string filename, const string ext);
static string timestamp();
};
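// --- Usage sketch (a minimal example, not part of the library). ---------------
// The macro name CSOM_SELFTEST, the function name and the file names below are
// illustrative only; define the macro before including this header to compile it.
#ifdef CSOM_SELFTEST
void CSOMSelfTest()
  {
   CSOM som;
   if(!som.LoadPatterns("demo.csv")) return;                  // CSV: header line + labelled records
   if(!som.Init(10, 10)) return;                              // 10x10 hexagonal map
   som.SetValidationSection((int)(som.GetDataCount() * 0.8)); // hold out the last 20% for validation
   double nmse = som.Train(500, true, true);                  // 500 epochs, normalization and progress logs on
   som.CalculateDistances();                                  // fill the U-Matrix plane
   int clusters = som.Clusterize();                           // automatic clustering
   Print("NMSE=", nmse, ", clusters=", clusters);
   som.Save("demo");                                          // written as demo.som
  }
#endif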
void CSOM::CSOM()
{
Reset();
}
void CSOM::~CSOM()
{
Reset();
}
void CSOM::Reset()
{
m_initDone = false;
m_allocated = false;
m_sID = NULL;
m_iterations = 100;
m_learning_rate = 0.1;
m_nSet = 0;
m_dimension = 0;
m_xcells = 0;
m_ycells = 0;
m_validationOffset = 0;
m_featureMask = 0;
m_featureMaskSize = 0;
m_reframing = false;
ArrayResize(m_set, 0);
ArrayResize(m_titles, 0);
ArrayResize(m_set_titles, 0);
ArrayResize(m_node, 0);
ArrayResize(m_clusters, 0);
ArrayResize(m_labels, 0);
CSOMNode::SetFeatureMask(0, 0);
}
static string CSOM::canonic(const string filename, const string ext)
{
if(StringFind(filename, ext) != StringLen(filename) - StringLen(ext))
{
return filename + ext;
}
return filename;
}
static string CSOM::timestamp()
{
MqlDateTime mdt;
TimeLocal(mdt);
return StringFormat("-%04d%02d%02d-%02d%02d%02d", mdt.year, mdt.mon, mdt.day, mdt.hour, mdt.min, mdt.sec);
}
const string SIGNATURE = "MT5SOM.1";
bool CSOM::Save(const string filename) const
{
int h = FileOpen(canonic(filename, FILE_EXT_SOM), FILE_BIN|FILE_WRITE);
if(h != INVALID_HANDLE)
{
FileWriteString(h, SIGNATURE, StringLen(SIGNATURE) + 1);
FileWriteInteger(h, StringLen(m_sID));
FileWriteString(h, m_sID, StringLen(m_sID) + 1);
FileWriteInteger(h, m_xcells);
FileWriteInteger(h, m_ycells);
FileWriteInteger(h, m_hexCells);
FileWriteInteger(h, m_dimension);
for(int i = 0; i < m_dimension; i++)
{
FileWriteInteger(h, StringLen(m_titles[i]));
FileWriteString(h, m_titles[i], StringLen(m_titles[i]) + 1);
}
FileWriteArray(h, m_max);
FileWriteArray(h, m_min);
FileWriteArray(h, m_mean);
FileWriteArray(h, m_sigma);
FileWriteInteger(h, ArraySize(m_clusters));
FileWriteArray(h, m_clusters);
for(int i = 0; i < m_xcells * m_ycells; i++)
{
m_node[i].Save(h);
}
FileClose(h);
Print("Map file ", canonic(filename, FILE_EXT_SOM), " saved");
return true;
}
else
{
Print("FileOpen write failed ", GetLastError());
}
return false;
}
bool CSOM::Load(const string filename)
{
bool result = false;
int h = FileOpen(canonic(filename, FILE_EXT_SOM), FILE_BIN|FILE_READ|FILE_SHARE_READ|FILE_SHARE_WRITE);
if(h != INVALID_HANDLE)
{
string t = FileReadString(h, StringLen(SIGNATURE) + 1);
if(t == SIGNATURE)
{
int n = FileReadInteger(h);
m_sID = FileReadString(h, n + 1);
m_xcells = FileReadInteger(h);
m_ycells = FileReadInteger(h);
m_hexCells = FileReadInteger(h);
m_dimension = FileReadInteger(h);
ArrayResize(m_titles, m_dimension + EXTRA_DIMENSIONS);
for(int i = m_dimension; i < m_dimension + EXTRA_DIMENSIONS; i++) m_titles[i] = extras[i - m_dimension];
for(int i = 0; i < m_dimension; i++)
{
int len = FileReadInteger(h);
m_titles[i] = FileReadString(h, len + 1);
}
FileReadArray(h, m_max, 0, m_dimension + EXTRA_DIMENSIONS);
FileReadArray(h, m_min, 0, m_dimension + EXTRA_DIMENSIONS);
FileReadArray(h, m_mean, 0, m_dimension);
FileReadArray(h, m_sigma, 0, m_dimension);
int nc = FileReadInteger(h);
FileReadArray(h, m_clusters, 0, nc);
CSOMNode::FactoryInit(m_dimension);
ResetNodes();
for(int i = 0; i < m_xcells * m_ycells; i++)
{
m_node[i].Load(h);
}
result = true;
Print("Map file ", canonic(filename, FILE_EXT_SOM), " loaded");
m_initDone = result;
m_allocated = result;
}
else
{
Print("Unsupported file format");
}
FileClose(h);
}
else
{
Print("FileOpen read failed: ", canonic(filename, FILE_EXT_SOM), " ", GetLastError());
}
return result;
}
bool CSOM::Init(const int xc, const int yc, const bool bhex = true)
{
if(m_initDone)
{
Print("Warning: The net is already initialized, Init skipped");
return true;
}
m_hexCells = bhex;
m_xcells = xc;
m_ycells = yc;
if(m_sID == NULL) m_sID = "SOM" + timestamp();
m_initDone = true;
CSOMNode::FactoryInit(m_dimension);
return ResetNodes();
}
bool CSOM::ResetNodes()
{
// make sure old objects (if any) are destroyed (invokes destructors)
ArrayResize(m_node, 0);
// allocate node array (every one with the given dimension) (invokes constructors)
if(ArrayResize(m_node, m_xcells * m_ycells) == -1)
{
Print("ArrayResize failed: ", GetLastError());
return false;
}
int ind = 0;
for(int i = 0; i < m_xcells; i++)
{
for(int j = 0; j < m_ycells; j++)
{
m_node[ind++].InitNode(i, j);
}
}
return true;
}
bool CSOM::LoadPatterns(const string filename)
{
string fullname = canonic(filename, FILE_EXT_CSV);
ResetLastError();
int h = FileOpen(fullname, FILE_READ | FILE_ANSI);
if(h == INVALID_HANDLE)
{
Print("FileOpen error ", fullname, " : ", GetLastError());
return(false);
}
Print("FileOpen OK: ", fullname);
bool rez = ReadCSVData(h);
FileClose(h);
m_sID = StringSubstr(fullname, 0, StringLen(fullname) - StringLen(FILE_EXT_CSV)); // use name as prefix for object IDs
return rez;
}
bool CSOM::ReadCSVData(const int h)
{
string line[];
int n = 0;
while(!FileIsEnding(h))
{
string s = FileReadString(h);
if(StringLen(s) <= 0) continue;
n++;
if(n > 1)
{
StringSplit(s, ';', line);
double data[];
int dim = ArraySize(line) - 1;
if(m_dimension != dim)
{
Print("Dimension error in ", n, " line");
return false;
}
ArrayResize(data, dim);
for(int i = 0; i < dim; i++) data[i] = StringToDouble(line[i + 1]);
AddPattern(data, line[0]); // 0-th column is a label
}
else
{
// column names
StringSplit(s, ';', line);
int dim = ArraySize(line) - 1;
if(dim == 0)
{
Print("The format of this CSV-file is not supported, expecting ';' as separator");
return false;
}
if(m_initDone)
{
if(m_dimension > 0 && m_dimension != dim)
{
Print("Dimensions of initilized net and input data do not match each other: ", m_dimension, " vs ", dim);
return false;
}
}
m_dimension = dim;
Print("HEADER: (", m_dimension + 1, ") ", s);
ArrayResize(m_titles, m_dimension + EXTRA_DIMENSIONS);
for(int i = 0; i < m_dimension; i++) m_titles[i] = line[i + 1];
for(int i = m_dimension; i < m_dimension + EXTRA_DIMENSIONS; i++) m_titles[i] = extras[i - m_dimension];
}
}
return true;
}
void CSOM::AddPattern(const double &vector[], const string title)
{
m_nSet++;
ArrayResize(m_set, m_dimension * m_nSet);
ArrayResize(m_set_titles, m_nSet);
m_set_titles[m_nSet - 1] = title;
for(int i = 0; i < m_dimension; i++)
{
m_set[m_dimension * (m_nSet - 1) + i] = vector[i];
}
}
bool CSOM::GetPattern(const int index, double &vector[]) const
{
if(index >= 0 && index < m_nSet)
{
ArrayCopy(vector, m_set, 0, index * m_dimension, m_dimension);
return true;
}
return false;
}
void CSOM::AssignFeatureTitles(const string &titles[])
{
m_dimension = ArraySize(titles);
ArrayResize(m_titles, m_dimension + EXTRA_DIMENSIONS);
for(int i = 0; i < m_dimension; i++)
{
m_titles[i] = titles[i];
}
for(int i = m_dimension; i < m_dimension + EXTRA_DIMENSIONS; i++)
{
m_titles[i] = extras[i - m_dimension];
}
}
int CSOM::FindFeature(const string text) const
{
for(int i = 0; i < m_dimension; i++)
{
if(m_titles[i] == text) return i;
}
return -1;
}
bool CSOM::RemoveOutliers()
{
int removed = 0;
int size = ArraySize(m_set);
for(int i = m_nSet - 1; i >= 0; i--)
{
for(int j = 0; j < m_dimension; j++)
{
double v = m_set[m_dimension * i + j];
if(v < m_mean[j] - 3 * m_sigma[j]
|| v > m_mean[j] + 3 * m_sigma[j])
{
#ifdef SOM_VERBOSE
Print("Oulier ", i, " by ", m_titles[j], " removed: ", m_mean[j], ShortToString(0x00B1), m_sigma[j], " ", v);
#endif
#ifdef SOM_OUTLIERS_SOFT
if(v < m_mean[j] - 3 * m_sigma[j])
{
m_set[m_dimension * i + j] = m_mean[j] - 3 * m_sigma[j];
}
else
{
m_set[m_dimension * i + j] = m_mean[j] + 3 * m_sigma[j];
}
removed++;
#else
if(i < m_nSet - 1)
{
int tocopy = m_dimension * (m_nSet - i - 1 - removed);
int copied = ArrayCopy(m_set, m_set, m_dimension * i, m_dimension * (i + 1), tocopy);
if(copied != tocopy)
{
Print("ArrayCopy failed, copied elements: ", copied, " of ", tocopy);
}
// keep the record labels aligned with the shifted data rows
for(int t = i; t < m_nSet - 1 - removed; t++) m_set_titles[t] = m_set_titles[t + 1];
}
removed++;
break;
#endif
}
}
}
if(removed > 0)
{
#ifdef SOM_OUTLIERS_SOFT
Print("Outliers edited to 3 sigma: ", removed);
#else
if(m_validationOffset > 0)
{
double ratio = m_validationOffset * 1.0 / m_nSet;
m_validationOffset = (int)((m_nSet - removed) * ratio);
}
ArrayResize(m_set, size - removed * m_dimension);
m_nSet -= removed;
ArrayResize(m_set_titles, m_nSet);
Print("Outliers removed: ", removed, ", work vectors left: ", m_nSet, m_validationOffset > 0 ? (", new validation offset: " + (string)m_validationOffset): "");
#endif
}
else
{
Print("No outliers");
}
return removed > 0;
}
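// Collect per-feature statistics of the loaded data set: min/max are always
// gathered, and when normalization is enabled the mean and sigma are computed
// as well; they are later used by Normalize()/Denormalize() (z-score) and by
// RemoveOutliers() (3-sigma rule).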
void CSOM::InitNormalization(const bool normalization = true)
{
ArrayResize(m_max, m_dimension + EXTRA_DIMENSIONS);
ArrayResize(m_min, m_dimension + EXTRA_DIMENSIONS);
ArrayInitialize(m_max, 0);
ArrayInitialize(m_min, 0);
ArrayResize(m_mean, m_dimension);
ArrayResize(m_sigma, m_dimension);
m_allocated = true;
for(int j = 0; j < m_dimension; j++)
{
double maxv = -DBL_MAX;
double minv = +DBL_MAX;
if(normalization)
{
m_mean[j] = 0;
m_sigma[j] = 0;
}
for(int i = 0; i < m_nSet; i++)
{
double v = m_set[m_dimension * i + j];
if(v > maxv) maxv = v;
if(v < minv) minv = v;
if(normalization)
{
m_mean[j] += v;
m_sigma[j] += v * v;
}
}
m_max[j] = maxv;
m_min[j] = minv;
if(normalization && m_nSet > 0)
{
m_mean[j] /= m_nSet;
m_sigma[j] = MathSqrt(m_sigma[j] / m_nSet - m_mean[j] * m_mean[j]);
}
else
{
m_mean[j] = 0;
m_sigma[j] = 1;
}
#ifdef SOM_VERBOSE
Print(j, " ", m_titles[j], " min=", m_min[j], " max=", m_max[j], " mean=", m_mean[j], " sigma=", m_sigma[j]);
#endif
}
}
bool CSOM::SetFeatureMask(const int dim, const ulong bitmask)
{
if(dim < 0 || dim > m_dimension) return false;
m_featureMask = 0;
m_featureMaskSize = 0;
if(bitmask != 0)
{
m_featureMask = bitmask;
Print("Feature mask enabled:");
for(int i = 0; i < m_dimension; i++)
{
if((bitmask & ((ulong)1 << i)) != 0)
{
m_featureMaskSize++;
Print(m_titles[i]);
}
}
}
else
{
for(int i = 0; i < dim; i++)
{
m_featureMask |= ((ulong)1 << i);
}
m_featureMaskSize = dim;
}
CSOMNode::SetFeatureMask(dim == 0 ? m_dimension : dim, bitmask);
return true;
}
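// Usage sketch (the bit values are illustrative):
//   som.SetFeatureMask(3, 0);                                 // use only the first 3 features for distances
//   som.SetFeatureMask(0, ((ulong)1 << 0) | ((ulong)1 << 2)); // or pick features 0 and 2 via a bitmask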
void CSOM::SetValidationSection(const int splitOffset = 0)
{
if(splitOffset < 0 || splitOffset >= m_nSet) return;
m_validationOffset = splitOffset;
};
void CSOM::Shuffle()
{
double temp[];
ArrayResize(temp, m_dimension);
string title;
for(int i = 0; i < m_nSet; i++)
{
int ind1 = (int)(1.0 * m_nSet * rand() / 32768);
int ind2 = (int)(1.0 * m_nSet * rand() / 32768);
if(ind1 == ind2) continue;
ArrayCopy(temp, m_set, 0, m_dimension * ind1, m_dimension);
ArrayCopy(m_set, m_set, m_dimension * ind1, m_dimension * ind2, m_dimension);
ArrayCopy(m_set, temp, m_dimension * ind2, 0, m_dimension);
title = m_set_titles[ind1];
m_set_titles[ind1] = m_set_titles[ind2];
m_set_titles[ind2] = title;
}
}
void CSOM::Reframe(const int xincrement, const int yincrement)
{
m_xcells += xincrement;
m_ycells += yincrement;
m_reframing = true;
}
double CSOM::TrainAndReframe(const int epochs, const bool bUseNormalization = true, const bool bShowProgress = false, const int maxReframes = 10, const int xincrement = 1, const int yincrement = 1)
{
double nmse = 0;
double nextnmse = DBL_MAX;
m_reframing = false;
for(int i = 0; i < maxReframes; i++)
{
nmse = nextnmse;
ResetNodes();
nextnmse = Train(epochs, bUseNormalization, bShowProgress);
if(nextnmse < nmse)
{
if(i < maxReframes - 1)
{
Reframe(xincrement, yincrement);
}
else
{
Print("Maximum reframe number reached ", maxReframes);
}
}
else
{
Print("Exit map size increments due to increased MSE");
break;
}
}
return nextnmse;
}
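// Train the map for the requested number of epochs. Both the neighbourhood
// radius and the learning rate decay exponentially with the epoch number.
// If a validation section is set, training continues past the requested epochs
// until the validation NMSE stops improving twice in a row.
// Returns the overall NMSE (map quantization error divided by the data MSE).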
double CSOM::Train(const int epochs, const bool bUseNormalization = true, const bool bShowProgress = false)
{
if(bShowProgress) Print("Training ", m_xcells, "*", m_ycells, " ", (m_hexCells ? "hex" : "sqr"), " net starts");
m_iterations = epochs;
int iter = 0; // epoch number
double data[];
ArrayResize(data, m_dimension);
// calculate initial learning radius
m_map_radius = MathMax(m_xcells, m_ycells) / 2.0;
m_time_constant = 1.0 * m_iterations / MathLog(m_map_radius + 1);
#ifdef SOM_VERBOSE
Print("m_time_constant=", m_time_constant);
#endif
InitNormalization(bUseNormalization);
if(bUseNormalization && !m_reframing/* && (m_validationOffset == 0)*/)
{
if(RemoveOutliers())
{
InitNormalization(bUseNormalization); // redo normalization if outliers were removed
}
}
CalculateDataMSE(false); // this is a constant denominator for NMSE
int trainingCount = m_validationOffset > 0 ? m_validationOffset : m_nSet;
if(trainingCount <= 0)
{
Print("No data - no training");
return 0;
}
int total_nodes = ArraySize(m_node);
double nmse = 0;
double nextnmse = 0;
if(m_validationOffset > 0)
{
if(!m_reframing) Shuffle();
nmse = CalculateStats(false);
}
static uint lastTick = GetTickCount();
int increaseCount = 0; // consecutive validation NMSE increases (used for early stopping)
do
{
double neighbourhood_radius = m_map_radius * MathExp(-1.0 * iter / m_time_constant);
double WS = neighbourhood_radius * neighbourhood_radius;
double learning_rate = m_learning_rate * MathExp(-1.0 * iter / m_iterations); // decrease learning rate exponentially
// one epoch means training on all patterns selected in random order
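// every node is pulled towards the selected pattern with the Gaussian weight
// exp(-d^2 / (2 * WS)), where d is the grid distance to the winning node and
// WS = neighbourhood_radius^2, so the neighbourhood shrinks every epoch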
for(int k = 0; k < trainingCount && !IsStopped(); k++)
{
int ind = (int)(1.0 * trainingCount * rand() / 32768); // choose a record from data set randomly
ArrayCopy(data, m_set, 0, m_dimension * ind, m_dimension);
int winningnode = GetBestMatchingIndex(data); // find a node closest to the record, data is normalized inplace inside
if(winningnode == -1)
{
Print("bad node ", iter, " ", k);
ArrayPrint(data);
}
bool odd = ((winningnode % m_ycells) % 2) == 1;
for(int i = 0; i < total_nodes; i++)
{
bool odd_i = ((i % m_ycells) % 2) == 1;
double shiftx = 0;
if(m_hexCells && odd != odd_i)
{
if(odd && !odd_i)
{
shiftx = +0.5;
}
else // vice versa (!odd && odd_i)
{
shiftx = -0.5;
}
}
// distance from the winner to i-th node
double DistToNodeSqr = (m_node[winningnode].GetX() - (m_node[i].GetX() + shiftx)) * (m_node[winningnode].GetX() - (m_node[i].GetX() + shiftx))
+ (m_node[winningnode].GetY() - m_node[i].GetY()) * (m_node[winningnode].GetY() - m_node[i].GetY());
// uncommenting the following check speeds up the calculation at the expense
// of greater granularity (artifacts) in the spatial distribution of features
// if(DistToNodeSqr < 9 * WS) // it was 1 * WS, which is inappropriate for a hexagonal grid
{
double influence = MathExp(-DistToNodeSqr / (2 * WS));
m_node[i].AdjustWeights(data, learning_rate, influence);
}
}
}
if(m_validationOffset > 0 && iter >= m_iterations)
{
nextnmse = CalculateStats(false);
if(nextnmse < nmse)
{
nmse = nextnmse;
increaseCount = 0;
}
else
{
increaseCount++;
if(increaseCount > 1)
{
Print("Exit by validation error at iteration ", iter, "; NMSE[old]=", nmse, ", NMSE[new]=", nextnmse, ", set=", (m_nSet - m_validationOffset));
break;
}
}
}
if(GetTickCount() - lastTick > 1000)
{
lastTick = GetTickCount();
string comment;
if(bShowProgress)
{
StringConcatenate(comment, "Pass ", iter, " from ", m_iterations, " ", (int)(iter * 100.0 / m_iterations), "%", (m_validationOffset > 0 && iter >= m_iterations ? " NMSE=" + (string)nextnmse : ""));
Print(comment);
Comment(comment);
#ifdef SOM_VERBOSE
Print("L=", (float)learning_rate, " R=", (float)neighbourhood_radius);
#endif
ProgressUpdate();
}
}
iter++;
}
while((iter < m_iterations || m_validationOffset > 0) && !IsStopped());
nmse = CalculateStats();
if(bShowProgress)
{
Print("Overall NMSE=", nmse);
string comment;
StringConcatenate(comment, (IsStopped() ? "Training cancelled" : ((m_validationOffset > 0 && iter >= m_iterations) ? "Training stopped by MSE" : "Training completed")), " at pass ", iter, ", NMSE=", nmse);
Comment(comment);
Print(comment);
}
// prepare for visualization
AnalizePatternStats();
return nmse;
}
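// Z-score normalization of a single pattern: every component is transformed
// as x' = (x - mean) / sigma; components with zero sigma (constant features)
// are left unchanged. Denormalize() below applies the inverse transform.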
void CSOM::Normalize(double &vector[]) const
{
for(int k = 0; k < m_dimension; k++)
{
if(m_sigma[k] == 0)
{
/*
static int component = -1;
if(component != k)
{
Print("Sigma is 0 for component ", k, " of ", m_dimension, ", mean=", m_mean[k]);
component = k;
}
*/
}
else
{
vector[k] = (vector[k] - m_mean[k]) / m_sigma[k];
}
}
}
void CSOM::Denormalize(double &vector[]) const
{
for(int k = 0; k < m_dimension; k++)
{
vector[k] = vector[k] * m_sigma[k] + m_mean[k];
}
}
int CSOM::GetBestMatchingIndex(double &vector[]) const
{
Normalize(vector); // vector is mutated due to (optional) normalization
return GetBestMatchingIndexNormalized(vector);
}
CSOMNode *CSOM::GetBestMatchingNode(const double &vector[]) const
{
double data[];
ArrayCopy(data, vector);
int index = GetBestMatchingIndex(data);
if(index > -1)
{
return &m_node[index];
}
return NULL;
};
bool CSOM::GetBestMatchingFeatures(const int node, double &result[]) const
{
if(node < 0 || node >= ArraySize(m_node)) return false;
m_node[node].GetCodeVector(result);
Denormalize(result);
return true;
}
CSOMNode *CSOM::GetBestMatchingFeatures(const double &vector[], double &result[]) const
{
CSOMNode *node = GetBestMatchingNode(vector);
if(node != NULL)
{
node.GetCodeVector(result);
Denormalize(result);
}
return node;
}
int CSOM::GetBestMatchingIndexNormalized(const double &vector[]) const
{
int min_ind = -1;
double min_dist = DBL_MAX;
int total_nodes = ArraySize(m_node);
for(int i = 0; i < total_nodes; i++)
{
double d = m_node[i].CalculateDistance(vector);
if(d < min_dist)
{
min_dist = d;
min_ind = i;
}
}
return min_ind;
}
double CSOM::AddPatternStats(const double &data[], const bool complete = true)
{
static double vector[];
ArrayCopy(vector, data);
int ind = GetBestMatchingIndex(vector);
// hits will allow us to calculate average (m) and sigma (s) for every cell in every plane/dimension
// from n training patterns mapped to the cell,
// then ShowPattern can display m, s, n in text marks
// NB. when a cell W is winning, all neighbouring cells N with averages W(N)
// laying inside W(m) +/- W(s) are candidates as well
if(complete) m_node[ind].RegisterPatternHit(vector);
double code[];
m_node[ind].GetCodeVector(code);
Denormalize(code);
double mse = 0;
int dimension = m_featureMask != 0 ? m_featureMaskSize : m_dimension;
for(int i = 0; i < m_dimension; i++)
{
if(m_featureMask == 0 || ((m_featureMask & ((ulong)1 << i)) != 0))
{
mse += (data[i] - code[i]) * (data[i] - code[i]);
}
}
mse /= dimension;
return mse;
}
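// Helper template: visits the immediate grid neighbours of a node
// (4 for square cells, 6 for hexagonal cells, with edge rows handled explicitly)
// and accumulates a result; specialized below for U-Matrix distances
// and for cluster propagation.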
template<typename T>
class Neighbourhood
{
protected:
int neighbours[];
int nbhsize;
bool hex;
int m_ycells;
public:
Neighbourhood(const bool _hex, const int ysize)
{
hex = _hex;
m_ycells = ysize;
if(hex)
{
nbhsize = NBH_HEXAGONAL_SIZE;
ArrayResize(neighbours, NBH_HEXAGONAL_SIZE);
neighbours[0] = -1; // up (visually)
neighbours[1] = +1; // down (visually)
neighbours[2] = -m_ycells; // left
neighbours[3] = +m_ycells; // right
/* 4 & 5, applied dynamically in the loop below
// odd row
neighbours[4] = -m_ycells - 1; // left-up
neighbours[5] = -m_ycells + 1; // left-down
// even row
neighbours[4] = +m_ycells - 1; // right-up
neighbours[5] = +m_ycells + 1; // right-down
*/
}
else
{
nbhsize = NBH_SQUARE_SIZE;
ArrayResize(neighbours, NBH_SQUARE_SIZE);
neighbours[0] = -1; // up (visually)
neighbours[1] = +1; // down (visually)
neighbours[2] = -m_ycells; // left
neighbours[3] = +m_ycells; // right
}
}
~Neighbourhood()
{
ArrayResize(neighbours, 0);
}
T loop(const int ind, const CSOMNode &p_node[])
{
int nodes = ArraySize(p_node);
int j = ind % m_ycells;
if(hex)
{
int oddy = ((j % 2) == 1) ? -1 : +1;
neighbours[4] = oddy * m_ycells - 1;
neighbours[5] = oddy * m_ycells + 1;
}
reset();
for(int k = 0; k < nbhsize; k++)
{
if(ind + neighbours[k] >= 0 && ind + neighbours[k] < nodes)
{
// skip wrapping edges
if(j == 0) // upper row
{
if(k == 0 || k == 4) continue;
}
else if(j == m_ycells - 1) // bottom row
{
if(k == 1 || k == 5) continue;
}
iterate(p_node[ind], p_node[ind + neighbours[k]]);
}
}
return getResult();
}
virtual void reset() = 0;
virtual void iterate(const CSOMNode &node1, const CSOMNode &node2) = 0;
virtual T getResult() const = 0;
};
class UMatrixNeighbourhood: public Neighbourhood<double>
{
private:
int n;
double d;
public:
UMatrixNeighbourhood(const bool _hex, const int ysize): Neighbourhood(_hex, ysize)
{
}
virtual void reset() override
{
n = 0;
d = 0.0;
}
virtual void iterate(const CSOMNode &node1, const CSOMNode &node2) override
{
d += node1.CalculateDistance(&node2);
n++;
}
virtual double getResult() const override
{
return d / n;
}
};
class ClusterNeighbourhood: public Neighbourhood<int>
{
private:
int cluster;
double ridge;
public:
ClusterNeighbourhood(const bool _hex, const int ysize): Neighbourhood(_hex, ysize)
{
}
virtual void reset() override
{
cluster = -1;
ridge = DBL_MAX;
}
virtual void iterate(const CSOMNode &node1, const CSOMNode &node2) override
{
int x = node2.GetCluster();
if(x > -1)
{
double y = node1.CalculateDistance(&node2);
if(cluster == -1 || ((x < cluster) && (y < ridge || GlobalVariableCheck("SOM_NO_RIDGE"))))
{
cluster = x;
ridge = y;
}
}
}
virtual int getResult() const override
{
return cluster;
}
};
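// Fill the U-Matrix plane: for every node, store the average distance between
// its code vector and the code vectors of its immediate neighbours.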
void CSOM::CalculateDistances()
{
if(!m_allocated) return;
UMatrixNeighbourhood umnh(m_hexCells, m_ycells);
for(int i = 0; i < m_xcells * m_ycells; i++)
{
double d = umnh.loop(i, m_node);
if(d > m_max[DIM_UMATRIX])
{
m_max[DIM_UMATRIX] = d;
}
m_node[i].SetDistance(d);
}
}
void CSOM::CalculateOutput(const double &vector[], const bool normalize = false)
{
if(!m_allocated) return;
double temp[];
ArrayCopy(temp, vector);
if(normalize) Normalize(temp);
m_min[DIM_OUTPUT] = DBL_MAX;
m_max[DIM_OUTPUT] = -DBL_MAX;
for(int i = 0; i < ArraySize(m_node); i++)
{
double x = m_node[i].CalculateOutput(temp);
if(x < m_min[DIM_OUTPUT]) m_min[DIM_OUTPUT] = x;
if(x > m_max[DIM_OUTPUT]) m_max[DIM_OUTPUT] = x;
}
}
int CSOM::GetClusterCount() const
{
return ArraySize(m_clusters) / m_dimension;
}
void CSOM::SetLabel(const int cluster, const string label)
{
int nclusters = ArraySize(m_clusters) / m_dimension;
if(ArraySize(m_labels) != nclusters) ArrayResize(m_labels, nclusters);
if(cluster < nclusters)
{
m_labels[cluster] = label;
}
}
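// Automatic clustering (no preset cluster count): nodes are sorted by the
// quality measure distance * sqrt(MSE) in ascending order, and each node either
// joins a cluster already present among its neighbours or starts a new one.
// Returns the number of clusters found.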
int CSOM::Clusterize()
{
double array[][2];
int n = m_xcells * m_ycells;
ArrayResize(array, n);
for(int i = 0; i < n; i++)
{
if(m_node[i].GetHitsCount() > 0)
{
array[i][0] = m_node[i].GetDistance() * MathSqrt(m_node[i].GetMSE());
}
else
{
array[i][0] = DBL_MAX;
}
array[i][1] = i;
m_node[i].SetCluster(-1);
}
ArraySort(array);
ClusterNeighbourhood clnh(m_hexCells, m_ycells);
int count = 0; // number of clusters
ArrayResize(m_clusters, 0);
for(int i = 0; i < n; i++)
{
// skip if already assigned
if(m_node[(int)array[i][1]].GetCluster() > -1) continue;
// check if the current node is adjacent to any existing cluster
int r = clnh.loop((int)array[i][1], m_node);
if(r > -1) // a neighbour belongs to a cluster already
{
m_node[(int)array[i][1]].SetCluster(r);
}
else // we need a new cluster
{
ArrayResize(m_clusters, (count + 1) * m_dimension);
double vector[];
m_node[(int)array[i][1]].GetCodeVector(vector);
ArrayCopy(m_clusters, vector, count * m_dimension, 0, m_dimension);
m_node[(int)array[i][1]].SetCluster(count++);
}
}
return count;
}
void CSOM::GetCluster(const int clusterNumber, double &center[])
{
ArrayCopy(center, m_clusters, 0, clusterNumber * m_dimension, m_dimension);
Denormalize(center);
}
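// Clustering into a fixed number of clusters: K-Means (ALGLIB CKMeans) applied
// to the nodes' code vectors, with KMEANS_RETRY_NUMBER random restarts.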
void CSOM::Clusterize(const int clusterNumber)
{
int count = m_xcells * m_ycells;
CMatrixDouble xy(count, m_dimension);
int info;
CMatrixDouble clusters;
int membership[];
double weights[];
for(int i = 0; i < count; i++)
{
m_node[i].GetCodeVector(weights);
xy[i] = weights;
}
CKMeans::KMeansGenerate(xy, count, m_dimension, clusterNumber, KMEANS_RETRY_NUMBER, info, clusters, membership);
Print("KMeans result: ", info);
if(info == 1) // ok
{
for(int i = 0; i < m_xcells * m_ycells; i++)
{
m_node[i].SetCluster(membership[i]);
}
#ifdef SOM_VERBOSE
Print("Clusters:");
#endif
ArrayResize(m_clusters, clusterNumber * m_dimension);
for(int j = 0; j < clusterNumber; j++)
{
for(int i = 0; i < m_dimension; i++)
{
m_clusters[j * m_dimension + i] = clusters[i][j];
}
#ifdef SOM_VERBOSE
ArrayPrint(m_clusters, _Digits, ",", j * m_dimension, m_dimension);
#endif
}
}
}
void CSOM::AnalizePatternStats()
{
for(int i = 0; i < EXTRA_DIMENSIONS; i++)
{
m_min[m_dimension + i] = 0;
m_max[m_dimension + i] = 0;
}
for(int i = 0; i < m_xcells * m_ycells; i++)
{
int n = m_node[i].GetHitsCount();
if(n > m_max[DIM_HITCOUNT])
{
m_max[DIM_HITCOUNT] = n;
}
double u = m_node[i].GetMSE();
if(n > 0 && u > 0)
{
if(u > m_max[DIM_NODEMSE])
{
m_max[DIM_NODEMSE] = (double)u;
}
if(u < m_min[DIM_NODEMSE] || m_min[DIM_NODEMSE] == 0)
{
m_min[DIM_NODEMSE] = (double)u;
}
}
u = m_node[i].GetOutput();
if(u > m_max[DIM_OUTPUT])
{
m_max[DIM_OUTPUT] = u;
}
if(u < m_min[DIM_OUTPUT] || m_min[DIM_OUTPUT] == 0)
{
m_min[DIM_OUTPUT] = (double)u;
}
}
}
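// Baseline error of the data set: per-pattern mean squared deviation from the
// per-feature means (respecting the feature mask), summed over the patterns
// (the full set or the validation part, depending on the flag).
// Used as the denominator of NMSE.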
void CSOM::CalculateDataMSE(const bool complete = true)
{
double data[];
int dimension = m_featureMask != 0 ? m_featureMaskSize : m_dimension;
m_dataMSE = 0.0;
for(int i = complete ? 0 : m_validationOffset; i < m_nSet; i++)
{
ArrayCopy(data, m_set, 0, m_dimension * i, m_dimension);
double mse = 0;
for(int k = 0; k < m_dimension; k++)
{
if(m_featureMask == 0 || ((m_featureMask & ((ulong)1 << k)) != 0))
{
mse += (data[k] - m_mean[k]) * (data[k] - m_mean[k]);
}
}
mse /= dimension;
m_dataMSE += mse;
}
}
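// Quantization error of the map: the per-pattern MSE against the best matching
// node, summed over the data set (or over the validation part only, when
// complete is false) and divided by the data MSE, i.e. NMSE.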
double CSOM::CalculateStats(const bool complete = true)
{
double data[];
ArrayResize(data, m_dimension);
double trainedMSE = 0.0;
for(int i = complete ? 0 : m_validationOffset; i < m_nSet; i++)
{
ArrayCopy(data, m_set, 0, m_dimension * i, m_dimension);
trainedMSE += AddPatternStats(data, complete);
}
if(complete && (m_validationOffset > 0)) CalculateDataMSE(); // update m_dataMSE
const double nmse = trainedMSE / m_dataMSE;
return nmse;
}