import os
import sys
import json
from datetime import datetime, timezone, timedelta
from typing import Union

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from service.App import App
from common.utils import *
from common.classifiers import *
"""
2021-01-02 21:25:55 +01:00
Find good hyper - parameters of algorithms applied to generated featues .
2020-09-13 18:22:25 +02:00
"""
data_path = r " C: \ DATA2 \ BITCOIN \ GENERATED "
data_file = r " BTCUSDT-1m-features.csv "
2020-12-14 13:10:21 +01:00
labels = App . config [ " labels " ]
features_kline = App . config [ " features_kline " ]
features_futur = App . config [ " features_futur " ]
2020-11-19 19:05:11 +01:00
2020-12-14 20:42:14 +01:00
# features_horizon = 720 # Features are generated using this past window length (max feature window)
labels_horizon = 180 # Labels are generated using this number of steps ahead (max label window)
2020-11-21 14:16:28 +01:00
#
# Parameters of rolling predict
#
nrows = 10_000_000 # For debug
# Columns
2021-01-02 21:25:55 +01:00
train_features = features_kline # features_futur features_kline
2020-11-22 11:16:39 +01:00
predict_label = " high_15 "
2020-11-21 14:16:28 +01:00
# Rows
2021-01-02 21:25:55 +01:00
prediction_start_str = " 2020-09-01 00:00:00 " # Use it when rolling prediction will work (2020-02-01 00:00:00 - for futur)
2020-11-21 14:16:28 +01:00
#prediction_start_str = "2020-06-01 00:00:00"
2021-01-02 21:25:55 +01:00
train_length = int ( 1.5 * 525_600 ) # 1.5 * 525_600 for long/spot, 4 * 43_800 for short/futur
2020-11-22 11:16:39 +01:00
stride = 4 * 7 * 1440 # Length of one rolling prediction step: mid: 1 month 43_800=4*7*1440, long: 1,5 months 6*7*1440
steps = 2 # How many rolling prediction steps. ~40 weeks in [1.2-1.11]
2020-11-21 14:16:28 +01:00
2021-01-02 21:25:55 +01:00
algorithm = " nn " # gb nn lc
2020-11-21 14:16:28 +01:00
#
# Parameters for algorithms
#
# Hyper-parameter grid for the gradient-boosting algorithm ("gb")
params_grid_gb = {  # First parameter is the slowest
    # binary (logloss - logistic regression) cross_entropy cross_entropy_lambda
    "objective": ["cross_entropy"],  # "cross_entropy", "cross_entropy_lambda", "binary"
    "max_depth": [1],
    "learning_rate": [0.01],
    "num_boost_round": [1_500],
    "lambda_l1": [1.0],  # (reg_alpha) 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100] 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05], 1.5
    "lambda_l2": [1.0],  # (reg_lambda), [0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
}
def params_to_line_gb(params):
    """Flatten one gradient-boosting param dict into a fixed-order list for CSV output.

    Missing keys yield None (dict.get default) rather than raising.
    """
    line = [
        params.get("objective"),
        params.get("max_depth"),
        params.get("learning_rate"),
        params.get("num_boost_round"),
        params.get("lambda_l1"),
        params.get("lambda_l2"),
    ]
    return line

# TODO: Implement and run GB with scaling (by default it uses NO scaling)
# 1-0.01-1500
# Futur
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.625, 0.167, 0.602, 0.097
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.641, 0.121, 0.609, 0.067
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.652, 0.066, 0.524, 0.035
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.632, 0.131, 0.684, 0.073
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.649, 0.075, 0.634, 0.040
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.653, 0.074, 0.661, 0.039
# klines
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.680, 0.171, 0.695, 0.098
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.720, 0.075, 0.622, 0.040
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.745, 0.001, 0.577, 0.001
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.688, 0.200, 0.642, 0.118
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.718, 0.072, 0.620, 0.038
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.734, 0.011, 0.506, 0.006
# Hyper-parameter grid for the neural-network algorithm ("nn")
params_grid_nn = {  # First parameter is the slowest
    "layers": [[33]],  # Number of layers depends on the number of input features
    "learning_rate": [0.001],
    "n_epochs": [20],
    "bs": [64],
}
def params_to_line_nn(params):
    """Flatten one neural-network param dict into a fixed-order list for CSV output.

    Missing keys yield None (dict.get default) rather than raising.
    """
    line = [
        params.get("layers"),
        params.get("learning_rate"),
        params.get("n_epochs"),
        params.get("bs"),
    ]
    return line

# TODO: Implement and run NN without scaling (by default it uses scaling)
# Futur
#high_10, [33], 0.001, 20, 64, 0.626, 0.171, 0.606, 0.099
#high_15, [33], 0.001, 20, 64, 0.623, 0.118, 0.621, 0.065
#high_20, [33], 0.001, 20, 64, 0.646, 0.061, 0.507, 0.033
#low_10, [33], 0.001, 20, 64, 0.634, 0.179, 0.625, 0.104
#low_15, [33], 0.001, 20, 64, 0.623, 0.106, 0.665, 0.058
#low_20, [33], 0.001, 20, 64, 0.632, 0.058, 0.588, 0.031
# Spot/klines
#high_10, [46], 0.001, 20, 64, 0.675, 0.269, 0.653, 0.169
#high_15, [46], 0.001, 20, 64, 0.707, 0.207, 0.674, 0.122
#high_20, [46], 0.001, 20, 64, 0.717, 0.123, 0.610, 0.068
#low_10, [46], 0.001, 20, 64, 0.696, 0.283, 0.646, 0.181
#low_15, [46], 0.001, 20, 64, 0.713, 0.190, 0.628, 0.112
#low_20, [46], 0.001, 20, 64, 0.729, 0.150, 0.617, 0.085
# Hyper-parameter grid for the linear classifier ("lc")
# Best liblinear: is_scale=False, balance=False, penalty=l2, max_iter=100
params_grid_lc = {  # First parameter is the slowest
    # Best is False, but True can give almost same result (under different conditions)
    "is_scale": [False],
    "penalty": ["l2"],  # "l2" "l1" "elasticnet" "none"
    "C": [1.0],  # small values stronger regularization
    # 1) None balance is always better
    "class_weight": [None],  # "balanced"
    # 1) liblinear - fast convergence (100 is enough), lbfgs (l2 or none) - good convergence, "newton-cg" "sag" "saga"
    "solver": ["liblinear"],
    "max_iter": [200],
}
def params_to_line_lc(params):
    """Flatten one linear-classifier param dict into a fixed-order list for CSV output.

    Missing keys yield None (dict.get default) rather than raising.
    """
    line = [
        params.get("is_scale"),
        params.get("penalty"),
        params.get("C"),
        params.get("class_weight"),
        params.get("solver"),
        params.get("max_iter"),
    ]
    return line

# Results for futur:
#high_10, False, l2, 1.0, None, liblinear, 100, 0.458, 0.032, 0.553, 0.016
#high_15, False, l2, 1.0, None, liblinear, 100, 0.470, 0.029, 0.652, 0.015
#high_20, False, l2, 1.0, None, liblinear, 100, 0.472, 0.030, 0.623, 0.015
#low_10, False, l2, 1.0, None, liblinear, 100, 0.471, 0.017, 0.367, 0.009
#low_15, False, l2, 1.0, None, liblinear, 100, 0.468, 0.005, 0.399, 0.002
#low_20, False, l2, 1.0, None, liblinear, 100, 0.473, 0.010, 0.572, 0.005
# Results for klines (spot):
#high_10, False, l2, 1.0, None, liblinear, 200, 0.551, 0.050, 0.522, 0.026
#high_15, False, l2, 1.0, None, liblinear, 200, 0.558, 0.023, 0.484, 0.012
#high_20, False, l2, 1.0, None, liblinear, 200, 0.564, 0.017, 0.440, 0.009
#low_10, False, l2, 1.0, None, liblinear, 200, 0.565, 0.044, 0.538, 0.023
#low_15, False, l2, 1.0, None, liblinear, 200, 0.580, 0.015, 0.556, 0.008
#low_20, False, l2, 1.0, None, liblinear, 200, 0.584, 0.019, 0.784, 0.010


#
# Grid search
#
def main():
    """Run a rolling grid search for the configured algorithm.

    For each hyper-parameter combination, repeatedly train on a sliding window
    and predict the next `stride` rows, collect the predictions over all steps,
    score them, and append one CSV line per combination to metrics.txt.
    """
    #
    # Load and prepare all data
    #
    df_all = pd.read_csv(data_path + "\\" + data_file, parse_dates=['timestamp'], date_format="ISO8601", nrows=nrows)
    print(f"Feature matrix loaded. Length: {len(df_all)}. Width: {len(df_all.columns)}")

    for label in labels:
        df_all[label] = df_all[label].astype(int)  # "category" NN does not work without this

    # Select necessary features and label
    df_all = df_all[["timestamp"] + features_kline + features_futur + labels]

    # Spot and futures have different available histories. If we drop nans in all of them, then we get a very short data frame (corresponding to futures which have little data)
    # So we do not drop data here but rather when we select necessary input features
    # Nans result in constant accuracy and nan loss. MissingValues procedure does not work and produces exceptions
    # NOTE(review): this option is deprecated in pandas >= 2.1 — confirm target pandas version
    pd.set_option('use_inf_as_na', True)
    #df_all = df_all.dropna(subset=labels)
    df_all = df_all.reset_index(drop=True)  # We must reset index after removing rows to remove gaps

    prediction_start = find_index(df_all, prediction_start_str)
    print(f"Start index: {prediction_start}")

    if len(df_all) - prediction_start < steps * stride:
        raise ValueError(f"Number of steps {steps} is too high (not enough data after start). Data available for prediction: {len(df_all) - prediction_start}. Data to be predicted: {steps * stride}")

    del df_all["timestamp"]

    #
    # Prepare params and train by getting precision
    #
    if algorithm == "gb":
        params_grid = params_grid_gb
    elif algorithm == "nn":
        params_grid = params_grid_nn
    elif algorithm == "lc":
        params_grid = params_grid_lc
    else:
        raise ValueError(f"Unknown algorithm value {algorithm}.")

    metrics = []  # One record with metrics for one param object

    grid = ParameterGrid(params_grid)
    params_list = list(grid)  # List of hyper-param dicts
    for i, params in enumerate(params_list):
        print("\n{}/{} rolling train start...".format(i + 1, len(params_list)))

        # Here we will collect true and predicted values for one label
        # These series must have the same indexes and these indexes should correspond to main input indexes even if some rows are dropped (for them we store Null)
        y_true = pd.Series(dtype=float)
        y_predicted = pd.Series(dtype=float)

        for step in range(steps):
            print(f"\nStart step {step}/{steps}")

            # Predict data
            predict_start = prediction_start + (step * stride)
            predict_end = predict_start + stride

            df_test = df_all.iloc[predict_start:predict_end]
            #df_test = df_test.dropna(subset=train_features)  # Nans will be dropped by the algorithms themselves
            df_X_test = df_test[train_features]
            df_y_test = df_test[predict_label]

            # Train data
            # We exclude recent objects from training, because they do not have labels yet - the labels are in future
            # In real (stream) data, we will have null labels for recent objects. During simulation, labels are available and hence we need to ignore/exclude them manually
            train_end = predict_start - labels_horizon - 1
            train_start = train_end - train_length
            train_start = 0 if train_start < 0 else train_start

            df_train = df_all.iloc[int(train_start):int(train_end)]
            df_train = df_train.dropna(subset=train_features)
            df_X = df_train[train_features]
            df_y = df_train[predict_label]

            print(f"Train range: [{train_start}, {train_end}]={train_end - train_start}. Prediction range: [{predict_start}, {predict_end}]={predict_end - predict_start}.")

            # ---
            if algorithm == "gb":
                y_test_hat = train_predict_gb(df_X, df_y, df_X_test, params)
            elif algorithm == "nn":
                y_test_hat = train_predict_nn(df_X, df_y, df_X_test, params)
            elif algorithm == "lc":
                y_test_hat = train_predict_lc(df_X, df_y, df_X_test, params)
            # ---

            # Append true and predicted array (Series.append was removed in pandas 2.0)
            y_true = pd.concat([y_true, df_y_test])
            y_predicted = pd.concat([y_predicted, y_test_hat])

            print(f"End step {step}/{steps}.")

        print("")
        print("Finished {} steps of train with {} true and {} predicted results.".format(steps, len(y_true), len(y_predicted)))

        # y_true and y_predicted might have nans which can confuse some scoring functions
        df_scores = pd.DataFrame({"y_true": y_true, "y_predicted": y_predicted})
        num_scores = len(df_scores)
        df_scores = df_scores.dropna()
        print(f"Total number of collected predictions: {num_scores}. After dropping NaNs: {len(df_scores)}")
        y_true = df_scores["y_true"]
        y_predicted = df_scores["y_predicted"]

        score = compute_scores(y_true, y_predicted)
        metrics.append(score)

    #
    # Process all collected results and save
    #
    lines = []
    for i, params in enumerate(params_list):
        line = [predict_label]
        # Add parameters
        if algorithm == "gb":
            line += params_to_line_gb(params)
        elif algorithm == "nn":
            line += params_to_line_nn(params)
        elif algorithm == "lc":
            line += params_to_line_lc(params)
        # Add scores
        rec = metrics[i]
        score_str = [
            "{:.3f}".format(rec["auc"]),
            "{:.3f}".format(rec["f1"]),
            "{:.3f}".format(rec["precision"]),
            "{:.3f}".format(rec["recall"]),
        ]
        line += score_str

        lines.append(", ".join([str(x) for x in line]))

    with open('metrics.txt', 'a+') as f:
        f.write("\n".join(lines) + "\n")
if __name__ == '__main__':
    main()