2022-03-20 10:09:33 +01:00
from pathlib import Path
from datetime import datetime , timezone , timedelta
import click
2022-07-24 11:06:33 +02:00
from tqdm import tqdm
2022-03-20 10:09:33 +01:00
import numpy as np
import pandas as pd
from service . App import *
from common . classifiers import *
from common . feature_generation import *
2022-04-17 11:34:34 +02:00
from common . model_store import *
"""
Train models for all target labels and all algorithms declared in the configuration using the specified features.
"""
#
# Parameters
#
class P:
    """Script-level parameters of the training script (mostly debugging switches)."""

    # Maximum number of rows read from the input CSV (passed as `nrows` to
    # `pd.read_csv` in `main`). For debugging.
    in_nrows = 100_000_000

    # How many last rows to select (for debugging). 0 keeps all loaded rows.
    tail_rows = 0

    # Whether to store file with predictions
    store_predictions = True
@click.command()
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
def main(config_file):
    """Train a model for every (label, algorithm) pair and store the results.

    The feature matrix CSV of the configured symbol is loaded, every algorithm
    listed in the configuration is trained on every label column, and then:

    * the trained models are saved to the configured model folder,
    * the scores are appended to ``prediction-metrics.txt`` in the data folder,
    * if ``P.store_predictions`` is set, the predictions made on the training
      rows are written to a CSV file together with the source columns and labels.

    Problems (missing input file, no rows left after dropping NULL features,
    unknown algorithm type) are reported with ``print`` and the function
    returns early without storing anything.

    Args:
        config_file: Configuration file name handed to ``load_config``; the
            settings are subsequently read from ``App.config``.
    """
    load_config(config_file)

    time_column = App.config["time_column"]

    start_time = datetime.now()

    #
    # Load feature matrix
    #
    symbol = App.config["symbol"]
    data_path = Path(App.config["data_folder"]) / symbol

    file_path = (data_path / App.config.get("matrix_file_name")).with_suffix(".csv")
    if not file_path.is_file():
        print(f"ERROR: Input file does not exist: {file_path}")
        return

    print(f"Loading data from source data file {file_path}...")
    df = pd.read_csv(file_path, parse_dates=[time_column], nrows=P.in_nrows)
    print(f"Finished loading {len(df)} records with {len(df.columns)} columns.")

    # Optionally keep only the last rows (debugging). A value of 0 keeps everything.
    if P.tail_rows:
        df = df.tail(P.tail_rows)
    df = df.reset_index(drop=True)

    #
    # Prepare data by selecting columns and rows
    #

    # Labels are generated from future data and hence we might want to explicitly remove some tail rows
    label_horizon = App.config["label_horizon"]
    train_length = App.config.get("train_length")  # None means "no limit"
    train_features = App.config.get("train_features")
    labels = App.config["labels"]
    algorithms = App.config.get("algorithms")

    # Select necessary features and label
    out_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time']
    out_columns = [x for x in out_columns if x in df.columns]
    all_features = train_features + labels
    df = df[out_columns + [x for x in all_features if x not in out_columns]]

    # "category" NN does not work without this (note that we assume a classification task here)
    df = df.astype({label: int for label in labels})

    # Spot and futures have different available histories. If we drop nans in all of them,
    # then we get a very short data frame (corresponding to futures which have little data).
    # So we do not drop data here but rather when we select necessary input features.
    # Nans result in constant accuracy and nan loss, so infinite values are converted to NaN
    # explicitly and removed together with the other NULLs below. (The global pandas option
    # 'use_inf_as_na' that used to do this implicitly is deprecated.)
    df = df.replace([np.inf, -np.inf], np.nan)

    df = df.reset_index(drop=True)  # Ensure a gap-free positional index

    # Remove the tail data for which no labels are available
    # The reason is that these labels are computed from future which is not available
    if label_horizon:
        df = df.head(-label_horizon)

    # Limit maximum length (only if a limit is configured)
    train_df = df.tail(train_length) if train_length is not None else df
    train_df = train_df.dropna(subset=train_features)

    if len(train_df) == 0:
        print(f"ERROR: Empty data set after removing NULLs in feature columns. Some features might have all NULL values.")
        return

    # Algorithm type -> (train function, predict function)
    trainers = {
        "gb": (train_gb, predict_gb),
        "nn": (train_nn, predict_nn),
        "lc": (train_lc, predict_lc),
        "svc": (train_svc, predict_svc),
    }

    models = dict()
    scores = dict()

    # Collect predictions. The frame is created with the index of the training rows so that
    # every prediction column is aligned to it. An initially empty frame would adopt the index
    # of the first assigned Series and silently truncate longer predictions of later algorithms.
    # NOTE(review): assumes predict_* return a Series indexed like df_X - TODO confirm
    out_df = pd.DataFrame(index=train_df.index)

    for label in tqdm(labels, desc="LABELS", colour='red', position=0):

        for algo_name in tqdm(algorithms, desc="ALGORITHMS", colour='red', leave=False, position=1):
            model_config = get_model(algo_name)  # Get algorithm description from the algo store
            algo_type = model_config.get("algo")
            algo_train_length = model_config.get("train", {}).get("length")

            score_column_name = label + label_algo_separator + algo_name

            # Limit length according to the algorithm parameters
            if algo_train_length and (train_length is None or algo_train_length < train_length):
                algo_train_df = train_df.iloc[-algo_train_length:]
            else:
                algo_train_df = train_df

            df_X = algo_train_df[train_features]
            df_y = algo_train_df[label]

            print(f"Train '{score_column_name}'. Train length {len(df_X)}. Train columns {len(df_X.columns)}. Algorithm {algo_name}")

            if algo_type not in trainers:
                print(f"ERROR: Unknown algorithm type {algo_type}. Check algorithm list.")
                return
            train_fn, predict_fn = trainers[algo_type]

            model_pair = train_fn(df_X, df_y, model_config)
            models[score_column_name] = model_pair
            # Predictions are computed on the same rows the model was trained on
            df_y_hat = predict_fn(model_pair, df_X, model_config)

            scores[score_column_name] = compute_scores(df_y, df_y_hat)
            out_df[score_column_name] = df_y_hat

    #
    # Store all collected models in files
    #
    model_path = Path(App.config["model_folder"])
    if not model_path.is_absolute():
        model_path = data_path / model_path
    model_path = model_path.resolve()

    model_path.mkdir(parents=True, exist_ok=True)  # Ensure that folder exists

    for score_column_name, model_pair in models.items():
        save_model_pair(model_path, score_column_name, model_pair)

    print(f"Models stored in path: {model_path.absolute()}")

    #
    # Store scores
    #
    lines = [f"{score_column_name}, {score}" for score_column_name, score in scores.items()]

    metrics_file_name = "prediction-metrics.txt"
    metrics_path = (data_path / metrics_file_name).resolve()

    # Append, so that the metrics of previous runs are preserved
    with open(metrics_path, 'a+', encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n\n")

    print(f"Metrics stored in path: {metrics_path.absolute()}")

    #
    # Store predictions if necessary
    #
    if P.store_predictions:
        # Store only selected original data, labels, and their predictions
        out_df = out_df.join(df[out_columns + labels])

        # Resolve the suffix once so that the logged path is the file really written
        out_path = (data_path / App.config.get("predict_file_name")).with_suffix(".csv")

        print(f"Storing output file...")
        out_df.to_csv(out_path, index=False, float_format='%.4f')
        print(f"Predictions stored in file: {out_path}. Length: {len(out_df)}. Columns: {len(out_df.columns)}")

    #
    # End
    #
    elapsed = datetime.now() - start_time
    print(f"Finished training models in {str(elapsed).split('.')[0]}")
# Run the training command only when the file is executed as a script
if __name__ == '__main__':
    main()