import os
import sys
import json
from datetime import datetime, timezone, timedelta
from typing import Union

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from service.App import App
from common.utils import *
from common.classifiers import *
"""
2021-01-02 21:25:55 +01:00
Find good hyper - parameters of algorithms applied to generated featues .
2020-09-13 18:22:25 +02:00
"""
data_path = r " C: \ DATA2 \ BITCOIN \ GENERATED "
data_file = r " BTCUSDT-1m-features.csv "
2020-12-14 13:10:21 +01:00
labels = App . config [ " labels " ]
features_kline = App . config [ " features_kline " ]
features_futur = App . config [ " features_futur " ]
2020-11-19 19:05:11 +01:00
2020-12-14 20:42:14 +01:00
# features_horizon = 720 # Features are generated using this past window length (max feature window)
labels_horizon = 180 # Labels are generated using this number of steps ahead (max label window)
2020-11-21 14:16:28 +01:00
#
# Parameters of rolling predict
#
nrows = 10_000_000 # For debug
# Columns
2021-01-02 21:25:55 +01:00
train_features = features_kline # features_futur features_kline
2020-11-22 11:16:39 +01:00
predict_label = " high_15 "
2020-11-21 14:16:28 +01:00
# Rows
2021-01-02 21:25:55 +01:00
prediction_start_str = " 2020-09-01 00:00:00 " # Use it when rolling prediction will work (2020-02-01 00:00:00 - for futur)
2020-11-21 14:16:28 +01:00
#prediction_start_str = "2020-06-01 00:00:00"
2021-01-02 21:25:55 +01:00
train_length = int ( 1.5 * 525_600 ) # 1.5 * 525_600 for long/spot, 4 * 43_800 for short/futur
2020-11-22 11:16:39 +01:00
stride = 4 * 7 * 1440 # Length of one rolling prediction step: mid: 1 month 43_800=4*7*1440, long: 1,5 months 6*7*1440
steps = 2 # How many rolling prediction steps. ~40 weeks in [1.2-1.11]
2020-11-21 14:16:28 +01:00
2021-01-02 21:25:55 +01:00
algorithm = " nn " # gb nn lc
2020-11-21 14:16:28 +01:00
#
# Parameters for algorithms
#
# Hyper-parameter grid for the gradient-boosting algorithm ("gb")
params_grid_gb = {  # First parameter is the slowest
    # binary (logloss - logistic regression) cross_entropy cross_entropy_lambda
    "objective": ["cross_entropy"],  # "cross_entropy", "cross_entropy_lambda", "binary"
    "max_depth": [1],
    "learning_rate": [0.01],
    "num_boost_round": [1_500],
    "lambda_l1": [1.0],  # (reg_alpha) 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100] 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05], 1.5
    "lambda_l2": [1.0],  # (reg_lambda), [0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
}
def params_to_line_gb(params):
    """Flatten one gradient-boosting param dict into a fixed-order list for CSV output.

    Missing keys yield None (dict.get default) rather than raising.
    """
    line = [
        params.get("objective"),
        params.get("max_depth"),
        params.get("learning_rate"),
        params.get("num_boost_round"),
        params.get("lambda_l1"),
        params.get("lambda_l2"),
    ]
    return line

# TODO: Implement and run GB with scaling (by default it uses NO scaling)
# 1-0.01-1500
# Futur
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.625, 0.167, 0.602, 0.097
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.641, 0.121, 0.609, 0.067
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.652, 0.066, 0.524, 0.035
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.632, 0.131, 0.684, 0.073
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.649, 0.075, 0.634, 0.040
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.653, 0.074, 0.661, 0.039
# klines
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.680, 0.171, 0.695, 0.098
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.720, 0.075, 0.622, 0.040
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.745, 0.001, 0.577, 0.001
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.688, 0.200, 0.642, 0.118
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.718, 0.072, 0.620, 0.038
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.734, 0.011, 0.506, 0.006
# Hyper-parameter grid for the neural-network algorithm ("nn")
params_grid_nn = {  # First parameter is the slowest
    "layers": [[33]],  # Number of layers depends on the number of input features
    "learning_rate": [0.001],
    "n_epochs": [20],
    "bs": [64],
}
def params_to_line_nn(params):
    """Flatten one neural-network param dict into a fixed-order list for CSV output.

    Missing keys yield None (dict.get default) rather than raising.
    """
    line = [
        params.get("layers"),
        params.get("learning_rate"),
        params.get("n_epochs"),
        params.get("bs"),
    ]
    return line

# TODO: Implement and run NN without scaling (by default it uses scaling)
# Futur
#high_10, [33], 0.001, 20, 64, 0.626, 0.171, 0.606, 0.099
#high_15, [33], 0.001, 20, 64, 0.623, 0.118, 0.621, 0.065
#high_20, [33], 0.001, 20, 64, 0.646, 0.061, 0.507, 0.033
#low_10, [33], 0.001, 20, 64, 0.634, 0.179, 0.625, 0.104
#low_15, [33], 0.001, 20, 64, 0.623, 0.106, 0.665, 0.058
#low_20, [33], 0.001, 20, 64, 0.632, 0.058, 0.588, 0.031
# Spot/klines
#high_10, [46], 0.001, 20, 64, 0.675, 0.269, 0.653, 0.169
#high_15, [46], 0.001, 20, 64, 0.707, 0.207, 0.674, 0.122
#high_20, [46], 0.001, 20, 64, 0.717, 0.123, 0.610, 0.068
#low_10, [46], 0.001, 20, 64, 0.696, 0.283, 0.646, 0.181
#low_15, [46], 0.001, 20, 64, 0.713, 0.190, 0.628, 0.112
#low_20, [46], 0.001, 20, 64, 0.729, 0.150, 0.617, 0.085
# Hyper-parameter grid for the linear classifier ("lc")
# Best liblinear: is_scale=False, balance=False, penalty=l2, max_iter=100
params_grid_lc = {  # First parameter is the slowest
    # Best is False, but True can give almost same result (under different conditions)
    "is_scale": [False],
    "penalty": ["l2"],  # "l2" "l1" "elasticnet" "none"
    "C": [1.0],  # small values stronger regularization
    # 1) None balance is always better
    "class_weight": [None],  # "balanced"
    # 1) liblinear - fast convergence (100 is enough), lbfgs (l2 or none) - good convergence, "newton-cg" "sag" "saga"
    "solver": ["liblinear"],
    "max_iter": [200],
}
def params_to_line_lc(params):
    """Flatten one linear-classifier param dict into a fixed-order list for CSV output.

    Missing keys yield None (dict.get default) rather than raising.
    """
    line = [
        params.get("is_scale"),
        params.get("penalty"),
        params.get("C"),
        params.get("class_weight"),
        params.get("solver"),
        params.get("max_iter"),
    ]
    return line

# Results for futur:
#high_10, False, l2, 1.0, None, liblinear, 100, 0.458, 0.032, 0.553, 0.016
#high_15, False, l2, 1.0, None, liblinear, 100, 0.470, 0.029, 0.652, 0.015
#high_20, False, l2, 1.0, None, liblinear, 100, 0.472, 0.030, 0.623, 0.015
#low_10, False, l2, 1.0, None, liblinear, 100, 0.471, 0.017, 0.367, 0.009
#low_15, False, l2, 1.0, None, liblinear, 100, 0.468, 0.005, 0.399, 0.002
#low_20, False, l2, 1.0, None, liblinear, 100, 0.473, 0.010, 0.572, 0.005
# Results for klines (spot):
#high_10, False, l2, 1.0, None, liblinear, 200, 0.551, 0.050, 0.522, 0.026
#high_15, False, l2, 1.0, None, liblinear, 200, 0.558, 0.023, 0.484, 0.012
#high_20, False, l2, 1.0, None, liblinear, 200, 0.564, 0.017, 0.440, 0.009
#low_10, False, l2, 1.0, None, liblinear, 200, 0.565, 0.044, 0.538, 0.023
#low_15, False, l2, 1.0, None, liblinear, 200, 0.580, 0.015, 0.556, 0.008
#low_20, False, l2, 1.0, None, liblinear, 200, 0.584, 0.019, 0.784, 0.010


#
# Grid search
#
def main():
    """Run a rolling grid search for the configured algorithm.

    For each hyper-parameter combination, repeatedly train on a sliding window
    and predict the next `stride` rows, collect the predictions over all steps,
    score them, and append one CSV line per combination to metrics.txt.
    """
    #
    # Load and prepare all data
    #
    df_all = pd.read_csv(data_path + "\\" + data_file, parse_dates=['timestamp'], date_format="ISO8601", nrows=nrows)
    print(f"Feature matrix loaded. Length: {len(df_all)}. Width: {len(df_all.columns)}")

    for label in labels:
        df_all[label] = df_all[label].astype(int)  # "category" NN does not work without this

    # Select necessary features and label
    df_all = df_all[["timestamp"] + features_kline + features_futur + labels]

    # Spot and futures have different available histories. If we drop nans in all of them, then we get a very short data frame (corresponding to futures which have little data)
    # So we do not drop data here but rather when we select necessary input features
    # Nans result in constant accuracy and nan loss. MissingValues procedure does not work and produces exceptions
    # NOTE(review): this option is deprecated in pandas >= 2.1 — confirm target pandas version
    pd.set_option('use_inf_as_na', True)
    #df_all = df_all.dropna(subset=labels)
    df_all = df_all.reset_index(drop=True)  # We must reset index after removing rows to remove gaps

    prediction_start = find_index(df_all, prediction_start_str)
    print(f"Start index: {prediction_start}")

    if len(df_all) - prediction_start < steps * stride:
        raise ValueError(f"Number of steps {steps} is too high (not enough data after start). Data available for prediction: {len(df_all) - prediction_start}. Data to be predicted: {steps * stride}")

    del df_all["timestamp"]

    #
    # Prepare params and train by getting precision
    #
    if algorithm == "gb":
        params_grid = params_grid_gb
    elif algorithm == "nn":
        params_grid = params_grid_nn
    elif algorithm == "lc":
        params_grid = params_grid_lc
    else:
        raise ValueError(f"Unknown algorithm value {algorithm}.")

    metrics = []  # One record with metrics for one param object

    grid = ParameterGrid(params_grid)
    params_list = list(grid)  # List of hyper-param dicts
    for i, params in enumerate(params_list):
        print("\n{}/{} rolling train start...".format(i + 1, len(params_list)))

        # Here we will collect true and predicted values for one label
        # These series must have the same indexes and these indexes should correspond to main input indexes even if some rows are dropped (for them we store Null)
        y_true = pd.Series(dtype=float)
        y_predicted = pd.Series(dtype=float)

        for step in range(steps):
            print(f"\nStart step {step}/{steps}")

            # Predict data
            predict_start = prediction_start + (step * stride)
            predict_end = predict_start + stride

            df_test = df_all.iloc[predict_start:predict_end]
            #df_test = df_test.dropna(subset=train_features)  # Nans will be dropped by the algorithms themselves
            df_X_test = df_test[train_features]
            df_y_test = df_test[predict_label]

            # Train data
            # We exclude recent objects from training, because they do not have labels yet - the labels are in future
            # In real (stream) data, we will have null labels for recent objects. During simulation, labels are available and hence we need to ignore/exclude them manually
            train_end = predict_start - labels_horizon - 1
            train_start = train_end - train_length
            train_start = 0 if train_start < 0 else train_start

            df_train = df_all.iloc[int(train_start):int(train_end)]
            df_train = df_train.dropna(subset=train_features)
            df_X = df_train[train_features]
            df_y = df_train[predict_label]

            print(f"Train range: [{train_start}, {train_end}]={train_end - train_start}. Prediction range: [{predict_start}, {predict_end}]={predict_end - predict_start}.")

            # ---
            if algorithm == "gb":
                y_test_hat = train_predict_gb(df_X, df_y, df_X_test, params)
            elif algorithm == "nn":
                y_test_hat = train_predict_nn(df_X, df_y, df_X_test, params)
            elif algorithm == "lc":
                y_test_hat = train_predict_lc(df_X, df_y, df_X_test, params)
            # ---

            # Append true and predicted array (Series.append was removed in pandas 2.0)
            y_true = pd.concat([y_true, df_y_test])
            y_predicted = pd.concat([y_predicted, y_test_hat])

            print(f"End step {step}/{steps}.")

        print("")
        print("Finished {} steps of train with {} true and {} predicted results.".format(steps, len(y_true), len(y_predicted)))

        # y_true and y_predicted might have nans which can confuse some scoring functions
        df_scores = pd.DataFrame({"y_true": y_true, "y_predicted": y_predicted})
        num_scores = len(df_scores)
        df_scores = df_scores.dropna()
        print(f"Total number of collected predictions: {num_scores}. After dropping NaNs: {len(df_scores)}")
        y_true = df_scores["y_true"]
        y_predicted = df_scores["y_predicted"]

        score = compute_scores(y_true, y_predicted)
        metrics.append(score)

    #
    # Process all collected results and save
    #
    lines = []
    for i, params in enumerate(params_list):
        line = [predict_label]
        # Add parameters
        if algorithm == "gb":
            line += params_to_line_gb(params)
        elif algorithm == "nn":
            line += params_to_line_nn(params)
        elif algorithm == "lc":
            line += params_to_line_lc(params)
        # Add scores
        rec = metrics[i]
        score_str = [
            "{:.3f}".format(rec["auc"]),
            "{:.3f}".format(rec["f1"]),
            "{:.3f}".format(rec["precision"]),
            "{:.3f}".format(rec["recall"]),
        ]
        line += score_str

        lines.append(", ".join([str(x) for x in line]))

    with open('metrics.txt', 'a+') as f:
        f.write("\n".join(lines) + "\n")
if __name__ == '__main__':
    main()