intelligent-trading-bot/scripts/grid_search.py

311 lines
12 KiB
Python
Raw Permalink Normal View History

2020-09-13 18:22:25 +02:00
import sys
import os
from datetime import datetime, timezone, timedelta
from typing import Union
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
2021-09-04 18:19:04 +02:00
from service.App import App
from common.utils import *
from common.classifiers import *
2020-09-13 18:22:25 +02:00
"""
Find good hyper-parameters of algorithms applied to generated featues.
2020-09-13 18:22:25 +02:00
"""
data_path = r"C:\DATA2\BITCOIN\GENERATED"
data_file = r"BTCUSDT-1m-features.csv"
2020-12-14 13:10:21 +01:00
labels = App.config["labels"]
features_kline = App.config["features_kline"]
features_futur = App.config["features_futur"]
2020-11-19 19:05:11 +01:00
2020-12-14 20:42:14 +01:00
# features_horizon = 720 # Features are generated using this past window length (max feature window)
labels_horizon = 180 # Labels are generated using this number of steps ahead (max label window)
#
# Parameters of rolling predict
#
nrows = 10_000_000 # For debug
# Columns
train_features = features_kline # features_futur features_kline
predict_label = "high_15"
# Rows
prediction_start_str = "2020-09-01 00:00:00" # Use it when rolling prediction will work (2020-02-01 00:00:00 - for futur)
#prediction_start_str = "2020-06-01 00:00:00"
train_length = int(1.5 * 525_600) # 1.5 * 525_600 for long/spot, 4 * 43_800 for short/futur
stride = 4*7*1440 # Length of one rolling prediction step: mid: 1 month 43_800=4*7*1440, long: 1,5 months 6*7*1440
steps = 2 # How many rolling prediction steps. ~40 weeks in [1.2-1.11]
algorithm = "nn" # gb nn lc
#
# Parameters for algorithms
#
2020-09-22 09:36:22 +02:00
params_grid_gb = { # First parameter is the slowest
2020-09-13 18:22:25 +02:00
# binary (logloss - logistic regression) cross_entropy cross_entropy_lambda
"objective": ["cross_entropy"], # "cross_entropy", "cross_entropy_lambda", "binary"
"max_depth": [1],
"learning_rate": [0.01],
"num_boost_round": [1_500],
2020-09-22 09:36:22 +02:00
"lambda_l1": [1.0], # (reg_alpha) 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100] 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05], 1.5
"lambda_l2": [1.0], # (reg_lambda), [0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
2020-09-13 18:22:25 +02:00
}
def params_to_line_gb(params):
line = [
params.get("objective"),
params.get("max_depth"),
params.get("learning_rate"),
params.get("num_boost_round"),
params.get("lambda_l1"),
params.get("lambda_l2"),
]
return line
# TODO: Implement and run GB with scaling (by default it uses NO scaling)
# 1-0.01-1500
# Futur
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.625, 0.167, 0.602, 0.097
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.641, 0.121, 0.609, 0.067
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.652, 0.066, 0.524, 0.035
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.632, 0.131, 0.684, 0.073
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.649, 0.075, 0.634, 0.040
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.653, 0.074, 0.661, 0.039
# klines
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.680, 0.171, 0.695, 0.098
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.720, 0.075, 0.622, 0.040
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.745, 0.001, 0.577, 0.001
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.688, 0.200, 0.642, 0.118
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.718, 0.072, 0.620, 0.038
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.734, 0.011, 0.506, 0.006
2020-09-13 18:22:25 +02:00
2020-09-22 09:36:22 +02:00
params_grid_nn = { # First parameter is the slowest
"layers": [[33]], # Number of layers depends on the number of input features
2020-09-22 09:36:22 +02:00
"learning_rate": [0.001],
"n_epochs": [20],
"bs": [64],
}
def params_to_line_nn(params):
line = [
params.get("layers"),
params.get("learning_rate"),
params.get("n_epochs"),
params.get("bs"),
]
return line
# TODO: Implement and run NN without scaling (by default it usese scaling)
# Futur
#high_10, [33], 0.001, 20, 64, 0.626, 0.171, 0.606, 0.099
#high_15, [33], 0.001, 20, 64, 0.623, 0.118, 0.621, 0.065
#high_20, [33], 0.001, 20, 64, 0.646, 0.061, 0.507, 0.033
#low_10, [33], 0.001, 20, 64, 0.634, 0.179, 0.625, 0.104
#low_15, [33], 0.001, 20, 64, 0.623, 0.106, 0.665, 0.058
#low_20, [33], 0.001, 20, 64, 0.632, 0.058, 0.588, 0.031
# Spot/klines
#high_10, [46], 0.001, 20, 64, 0.675, 0.269, 0.653, 0.169
#high_15, [46], 0.001, 20, 64, 0.707, 0.207, 0.674, 0.122
#high_20, [46], 0.001, 20, 64, 0.717, 0.123, 0.610, 0.068
#low_10, [46], 0.001, 20, 64, 0.696, 0.283, 0.646, 0.181
#low_15, [46], 0.001, 20, 64, 0.713, 0.190, 0.628, 0.112
#low_20, [46], 0.001, 20, 64, 0.729, 0.150, 0.617, 0.085
2020-09-22 09:36:22 +02:00
2020-11-19 19:05:11 +01:00
# Best liblinear: is_scale=False, balance=False, penalty=l2, max_iter=100
params_grid_lc = { # First parameter is the slowest
# Best is False, but True can give almost same result (under different conditions)
"is_scale": [False],
"penalty": ["l2"], # "l2" "l1" "elasticnet" "none"
"C": [1.0], # small values stronger regularization
# 1) None balance is always better
"class_weight": [None], # "balanced"
# 1) liblinear - fast convergence (100 is enough), lbfgs (l2 or none) - good convergence, "newton-cg" "sag" "saga"
"solver": ["liblinear"],
"max_iter": [200],
}
def params_to_line_lc(params):
line = [
params.get("is_scale"),
params.get("penalty"),
params.get("C"),
params.get("class_weight"),
params.get("solver"),
params.get("max_iter"),
]
return line
2020-11-19 19:05:11 +01:00
# Results for futur:
#high_10, False, l2, 1.0, None, liblinear, 100, 0.458, 0.032, 0.553, 0.016
#high_15, False, l2, 1.0, None, liblinear, 100, 0.470, 0.029, 0.652, 0.015
#high_20, False, l2, 1.0, None, liblinear, 100, 0.472, 0.030, 0.623, 0.015
#low_10, False, l2, 1.0, None, liblinear, 100, 0.471, 0.017, 0.367, 0.009
#low_15, False, l2, 1.0, None, liblinear, 100, 0.468, 0.005, 0.399, 0.002
#low_20, False, l2, 1.0, None, liblinear, 100, 0.473, 0.010, 0.572, 0.005
# Results for klines (spot):
#high_10, False, l2, 1.0, None, liblinear, 200, 0.551, 0.050, 0.522, 0.026
#high_15, False, l2, 1.0, None, liblinear, 200, 0.558, 0.023, 0.484, 0.012
#high_20, False, l2, 1.0, None, liblinear, 200, 0.564, 0.017, 0.440, 0.009
#low_10, False, l2, 1.0, None, liblinear, 200, 0.565, 0.044, 0.538, 0.023
#low_15, False, l2, 1.0, None, liblinear, 200, 0.580, 0.015, 0.556, 0.008
#low_20, False, l2, 1.0, None, liblinear, 200, 0.584, 0.019, 0.784, 0.010
2021-10-23 21:51:31 +02:00
2020-09-22 09:36:22 +02:00
#
# Grid search
#
2020-09-13 18:22:25 +02:00
def main():
2020-09-13 18:22:25 +02:00
#
# Load and prepare all data
#
# Load all data
df_all = pd.read_csv(data_path + "\\" + data_file, parse_dates=['timestamp'], date_format="ISO8601", nrows=nrows)
print(f"Feature matrix loaded. Length: {len(df_all)}. Width: {len(df_all.columns)}")
2020-09-13 18:22:25 +02:00
for label in labels:
2020-09-22 09:36:22 +02:00
df_all[label] = df_all[label].astype(int) # "category" NN does not work without this
2020-09-13 18:22:25 +02:00
# Select necessary features and label
df_all = df_all[["timestamp"] + features_kline + features_futur + labels]
2020-09-13 18:22:25 +02:00
# Spot and futures have different available histories. If we drop nans in all of them, then we get a very short data frame (corresponding to futureus which have little data)
# So we do not drop data here but rather when we select necessary input features
# Nans result in constant accuracy and nan loss. MissingValues procedure does not work and produces exceptions
2020-11-22 20:38:09 +01:00
pd.set_option('use_inf_as_na', True)
#df_all = df_all.dropna(subset=labels)
2020-09-13 18:22:25 +02:00
df_all = df_all.reset_index(drop=True) # We must reset index after removing rows to remove gaps
prediction_start = find_index(df_all, prediction_start_str)
print(f"Start index: {prediction_start}")
2020-09-13 18:22:25 +02:00
2020-11-22 20:38:09 +01:00
if len(df_all) - prediction_start < steps * stride:
raise ValueError(f"Number of steps {steps} is too high (not enough data after start). Data available for prediction: {len(df_all) - prediction_start}. Data to be predicted: {steps * stride} ")
2020-09-13 18:22:25 +02:00
del df_all["timestamp"]
#
# Prepare params and train by getting precision
#
if algorithm == "gb":
params_grid = params_grid_gb
elif algorithm == "nn":
params_grid = params_grid_nn
elif algorithm == "lc":
params_grid = params_grid_lc
else:
raise ValueError(f"Unknown algorithm value {algorithm}.")
2020-12-14 21:48:09 +01:00
metrics = [] # One record with metrics for one param object
2020-09-13 18:22:25 +02:00
grid = ParameterGrid(params_grid)
params_list = list(grid) # List of hyper-param dicts
for i, params in enumerate(params_list):
print("\n{}/{} rolling train start...".format(i+1, len(params_list)))
2020-11-22 20:38:09 +01:00
# Here we will collect true and predicted values for one label
# These series must have the same indexes and these indexes should correspond to main input indexes even if some rows are dropped (for them we store Null)
2020-09-13 18:22:25 +02:00
y_true = pd.Series(dtype=float)
y_predicted = pd.Series(dtype=float)
2020-11-22 20:38:09 +01:00
2020-09-13 18:22:25 +02:00
for step in range(steps):
2020-09-22 09:36:22 +02:00
2020-11-22 20:38:09 +01:00
print(f"\nStart step {step}/{steps}")
# Predict data
2020-09-13 18:22:25 +02:00
predict_start = prediction_start + (step * stride)
predict_end = predict_start + stride
2020-09-13 18:22:25 +02:00
df_test = df_all.iloc[predict_start:predict_end]
2020-11-22 20:38:09 +01:00
#df_test = df_test.dropna(subset=train_features) # Nans will be droped by the algorithms themselves
2020-09-13 18:22:25 +02:00
df_X_test = df_test[train_features]
df_y_test = df_test[predict_label]
# Train data
# We exclude recent objects from training, because they do not have labels yet - the labels are in future
# In real (stream) data, we will have null labels for recent objects. During simulation, labels are available and hence we need to ignore/exclude them manually
train_end = predict_start - labels_horizon - 1
train_start = train_end - train_length
train_start = 0 if train_start < 0 else train_start
2020-11-22 20:38:09 +01:00
df_train = df_all.iloc[int(train_start):int(train_end)]
df_train = df_train.dropna(subset=train_features)
df_X = df_train[train_features]
df_y = df_train[predict_label]
2020-11-22 20:38:09 +01:00
print(f"Train range: [{train_start}, {train_end}]={train_end-train_start}. Prediction range: [{predict_start}, {predict_end}]={predict_end-predict_start}. ")
2020-09-13 18:22:25 +02:00
# ---
if algorithm == "gb":
y_test_hat = train_predict_gb(df_X, df_y, df_X_test, params)
elif algorithm == "nn":
y_test_hat = train_predict_nn(df_X, df_y, df_X_test, params)
elif algorithm == "lc":
y_test_hat = train_predict_lc(df_X, df_y, df_X_test, params)
2020-09-13 18:22:25 +02:00
# ---
2020-09-13 18:22:25 +02:00
# Append true and predicted array
y_true = y_true.append(df_y_test)
y_predicted = y_predicted.append(y_test_hat)
2020-11-22 20:38:09 +01:00
print(f"End step {step}/{steps}. ")
2020-09-13 18:22:25 +02:00
print("")
print("Finished {} steps of train with {} true and {} predicted results.".format(steps, len(y_true), len(y_predicted)))
2020-11-22 20:38:09 +01:00
# y_true and y_predicted might have nans which can confuse some scoring functions
df_scores = pd.DataFrame({"y_true": y_true, "y_predicted": y_predicted})
num_scores = len(df_scores)
df_scores = df_scores.dropna()
print(f"Total number of collected predictions: {num_scores}. After dropping NaNs: {len(df_scores)}")
print(f"Number of non-NaN predictions used for scoring: {len(df_scores)}")
num_scores = len(df_scores)
y_true = df_scores["y_true"]
y_predicted = df_scores["y_predicted"]
2020-12-14 21:48:09 +01:00
score = compute_scores(y_true, y_predicted)
2020-09-13 18:22:25 +02:00
2020-12-14 21:48:09 +01:00
metrics.append(score)
2020-09-13 18:22:25 +02:00
#
# Process all collected results and save
#
2020-09-22 09:36:22 +02:00
lines = []
for i, params in enumerate(params_list):
line = [predict_label]
# Add parameters
if algorithm == "gb":
line += params_to_line_gb(params)
elif algorithm == "nn":
line += params_to_line_nn(params)
elif algorithm == "lc":
line += params_to_line_lc(params)
# Add scores
2020-12-14 21:48:09 +01:00
rec = metrics[i]
score_str = [
"{:.3f}".format(rec["auc"]),
"{:.3f}".format(rec["f1"]),
"{:.3f}".format(rec["precision"]),
"{:.3f}".format(rec["recall"]),
]
line += score_str
2020-09-13 18:22:25 +02:00
lines.append(", ".join([str(x) for x in line]))
with open('metrics.txt', 'a+') as f:
f.write("\n".join(lines) + "\n")
2020-09-13 18:22:25 +02:00
if __name__ == '__main__':
main()