import sys
import os
from datetime import datetime, timezone, timedelta
from typing import Union
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import ParameterGrid  # Used below to expand the hyper-parameter grids

import matplotlib.pyplot as plt

from service.App import App
from common.utils import *
from common.classifiers import *

"""
|
|
Find good hyper-parameters of algorithms applied to generated featues.
|
|
"""
|
|
|
|
data_path = r"C:\DATA2\BITCOIN\GENERATED"
data_file = r"BTCUSDT-1m-features.csv"

labels = App.config["labels"]
features_kline = App.config["features_kline"]
features_futur = App.config["features_futur"]

# features_horizon = 720  # Features are generated using this past window length (max feature window)
labels_horizon = 180  # Labels are generated using this number of steps ahead (max label window)

#
# Parameters of rolling predict
#

nrows = 10_000_000  # Row limit (for debugging)

# Columns
train_features = features_kline  # Alternatives: features_kline, features_futur
predict_label = "high_15"

# Rows
prediction_start_str = "2020-09-01 00:00:00"  # Start of the rolling prediction range (2020-02-01 00:00:00 for futur)
#prediction_start_str = "2020-06-01 00:00:00"
train_length = int(1.5 * 525_600)  # 1.5 years (1.5 * 525_600 minutes) for long/spot, 4 months (4 * 43_800) for short/futur
stride = 4*7*1440  # Length of one rolling prediction step: mid: 4 weeks = 4*7*1440 = 40_320 rows (~1 month of 43_800), long: 1.5 months = 6*7*1440
steps = 2  # How many rolling prediction steps. ~40 weeks in [1.2-1.11]
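
# Worked example of the rolling scheme (round numbers for illustration): with
# stride = 4*7*1440 = 40_320 rows (minutes) and steps = 2, two train-predict passes
# cover 80_640 rows (8 weeks) after prediction_start. Each pass trains on the
# train_length rows that end labels_horizon + 1 rows before its prediction window,
# and then the window advances by one stride.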
algorithm = "nn" # gb nn lc
|
|
|
|
#
|
|
# Parameters for algorithms
|
|
#
|
|
|
|
params_grid_gb = {  # First parameter is the slowest
    # Other objectives: "binary" (logloss - logistic regression), "cross_entropy_lambda"
    "objective": ["cross_entropy"],
    "max_depth": [1],
    "learning_rate": [0.01],
    "num_boost_round": [1_500],

    "lambda_l1": [1.0],  # (alias reg_alpha) e.g. [1e-5, 1e-2, 0.1, 1, 100] or [0, 0.001, 0.005, 0.01, 0.05]
    "lambda_l2": [1.0],  # (alias reg_lambda) e.g. [0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
}
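
# Example of how a grid expands (sklearn's ParameterGrid, used in main() below):
# list(ParameterGrid({"max_depth": [1, 2], "learning_rate": [0.01]}))
# -> [{"learning_rate": 0.01, "max_depth": 1}, {"learning_rate": 0.01, "max_depth": 2}]
# With the single-valued grids in this file, each search runs exactly one combination.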

def params_to_line_gb(params):
    line = [
        params.get("objective"),
        params.get("max_depth"),
        params.get("learning_rate"),
        params.get("num_boost_round"),
        params.get("lambda_l1"),
        params.get("lambda_l2"),
    ]
    return line

# TODO: Implement and run GB with scaling (by default it uses NO scaling)

# Results for max_depth=1, learning_rate=0.01, num_boost_round=1500
# Columns: label, objective, max_depth, learning_rate, num_boost_round, lambda_l1, lambda_l2, auc, f1, precision, recall
# Futur
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.625, 0.167, 0.602, 0.097
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.641, 0.121, 0.609, 0.067
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.652, 0.066, 0.524, 0.035
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.632, 0.131, 0.684, 0.073
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.649, 0.075, 0.634, 0.040
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.653, 0.074, 0.661, 0.039
# klines
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.680, 0.171, 0.695, 0.098
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.720, 0.075, 0.622, 0.040
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.745, 0.001, 0.577, 0.001
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.688, 0.200, 0.642, 0.118
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.718, 0.072, 0.620, 0.038
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.734, 0.011, 0.506, 0.006
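
# The parameter names above (objective, num_boost_round, lambda_l1/lambda_l2) follow
# LightGBM conventions, so train_predict_gb presumably wraps lightgbm; a hedged
# sketch of that assumption (illustration only, not the actual implementation):
# import lightgbm as lgbm
# model = lgbm.train({"objective": "cross_entropy", "max_depth": 1, "learning_rate": 0.01,
#                     "lambda_l1": 1.0, "lambda_l2": 1.0},
#                    lgbm.Dataset(df_X, label=df_y), num_boost_round=1_500)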

params_grid_nn = {  # First parameter is the slowest
    "layers": [[33]],  # Hidden layer sizes; they depend on the number of input features (e.g. [46] for klines below)
    "learning_rate": [0.001],
    "n_epochs": [20],
    "bs": [64],  # Batch size
}
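
# A hedged sketch of the network this grid presumably describes, assuming
# train_predict_nn builds a Keras-style dense binary classifier (illustration only):
# from tensorflow.keras import layers, models
# model = models.Sequential([layers.Dense(33, activation="relu"),
#                            layers.Dense(1, activation="sigmoid")])
# model.compile(optimizer="adam", loss="binary_crossentropy")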

def params_to_line_nn(params):
    line = [
        params.get("layers"),
        params.get("learning_rate"),
        params.get("n_epochs"),
        params.get("bs"),
    ]
    return line

# TODO: Implement and run NN without scaling (by default it uses scaling)

# Columns: label, layers, learning_rate, n_epochs, bs, auc, f1, precision, recall
# Futur
#high_10, [33], 0.001, 20, 64, 0.626, 0.171, 0.606, 0.099
#high_15, [33], 0.001, 20, 64, 0.623, 0.118, 0.621, 0.065
#high_20, [33], 0.001, 20, 64, 0.646, 0.061, 0.507, 0.033
#low_10, [33], 0.001, 20, 64, 0.634, 0.179, 0.625, 0.104
#low_15, [33], 0.001, 20, 64, 0.623, 0.106, 0.665, 0.058
#low_20, [33], 0.001, 20, 64, 0.632, 0.058, 0.588, 0.031
# Spot/klines
#high_10, [46], 0.001, 20, 64, 0.675, 0.269, 0.653, 0.169
#high_15, [46], 0.001, 20, 64, 0.707, 0.207, 0.674, 0.122
#high_20, [46], 0.001, 20, 64, 0.717, 0.123, 0.610, 0.068
#low_10, [46], 0.001, 20, 64, 0.696, 0.283, 0.646, 0.181
#low_15, [46], 0.001, 20, 64, 0.713, 0.190, 0.628, 0.112
#low_20, [46], 0.001, 20, 64, 0.729, 0.150, 0.617, 0.085

# Best liblinear: is_scale=False, balance=False, penalty=l2, max_iter=100
params_grid_lc = {  # First parameter is the slowest
    # Best is False, but True can give almost the same result (under different conditions)
    "is_scale": [False],
    "penalty": ["l2"],  # Alternatives: "l1", "elasticnet", "none"
    "C": [1.0],  # Smaller values mean stronger regularization
    # 1) class_weight=None is always better than "balanced"
    "class_weight": [None],  # "balanced"
    # 1) liblinear - fast convergence (100 iterations are enough); lbfgs (l2 or none) - good convergence; also "newton-cg", "sag", "saga"
    "solver": ["liblinear"],
    "max_iter": [200],
}
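
# A minimal sketch of how these parameters presumably map onto scikit-learn,
# assuming train_predict_lc in common.classifiers wraps LogisticRegression and
# is_scale toggles feature scaling before fitting (illustration only):
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression(penalty="l2", C=1.0, class_weight=None,
#                            solver="liblinear", max_iter=200)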

def params_to_line_lc(params):
    line = [
        params.get("is_scale"),
        params.get("penalty"),
        params.get("C"),
        params.get("class_weight"),
        params.get("solver"),
        params.get("max_iter"),
    ]
    return line

# Columns: label, is_scale, penalty, C, class_weight, solver, max_iter, auc, f1, precision, recall
# Results for futur:
#high_10, False, l2, 1.0, None, liblinear, 100, 0.458, 0.032, 0.553, 0.016
#high_15, False, l2, 1.0, None, liblinear, 100, 0.470, 0.029, 0.652, 0.015
#high_20, False, l2, 1.0, None, liblinear, 100, 0.472, 0.030, 0.623, 0.015
#low_10, False, l2, 1.0, None, liblinear, 100, 0.471, 0.017, 0.367, 0.009
#low_15, False, l2, 1.0, None, liblinear, 100, 0.468, 0.005, 0.399, 0.002
#low_20, False, l2, 1.0, None, liblinear, 100, 0.473, 0.010, 0.572, 0.005
# Results for klines (spot):
#high_10, False, l2, 1.0, None, liblinear, 200, 0.551, 0.050, 0.522, 0.026
#high_15, False, l2, 1.0, None, liblinear, 200, 0.558, 0.023, 0.484, 0.012
#high_20, False, l2, 1.0, None, liblinear, 200, 0.564, 0.017, 0.440, 0.009
#low_10, False, l2, 1.0, None, liblinear, 200, 0.565, 0.044, 0.538, 0.023
#low_15, False, l2, 1.0, None, liblinear, 200, 0.580, 0.015, 0.556, 0.008
#low_20, False, l2, 1.0, None, liblinear, 200, 0.584, 0.019, 0.784, 0.010

#
# Grid search
#

def main():
    #
    # Load and prepare all data
    #

    # Load all data
    df_all = pd.read_csv(os.path.join(data_path, data_file), parse_dates=['timestamp'], nrows=nrows)

    print(f"Feature matrix loaded. Length: {len(df_all)}. Width: {len(df_all.columns)}")

    for label in labels:
        # Cast to int: the NN does not train if labels are stored as "category"
        df_all[label] = df_all[label].astype(int)

    # Select the necessary features and labels
    df_all = df_all[["timestamp"] + features_kline + features_futur + labels]

    # Spot and futures have different available histories. If we drop NaNs in all of
    # them, we get a very short data frame (limited by futures, which have little data).
    # So we do not drop rows here but rather later, when the necessary input features are selected.
    # NaNs result in constant accuracy and NaN loss, and the MissingValues procedure
    # does not work (it produces exceptions), so treat infinities as NaNs and drop them later.
    df_all = df_all.replace([np.inf, -np.inf], np.nan)  # pd.set_option('use_inf_as_na', True) is deprecated
    #df_all = df_all.dropna(subset=labels)
    df_all = df_all.reset_index(drop=True)  # We must reset the index after removing rows to remove gaps

    prediction_start = find_index(df_all, prediction_start_str)
    print(f"Start index: {prediction_start}")
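
    # find_index comes from common.utils; it is assumed to return the positional
    # index of the first row at or after the given timestamp. A minimal sketch of
    # that behavior (an assumption, not the actual implementation):
    # prediction_start = int((df_all["timestamp"] >= prediction_start_str).idxmax())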

    if len(df_all) - prediction_start < steps * stride:
        raise ValueError(f"Number of steps {steps} is too high (not enough data after the start). Rows available for prediction: {len(df_all) - prediction_start}. Rows to be predicted: {steps * stride}")

    del df_all["timestamp"]

    #
    # Prepare parameters and train, collecting the scores
    #
    if algorithm == "gb":
        params_grid = params_grid_gb
    elif algorithm == "nn":
        params_grid = params_grid_nn
    elif algorithm == "lc":
        params_grid = params_grid_lc
    else:
        raise ValueError(f"Unknown algorithm value {algorithm}.")

    metrics = []  # One record with metrics for each param object
    grid = ParameterGrid(params_grid)
    params_list = list(grid)  # List of hyper-parameter dicts, one per combination

    for i, params in enumerate(params_list):

        print("\n{}/{} rolling train start...".format(i+1, len(params_list)))

        # Here we collect the true and predicted values for one label.
        # These series must share the same index, and this index should correspond to
        # the main input index even if some rows are dropped (Nulls are stored for them).
        y_true = pd.Series(dtype=float)
        y_predicted = pd.Series(dtype=float)

        for step in range(steps):

            print(f"\nStart step {step + 1}/{steps}")

            # Predict data

            predict_start = prediction_start + (step * stride)
            predict_end = predict_start + stride

            df_test = df_all.iloc[predict_start:predict_end]
            #df_test = df_test.dropna(subset=train_features)  # NaNs will be dropped by the algorithms themselves

            df_X_test = df_test[train_features]
            df_y_test = df_test[predict_label]

            # Train data

            # We exclude the most recent objects from training because they do not have
            # labels yet - their labels lie in the future. In real (stream) data, recent
            # objects have null labels; during simulation the labels are available,
            # so we have to exclude them manually.
            train_end = predict_start - labels_horizon - 1
            train_start = train_end - train_length
            train_start = 0 if train_start < 0 else train_start
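
            # Worked example (hypothetical numbers): if predict_start = 500_000 and
            # labels_horizon = 180, then train_end = 499_819, so no training label
            # depends on data inside or after the prediction window.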

            df_train = df_all.iloc[int(train_start):int(train_end)]
            df_train = df_train.dropna(subset=train_features)

            df_X = df_train[train_features]
            df_y = df_train[predict_label]

            print(f"Train range: [{train_start}, {train_end}]={train_end-train_start}. Prediction range: [{predict_start}, {predict_end}]={predict_end-predict_start}.")

            # --- Train on the training window and predict scores for the test window
            # (the train_predict_* functions come from common.classifiers)
            if algorithm == "gb":
                y_test_hat = train_predict_gb(df_X, df_y, df_X_test, params)
            elif algorithm == "nn":
                y_test_hat = train_predict_nn(df_X, df_y, df_X_test, params)
            elif algorithm == "lc":
                y_test_hat = train_predict_lc(df_X, df_y, df_X_test, params)
            # ---

            # Append the true and predicted values for this window
            y_true = pd.concat([y_true, df_y_test])  # Series.append was removed in pandas 2.0
            y_predicted = pd.concat([y_predicted, y_test_hat])

            print(f"End step {step + 1}/{steps}.")
print("")
|
|
print("Finished {} steps of train with {} true and {} predicted results.".format(steps, len(y_true), len(y_predicted)))
|
|
|
|

        # y_true and y_predicted might contain NaNs, which can confuse some scoring functions
        df_scores = pd.DataFrame({"y_true": y_true, "y_predicted": y_predicted})
        num_scores = len(df_scores)
        df_scores = df_scores.dropna()
        print(f"Total number of collected predictions: {num_scores}. Non-NaN predictions used for scoring: {len(df_scores)}")
        y_true = df_scores["y_true"]
        y_predicted = df_scores["y_predicted"]

        score = compute_scores(y_true, y_predicted)
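
        # compute_scores (from common.utils) is assumed to return a dict with at least
        # the keys "auc", "f1", "precision" and "recall" that are read below. A minimal
        # sketch of such a function, assuming y_predicted holds probabilities
        # (an illustration, not the actual implementation):
        # from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
        # y_hat = (y_predicted >= 0.5).astype(int)
        # score = {"auc": roc_auc_score(y_true, y_predicted), "f1": f1_score(y_true, y_hat),
        #          "precision": precision_score(y_true, y_hat), "recall": recall_score(y_true, y_hat)}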

        metrics.append(score)

    #
    # Process all collected results and save
    #

    lines = []
    for i, params in enumerate(params_list):
        line = [predict_label]

        # Add parameters
        if algorithm == "gb":
            line += params_to_line_gb(params)
        elif algorithm == "nn":
            line += params_to_line_nn(params)
        elif algorithm == "lc":
            line += params_to_line_lc(params)

        # Add scores
        rec = metrics[i]
        score_str = [
            "{:.3f}".format(rec["auc"]),
            "{:.3f}".format(rec["f1"]),
            "{:.3f}".format(rec["precision"]),
            "{:.3f}".format(rec["recall"]),
        ]
        line += score_str

        lines.append(", ".join([str(x) for x in line]))

    with open('metrics.txt', 'a') as f:
        f.write("\n".join(lines) + "\n")
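
# Each appended line has the form
#   label, <algorithm parameters>, auc, f1, precision, recall
# e.g. for nn: "high_15, [33], 0.001, 20, 64, 0.623, 0.118, 0.621, 0.065"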


if __name__ == '__main__':
    main()