intelligent-trading-bot/scripts/predict.py

162 lines
5.3 KiB
Python
Raw Permalink Normal View History

from pathlib import Path
from datetime import datetime, timezone, timedelta
import click
from tqdm import tqdm
import numpy as np
import pandas as pd
from service.App import *
from common.classifiers import *
from common.feature_generation import *
from common.model_store import *
"""
Apply models to (previously generated) features and compute prediction scores.
"""
#
# Parameters
#
class P:
in_nrows = 100_000_000 # For debugging
tail_rows = 0 # How many last rows to select (for debugging)
@click.command()
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
def main(config_file):
load_config(config_file)
time_column = App.config["time_column"]
now = datetime.now()
#
# Load feature matrix
#
symbol = App.config["symbol"]
data_path = Path(App.config["data_folder"]) / symbol
file_path = (data_path / App.config.get("matrix_file_name")).with_suffix(".csv")
if not file_path.is_file():
print(f"ERROR: Input file does not exist: {file_path}")
return
print(f"Loading data from source data file {file_path}...")
df = pd.read_csv(file_path, parse_dates=[time_column], date_format="ISO8601", nrows=P.in_nrows)
print(f"Finished loading {len(df)} records with {len(df.columns)} columns.")
df = df.iloc[-P.tail_rows:]
df = df.reset_index(drop=True)
2023-09-02 11:42:12 +02:00
print(f"Input data size {len(df)} records. Range: [{df.iloc[0][time_column]}, {df.iloc[-1][time_column]}]")
#
# Prepare data by selecting columns and rows
#
train_features = App.config.get("train_features")
labels = App.config["labels"]
algorithms = App.config.get("algorithms")
# Select necessary features and label
out_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time']
out_columns = [x for x in out_columns if x in df.columns]
labels_present = set(labels).issubset(df.columns)
if labels_present:
all_features = train_features + labels
else:
all_features = train_features
2023-03-11 17:31:16 +01:00
df = df[out_columns + [x for x in all_features if x not in out_columns]]
pd.set_option('use_inf_as_na', True)
#df = df.dropna(subset=labels)
df = df.dropna(subset=train_features)
#in_df = in_df.dropna(subset=labels)
df = df.reset_index(drop=True) # We must reset index after removing rows to remove gaps
train_df = df[train_features]
if len(train_df) == 0:
print(f"ERROR: Empty data set after removing NULLs in feature columns. Some features might have all NULL values.")
#print(train_df.isnull().sum().sort_values(ascending=False))
return
#
# Load models for all score columns
#
2023-03-11 10:05:43 +01:00
model_path = Path(App.config["model_folder"])
if not model_path.is_absolute():
2023-03-11 10:05:43 +01:00
model_path = data_path / model_path
model_path = model_path.resolve()
score_aggregation_sets = App.config['score_aggregation_sets']
all_labels = list(
itertools.chain.from_iterable([x.get("buy_labels") + x.get("sell_labels") for x in score_aggregation_sets]))
2023-03-11 10:12:19 +01:00
models = {label: load_model_pair(model_path, label) for label in all_labels}
#
# Loop over score columns with models and apply them to features
#
scores = dict()
out_df = pd.DataFrame(index=train_df.index) # Collect predictions
for score_column_name, model_pair in tqdm(models.items(), desc="PREDICTIONS"):
label, algo_name = score_to_label_algo_pair(score_column_name)
model_config = get_algorithm(algorithms, algo_name) # Get algorithm description from the algo store
algo_type = model_config.get("algo")
if algo_type == "gb":
df_y_hat = predict_gb(model_pair, train_df, model_config)
elif algo_type == "nn":
df_y_hat = predict_nn(model_pair, train_df, model_config)
elif algo_type == "lc":
df_y_hat = predict_lc(model_pair, train_df, model_config)
elif algo_type == "svc":
df_y_hat = predict_svc(model_pair, train_df, model_config)
else:
raise ValueError(f"Unknown algorithm type '{algo_type}'")
if labels_present:
scores[score_column_name] = compute_scores(df[label], df_y_hat)
out_df[score_column_name] = df_y_hat
#
# Store scores
#
if labels_present:
lines = list()
for score_column_name, score in scores.items():
line = score_column_name + ", " + str(score)
lines.append(line)
metrics_file_name = f"prediction-metrics.txt"
metrics_path = (data_path / metrics_file_name).resolve()
with open(metrics_path, 'a+') as f:
f.write("\n".join(lines) + "\n\n")
print(f"Metrics stored in path: {metrics_path.absolute()}")
#
# Store predictions
#
# Store only selected original data, labels, and their predictions
out_df = out_df.join(df[out_columns + (labels if labels_present else [])])
out_path = data_path / App.config.get("predict_file_name")
print(f"Storing output file...")
out_df.to_csv(out_path.with_suffix(".csv"), index=False, float_format='%.4f')
print(f"Predictions stored in file: {out_path}. Length: {len(out_df)}. Columns: {len(out_df.columns)}")
#
# End
#
elapsed = datetime.now() - now
print(f"Finished training models in {str(elapsed).split('.')[0]}")
if __name__ == '__main__':
main()