intelligent-trading-bot/scripts/generate_features.py

111 lines
3.3 KiB
Python
Raw Permalink Normal View History

2020-02-23 20:45:50 +01:00
import sys
from pathlib import Path
from datetime import datetime, timezone, timedelta
from typing import Union
import json
import pickle
2021-09-09 20:48:11 +02:00
import click
2020-02-23 20:45:50 +01:00
import numpy as np
import pandas as pd
2021-09-09 20:48:11 +02:00
from service.App import *
2020-12-13 19:30:06 +01:00
from common.feature_generation import *
from common.label_generation import *
2020-02-23 20:45:50 +01:00
#
# Parameters
#
class P:
    """Script-level parameters read by ``main``."""

    # Names of the feature groups to generate. ``main`` checks this list for
    # "kline", "futur" and "depth"; only "kline" is enabled here.
    feature_sets = ["kline"]  # "futur"

    # Upper bound on the number of rows read from the source CSV
    # (passed as ``nrows`` to ``pd.read_csv``).
    in_nrows = 10_000_000
2020-02-23 20:45:50 +01:00
2021-09-09 20:48:11 +02:00
@click.command()
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
def main(config_file):
    """Generate features and labels for the configured symbol and store them as CSV.

    Reads "<data_folder>/<symbol>-1m.csv" and writes
    "<data_folder>/<symbol>-1m-features.csv", where the data folder and the
    symbol are taken from the loaded configuration.
    """
    # config_file: path of the configuration file handed to load_config();
    # the click option defaults to an empty string.
    load_config(config_file)

    freq = "1m"
    symbol = App.config["symbol"]
    data_path = Path(App.config["data_folder"])
    if not data_path.is_dir():
        print(f"Data folder does not exist: {data_path}")
        return

    start_dt = datetime.now()

    #
    # Load historic data
    #
    in_path = (data_path / f"{symbol}-{freq}.csv").resolve()
    print(f"Loading data from source file {str(in_path)}...")
    in_df = pd.read_csv(in_path, parse_dates=['timestamp'], nrows=P.in_nrows)
    print(f"Finished loading {len(in_df)} records with {len(in_df.columns)} columns.")

    #
    # Generate derived features
    #
    # NOTE(review): the generate_* helpers presumably add their columns to
    # in_df in place and return the new column names (only in_df is stored
    # below) - confirm against common.feature_generation.
    if "kline" in P.feature_sets:
        print("Generating klines features...")
        k_features = generate_features(
            in_df, use_differences=False,
            base_window=App.config["base_window_kline"], windows=App.config["windows_kline"],
            area_windows=App.config["area_windows_kline"]
        )
        print(f"Finished generating {len(k_features)} kline features")
    else:
        k_features = []

    if "futur" in P.feature_sets:
        print("Generating futur features...")
        f_features = generate_features_futur(in_df)
        print(f"Finished generating {len(f_features)} futur features")
    else:
        f_features = []

    if "depth" in P.feature_sets:
        print("Generating depth features...")
        d_features = generate_features_depth(in_df)
        # Report the depth feature count (the original printed the futur count here).
        print(f"Finished generating {len(d_features)} depth features")
    else:
        d_features = []

    #
    # Generate labels (always the same; currently based on kline data, which must therefore be present)
    #
    print("Generating labels...")
    labels = []

    # Binary labels whether max has exceeded a threshold or not
    labels += generate_labels_thresholds(in_df, horizon=App.config["label_horizon"])

    # Numeric label which is a ratio between areas over and under the latest price
    labels += add_area_ratio(in_df, is_future=True, column_name="close", windows=[60, 120, 180, 300], suffix="_area_future")

    print(f"Finished generating {len(labels)} labels")

    #
    # Store feature matrix in output file
    #
    out_file_name = f"{symbol}-{freq}-features.csv"
    out_file = (data_path / out_file_name).resolve()

    print(f"Storing feature matrix with {len(in_df)} records and {len(in_df.columns)} columns in output file...")
    in_df.to_csv(out_file, index=False, float_format="%.4f")

    #in_df.to_parquet(out_file.with_suffix('.parquet'), engine='auto', compression=None, index=None, partition_cols=None)

    elapsed = datetime.now() - start_dt
    print(f"Finished feature generation in {int(elapsed.total_seconds())} seconds")
    print(f"Output file location: {out_file}")
2020-02-23 20:45:50 +01:00
2021-09-09 08:20:29 +02:00
2020-02-23 20:45:50 +01:00
# Run the click command only when the file is executed as a script.
if __name__ == '__main__':
    main()