intelligent-trading-bot/scripts/generate_features.py
2022-03-13 20:45:12 +01:00

111 lines
3.3 KiB
Python

import sys
from pathlib import Path
from datetime import datetime, timezone, timedelta
from typing import Union
import json
import pickle
import click
import numpy as np
import pandas as pd
from service.App import *
from common.feature_generation import *
from common.label_generation import *
#
# Parameters
#
class P:
    """Script-level parameters that are not read from the configuration file."""

    # Names of the feature sets to generate. main() recognises "kline",
    # "futur" and "depth"; only "kline" is enabled here.
    feature_sets = ["kline"]

    # Upper bound on the number of rows read from the source CSV file
    # (passed to pandas.read_csv as nrows).
    in_nrows = 10_000_000
@click.command()
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
def main(config_file):
    """Generate features and labels for the configured symbol and store them as CSV.

    Reads "<data_folder>/<symbol>-1m.csv" (at most P.in_nrows rows), runs the
    feature generators enabled in P.feature_sets and the label generators on
    the loaded data frame, and writes the data frame to
    "<data_folder>/<symbol>-1m-features.csv".

    If the configured data folder does not exist, a message is printed and the
    command returns without doing anything.
    """
    load_config(config_file)

    freq = "1m"
    symbol = App.config["symbol"]

    data_path = Path(App.config["data_folder"])
    if not data_path.is_dir():
        print(f"Data folder does not exist: {data_path}")
        return

    # Only used to report the elapsed wall-clock time at the end.
    start_dt = datetime.now()

    #
    # Load historic data
    #
    in_path = (data_path / f"{symbol}-{freq}.csv").resolve()

    print(f"Loading data from source file {str(in_path)}...")
    in_df = pd.read_csv(in_path, parse_dates=['timestamp'], nrows=P.in_nrows)
    print(f"Finished loading {len(in_df)} records with {len(in_df.columns)} columns.")

    #
    # Generate derived features
    #
    # NOTE(review): the generators presumably add their columns to in_df in
    # place and return the list of new column names; only len() of each result
    # is used here, and in_df itself is what gets written out. Confirm against
    # common.feature_generation.
    if "kline" in P.feature_sets:
        print("Generating klines features...")
        k_features = generate_features(
            in_df,
            use_differences=False,
            base_window=App.config["base_window_kline"],
            windows=App.config["windows_kline"],
            area_windows=App.config["area_windows_kline"],
        )
        print(f"Finished generating {len(k_features)} kline features")
    else:
        k_features = []

    if "futur" in P.feature_sets:
        print("Generating futur features...")
        f_features = generate_features_futur(in_df)
        print(f"Finished generating {len(f_features)} futur features")
    else:
        f_features = []

    if "depth" in P.feature_sets:
        print("Generating depth features...")
        d_features = generate_features_depth(in_df)
        # Report the depth feature count (previously printed the futur count).
        print(f"Finished generating {len(d_features)} depth features")
    else:
        d_features = []

    #
    # Generate labels (always the same, currently based on kline data which must therefore be present)
    #
    print("Generating labels...")
    labels = []

    # Binary labels whether max has exceeded a threshold or not
    labels += generate_labels_thresholds(in_df, horizon=App.config["label_horizon"])

    # Numeric label which is a ratio between areas over and under the latest price
    labels += add_area_ratio(
        in_df,
        is_future=True,
        column_name="close",
        windows=[60, 120, 180, 300],
        suffix="_area_future",
    )

    print(f"Finished generating {len(labels)} labels")

    #
    # Store feature matrix in output file
    #
    out_file_name = f"{symbol}-{freq}-features.csv"
    out_file = (data_path / out_file_name).resolve()

    print(f"Storing feature matrix with {len(in_df)} records and {len(in_df.columns)} columns in output file...")
    in_df.to_csv(out_file, index=False, float_format="%.4f")
    # Parquet alternative (disabled):
    # in_df.to_parquet(out_file.with_suffix('.parquet'), engine='auto', compression=None, index=None, partition_cols=None)

    elapsed = datetime.now() - start_dt
    print(f"Finished feature generation in {int(elapsed.total_seconds())} seconds")
    print(f"Output file location: {out_file}")
# Run the click command only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()