2020-02-23 20:45:50 +01:00
|
|
|
import sys
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from datetime import datetime, timezone, timedelta
|
|
|
|
|
from typing import Union
|
|
|
|
|
import json
|
|
|
|
|
import pickle
|
2021-09-09 20:48:11 +02:00
|
|
|
import click
|
2020-02-23 20:45:50 +01:00
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
2021-09-09 20:48:11 +02:00
|
|
|
from service.App import *
|
2020-12-13 19:30:06 +01:00
|
|
|
from common.feature_generation import *
|
|
|
|
|
from common.label_generation import *
|
2020-02-23 20:45:50 +01:00
|
|
|
|
|
|
|
|
#
|
|
|
|
|
# Parameters
|
|
|
|
|
#
|
|
|
|
|
class P:
    """Script parameters.

    Edit these constants in source to control what the script generates;
    there are no command-line switches for them.
    """

    # Feature sets to generate. Only "kline" is enabled by default;
    # "futur" (and "depth") can be added to this list to enable the
    # corresponding branches in main().
    feature_sets = ["kline", ]  # "futur"

    # Upper bound on the number of rows read from the input csv
    # (protects against accidentally loading an unexpectedly huge file).
    in_nrows = 10_000_000
|
2020-02-23 20:45:50 +01:00
|
|
|
|
|
|
|
|
|
2021-09-09 20:48:11 +02:00
|
|
|
@click.command()
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
def main(config_file):
    """Generate derived features and labels for one symbol and store the result.

    Reads `{symbol}-{freq}.csv` from the configured data folder, generates the
    feature sets selected in `P.feature_sets` plus the label columns, and writes
    the combined feature matrix to `{symbol}-{freq}-features.csv` in the same
    folder.

    NOTE(review): the generate_* helpers appear to add their columns to `in_df`
    in place and return only the list of new column names (the returned lists
    are used for reporting counts, while `in_df` is what gets stored) — confirm
    against common.feature_generation / common.label_generation.
    """
    load_config(config_file)

    freq = "1m"
    symbol = App.config["symbol"]
    data_path = Path(App.config["data_folder"])
    if not data_path.is_dir():
        print(f"Data folder does not exist: {data_path}")
        return

    start_dt = datetime.now()

    #
    # Load historic data
    #
    in_path = (data_path / f"{symbol}-{freq}.csv").resolve()

    print(f"Loading data from source file {str(in_path)}...")
    # nrows caps memory usage for very large input files
    in_df = pd.read_csv(in_path, parse_dates=['timestamp'], nrows=P.in_nrows)
    print(f"Finished loading {len(in_df)} records with {len(in_df.columns)} columns.")

    #
    # Generate derived features
    #
    if "kline" in P.feature_sets:
        print(f"Generating klines features...")
        k_features = generate_features(in_df)
        print(f"Finished generating {len(k_features)} kline features")
    else:
        k_features = []

    if "futur" in P.feature_sets:
        print(f"Generating futur features...")
        f_features = generate_features_futur(in_df)
        print(f"Finished generating {len(f_features)} futur features")
    else:
        f_features = []

    if "depth" in P.feature_sets:
        print(f"Generating depth features...")
        d_features = generate_features_depth(in_df)
        # BUG FIX: this message previously reported len(f_features)
        # (copy-paste from the futur branch) instead of the depth count.
        print(f"Finished generating {len(d_features)} depth features")
    else:
        d_features = []

    #
    # Generate labels (always the same, currently based on kline data which must be therefore present)
    #
    print(f"Generating labels...")
    labels = []

    # Binary labels whether max has exceeded a threshold or not
    labels += generate_labels_thresholds(in_df, horizon=180)

    # Numeric label which is ratio between areas over and under the latest price
    labels += add_area_ratio(in_df, is_future=True, column_name="close", windows=[60, 120, 180, 300], suffix="_area_future")

    print(f"Finished generating {len(labels)} labels")

    #
    # Store feature matrix in output file
    #
    out_file_name = f"{symbol}-{freq}-features.csv"
    out_file = (data_path / out_file_name).resolve()

    print(f"Storing feature matrix with {len(in_df)} records and {len(in_df.columns)} columns in output file...")
    # float_format keeps the csv compact; 4 decimals is the precision the
    # downstream consumers of this file were written against.
    in_df.to_csv(out_file, index=False, float_format="%.4f")
    #in_df.to_parquet(out_path.with_suffix('.parquet'), engine='auto', compression=None, index=None, partition_cols=None)

    elapsed = datetime.now() - start_dt
    print(f"Finished feature generation in {int(elapsed.total_seconds())} seconds")
    print(f"Output file location: {out_file}")
|
2020-02-23 20:45:50 +01:00
|
|
|
|
2021-09-09 08:20:29 +02:00
|
|
|
|
2020-02-23 20:45:50 +01:00
|
|
|
# Script entry point: run the click command when executed directly.
if __name__ == '__main__':
    main()
|