2020-02-23 20:45:50 +01:00
import os
from datetime import datetime , timezone , timedelta
from typing import Union
import json
import numpy as np
import pandas as pd
2020-11-21 14:16:28 +01:00
from common . utils import *
2020-03-25 20:38:40 +01:00
2020-02-23 20:45:50 +01:00
"""
Feature / label generation .
These features are computed using explict transformations .
2020-03-25 20:38:40 +01:00
( True ) labels are features computed from future data but stored as properties of the current row ( in contrast to normal features which are computed from past data ) .
( Currently ) feature / label generation is not based on ( explicit ) models - all parameters are hard - coded .
Also , no parameter training is performed .
2020-02-23 20:45:50 +01:00
"""
2021-10-23 21:51:31 +02:00
2022-03-13 17:34:02 +01:00
def _add_relative_mean_features(df, column, base_window, windows, features, to_drop):
    """Add a base rolling mean for `column` plus per-window means relative to it.

    Appends the base column name(s) to `to_drop` (they are intermediate) and the
    generated feature column names to `features`, both in place.
    """
    to_drop += add_past_aggregations(df, column, np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, column, np.nanmean, windows, '', to_drop[-1], 100.0)


def generate_features(df, use_differences, base_window, windows, area_windows):
    """
    Generate derived features by adding them as new columns to the data frame.

    It is important that the same parameters are used for both training and prediction.

    Most features compute a rolling aggregation. However, instead of absolute values,
    the difference of this rolling aggregation to the (longer) base rolling aggregation
    is computed. The window sizes are used for encoding feature/column names and might
    look like 'close_120' for average close price for the last 120 minutes (relative to
    the average base price). The column names are needed when preparing data for
    training or prediction. The easiest way to get them is to return from this function
    and copy to the corresponding config attribute.

    :param df: input data frame, modified in place (feature columns added, intermediate columns dropped)
    :param use_differences: if True, replace close/volume/trades by their first differences first
    :param base_window: window size of the long base aggregation
    :param windows: window sizes of the short aggregations (feature windows)
    :param area_windows: window sizes for the area-ratio features
    :return: list of generated feature column names
    """
    features = []
    to_drop = []

    if use_differences:
        df['close'] = to_diff(df['close'])
        df['volume'] = to_diff(df['volume'])
        df['trades'] = to_diff(df['trades'])

    # close mean (volume-weighted, unlike the plain means below)
    weight_column_name = 'volume'  # None: no weighting; 'volume': volume average
    to_drop += add_past_weighted_aggregations(df, 'close', weight_column_name, np.nanmean, base_window, suffix='')  # Base column
    features += add_past_weighted_aggregations(df, 'close', weight_column_name, np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['close_1', 'close_2', 'close_5', 'close_20', 'close_60', 'close_180']

    # close std
    to_drop += add_past_aggregations(df, 'close', np.nanstd, base_window)  # Base column
    features += add_past_aggregations(df, 'close', np.nanstd, windows[1:], '_std', to_drop[-1], 100.0)  # window 1 excluded
    # ['close_std_2', 'close_std_5', 'close_std_20', 'close_std_60', 'close_std_180']

    # Derived intermediate columns used by the relative-mean features below
    df['span'] = df['high'] - df['low']  # Span: high-low difference
    to_drop.append('span')
    df['tb_base'] = df['tb_base_av'] / df['volume']  # varies around 0.5 in base currency
    to_drop.append('tb_base')
    df['tb_quote'] = df['tb_quote_av'] / df['quote_av']  # varies around 0.5 in quote currency
    to_drop.append('tb_quote')

    # Same relative-mean pattern for each of these columns, e.g. for 'volume':
    # ['volume_1', 'volume_2', 'volume_5', 'volume_20', 'volume_60', 'volume_180']
    for column in ['volume', 'span', 'trades', 'tb_base', 'tb_quote']:
        _add_relative_mean_features(df, column, base_window, windows, features, to_drop)

    # Area over and under latest close price
    features += add_area_ratio(df, is_future=False, column_name="close", windows=area_windows, suffix="_area")

    # Linear trend
    features += add_linear_trends(df, is_future=False, column_name="close", windows=windows[1:], suffix="_trend")  # window 1 excluded

    df.drop(columns=to_drop, inplace=True)
    return features
2021-10-23 21:51:31 +02:00
2020-05-09 20:14:22 +02:00
def generate_features_futur(df, use_differences=False, base_window=360, windows=None, area_windows=None):
    """
    Generate derived features for futures.

    Same relative rolling-aggregation scheme as `generate_features`, applied to the
    futures columns (prefixed with 'f_'). The window parameters default to the values
    previously hard-coded in this function, so existing callers are unaffected.

    :param df: input data frame, modified in place
    :param use_differences: if True, replace f_close/f_volume/f_trades by first differences first
    :param base_window: window size of the long base aggregation (default 360)
    :param windows: window sizes of the short aggregations (default [1, 2, 5, 20, 60, 180])
    :param area_windows: window sizes for the area-ratio features (default [20, 60, 120, 180])
    :return: list of generated feature column names
    """
    # Parameters of moving averages (defaults preserve the original hard-coded values)
    windows = [1, 2, 5, 20, 60, 180] if windows is None else windows
    area_windows = [20, 60, 120, 180] if area_windows is None else area_windows

    features = []
    to_drop = []

    if use_differences:
        df['f_close'] = to_diff(df['f_close'])
        df['f_volume'] = to_diff(df['f_volume'])
        df['f_trades'] = to_diff(df['f_trades'])

    # close mean (volume-weighted)
    weight_column_name = 'f_volume'  # None: no weighting; 'f_volume': volume average
    to_drop += add_past_weighted_aggregations(df, 'f_close', weight_column_name, np.nanmean, base_window, suffix='')  # Base column
    features += add_past_weighted_aggregations(df, 'f_close', weight_column_name, np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_close_1', 'f_close_2', 'f_close_5', ...]

    # close std
    to_drop += add_past_aggregations(df, 'f_close', np.nanstd, base_window)  # Base column
    features += add_past_aggregations(df, 'f_close', np.nanstd, windows[1:], '_std', to_drop[-1], 100.0)  # window 1 excluded
    # ['f_close_std_2', 'f_close_std_5', ...]

    # volume mean
    to_drop += add_past_aggregations(df, 'f_volume', np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, 'f_volume', np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_volume_1', 'f_volume_2', 'f_volume_5', ...]

    # Span: high-low difference
    df['f_span'] = df['f_high'] - df['f_low']
    to_drop.append('f_span')
    to_drop += add_past_aggregations(df, 'f_span', np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, 'f_span', np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_span_1', 'f_span_2', 'f_span_5', ...]

    # Number of trades
    to_drop += add_past_aggregations(df, 'f_trades', np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, 'f_trades', np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_trades_1', 'f_trades_2', 'f_trades_5', ...]

    # NOTE: the tb_base (f_tb_base_av / f_volume) and tb_quote (f_tb_quote_av / f_quote_av)
    # ratio features are intentionally disabled for futures data.

    # Area over and under latest close price
    # NOTE(review): is_future=False matches the original code here — presumably these are
    # past-looking features even for futures data; confirm against `generate_features`.
    features += add_area_ratio(df, is_future=False, column_name="f_close", windows=area_windows, suffix="_area")

    # Linear trend
    features += add_linear_trends(df, is_future=False, column_name="f_close", windows=windows[1:], suffix="_trend")  # window 1 excluded

    df.drop(columns=to_drop, inplace=True)
    return features
2021-10-23 21:51:31 +02:00
2020-05-09 20:14:22 +02:00
def generate_features_depth(df, use_differences=False, windows=None, base_window=30):
    """
    Generate derived features from depth data.

    For each original depth column a rolling mean over `base_window` rows is computed
    as a base column, and then, for each window in `windows`, a rolling mean relative
    to that base.

    Original features:
    - gap, price,
    - bids_1, asks_1,
    - bids_2, asks_2,
    - bids_5, asks_5,
    - bids_10, asks_10,
    - bids_20, asks_20

    Generated features (33):
    gap_2, gap_5, gap_10,
    bids_1_2, bids_1_5, bids_1_10, asks_1_2, asks_1_5, asks_1_10,
    ...
    bids_20_2, bids_20_5, bids_20_10, asks_20_2, asks_20_5, asks_20_10

    NOTE: `use_differences` is accepted for interface symmetry with the other
    generators but is currently unused.

    :param df: input data frame, modified in place
    :param windows: window sizes of the short aggregations (default [2, 5, 10])
    :param base_window: window size of the long base aggregation (default 30)
    :return: list of generated feature column names
    """
    # Parameters of moving averages (defaults preserve the original hard-coded values)
    windows = [2, 5, 10] if windows is None else windows

    features = []
    to_drop = []

    # The same relative-mean pattern applies to every depth column; order matters for
    # the resulting feature-name list: gap first, then bids/asks per depth level.
    columns = ['gap'] + [f'{side}_{level}' for level in (1, 2, 5, 10, 20) for side in ('bids', 'asks')]
    for column in columns:
        to_drop += add_past_aggregations(df, column, np.nanmean, base_window, suffix='')  # Base column
        features += add_past_aggregations(df, column, np.nanmean, windows, '', to_drop[-1], 100.0)
        # e.g. for 'gap': ['gap_2', 'gap_5', 'gap_10']

    df.drop(columns=to_drop, inplace=True)
    return features
2021-10-23 21:51:31 +02:00
2020-05-02 15:12:07 +02:00
def depth_to_df(depth: list):
    """
    Convert a list of market-depth records into a data frame of depth features.

    Input is a list of json objects, each representing current market depth with a
    list of bids and asks. The method computes features from the market depth and
    returns a data frame with the corresponding columns.

    NOTE:
    - Important: "timestamp" is the real time of the depth data which corresponds to
      "close_time" in klines. "timestamp" in klines is 1m before current time.
      It has to be taken into account when matching/joining records, e.g., by shifting
      columns (if we match on "timestamp" then the result will be wrong).
    - The data frame index is continuous and may contain gaps; it spans from the first
      record to the last record.

    # TODO Questions:
    # !!! - what is the zone of our timestamps - ensure that it is the same as the Binance server
    # - is it possible to create a data frame with a column containing a json object or string?
    # - how to match json/string values with the data frame index?

    :param depth: list of depth records; an empty list yields an empty data frame
    :return: data frame indexed by a continuous 1-minute DatetimeIndex
    """
    bin_size = 1.0  # In USDT
    windows = [1, 2, 5, 10, 20]  # No of price bins for aggregate/smoothing

    # Robustness: previously an empty input raised IndexError on depth[0]
    if not depth:
        return pd.DataFrame()

    #
    # Generate a table with one feature record per depth entry
    #
    table = [depth_to_features(entry, windows, bin_size) for entry in depth]

    #
    # Convert json table to data frame
    #
    df = pd.DataFrame.from_dict(table)  # Alternatively, from_records() or json_normalize()

    # Timestamp is an index
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit='ms')
    df = df.set_index("timestamp")
    df = df.sort_index()

    #
    # Find start and end dates
    #
    # NOTE: timestamp is request time (in our implementation) and hence it is the end of the
    # 1m interval while kline id is the start of the 1m interval.
    # It is important for matching, so maybe align this difference here by shifting data.
    start_ts = depth[0].get("timestamp")
    # start_ts -= 60_000  # To ensure that we do not lose any data
    end_ts = depth[-1].get("timestamp")
    # end_ts += 60_000  # To ensure that we do not lose any data

    #
    # Create index for this interval of timestamps
    #
    # NOTE: Add utc=True to get a tz-aware object (tz="UTC") instead of a tz-naive one,
    # so it seems that no tz means UTC here.
    start = pd.to_datetime(start_ts, unit='ms')
    end = pd.to_datetime(end_ts, unit='ms')
    # NOTE: if tz is not specified then the index is tz-naive.
    # "min" is the minutely frequency alias (the older "T" alias is deprecated).
    index = pd.date_range(start, end, freq="min")
    df_out = pd.DataFrame(index=index)

    #
    # Join data with this empty index (to ensure a continuous range of timestamps)
    #
    df_out = df_out.join(df)
    return df_out
2021-10-23 21:51:31 +02:00
2020-05-02 15:12:07 +02:00
def depth_to_features(entry: list, windows: list, bin_size: float):
    """Convert one record of market depth to a dict of features.

    The record contains the entry timestamp, the bid/ask gap (clamped at zero),
    the mid price, and volume-density features per window.
    """
    bids = entry.get("bids")
    asks = entry.get("asks")

    best_bid = bids[0][0]
    best_ask = asks[0][0]

    # Gap feature: best ask minus best bid, never negative
    gap = max(best_ask - best_bid, 0)

    # Price feature: midpoint between best bid and best ask
    price = best_bid + (gap / 2)

    # Densities for bids and asks (volume per price unit)
    densities = mean_volumes(depth=entry, windows=windows, bin_size=bin_size)

    record = {"timestamp": entry.get("timestamp"), "gap": gap, "price": price}
    record.update(densities)
    return record
2020-02-23 20:45:50 +01:00
if __name__ == " __main__ " :
pass