2020-02-23 20:45:50 +01:00
import os
from datetime import datetime , timezone , timedelta
from typing import Union
import json
import numpy as np
import pandas as pd
2020-11-21 14:16:28 +01:00
from common . utils import *
2020-03-25 20:38:40 +01:00
2020-02-23 20:45:50 +01:00
"""
Feature / label generation .
These features are computed using explict transformations .
2020-03-25 20:38:40 +01:00
( True ) labels are features computed from future data but stored as properties of the current row ( in contrast to normal features which are computed from past data ) .
( Currently ) feature / label generation is not based on ( explicit ) models - all parameters are hard - coded .
Also , no parameter training is performed .
2020-02-23 20:45:50 +01:00
"""
2021-10-23 21:51:31 +02:00
2022-03-13 17:34:02 +01:00
def _add_relative_mean_features(df, column, base_window, windows, features, to_drop):
    """Add a base rolling mean for `column` plus per-window means relative to it.

    Appends the base column name(s) to `to_drop` (they are intermediate) and the
    generated feature column names to `features`, both in place.
    """
    to_drop += add_past_aggregations(df, column, np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, column, np.nanmean, windows, '', to_drop[-1], 100.0)


def generate_features(df, use_differences, base_window, windows, area_windows):
    """
    Generate derived features by adding them as new columns to the data frame.

    It is important that the same parameters are used for both training and prediction.

    Most features compute a rolling aggregation. However, instead of absolute values,
    the difference of this rolling aggregation to the (longer) base rolling aggregation
    is computed. The window sizes are used for encoding feature/column names and might
    look like 'close_120' for average close price for the last 120 minutes (relative to
    the average base price). The column names are needed when preparing data for
    training or prediction. The easiest way to get them is to return from this function
    and copy to the corresponding config attribute.

    :param df: input data frame, modified in place (feature columns added, intermediate columns dropped)
    :param use_differences: if True, replace close/volume/trades by their first differences first
    :param base_window: window size of the long base aggregation
    :param windows: window sizes of the short aggregations (feature windows)
    :param area_windows: window sizes for the area-ratio features
    :return: list of generated feature column names
    """
    features = []
    to_drop = []

    if use_differences:
        df['close'] = to_diff(df['close'])
        df['volume'] = to_diff(df['volume'])
        df['trades'] = to_diff(df['trades'])

    # close mean (volume-weighted, unlike the plain means below)
    weight_column_name = 'volume'  # None: no weighting; 'volume': volume average
    to_drop += add_past_weighted_aggregations(df, 'close', weight_column_name, np.nanmean, base_window, suffix='')  # Base column
    features += add_past_weighted_aggregations(df, 'close', weight_column_name, np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['close_1', 'close_2', 'close_5', 'close_20', 'close_60', 'close_180']

    # close std
    to_drop += add_past_aggregations(df, 'close', np.nanstd, base_window)  # Base column
    features += add_past_aggregations(df, 'close', np.nanstd, windows[1:], '_std', to_drop[-1], 100.0)  # window 1 excluded
    # ['close_std_2', 'close_std_5', 'close_std_20', 'close_std_60', 'close_std_180']

    # Derived intermediate columns used by the relative-mean features below
    df['span'] = df['high'] - df['low']  # Span: high-low difference
    to_drop.append('span')
    df['tb_base'] = df['tb_base_av'] / df['volume']  # varies around 0.5 in base currency
    to_drop.append('tb_base')
    df['tb_quote'] = df['tb_quote_av'] / df['quote_av']  # varies around 0.5 in quote currency
    to_drop.append('tb_quote')

    # Same relative-mean pattern for each of these columns, e.g. for 'volume':
    # ['volume_1', 'volume_2', 'volume_5', 'volume_20', 'volume_60', 'volume_180']
    for column in ['volume', 'span', 'trades', 'tb_base', 'tb_quote']:
        _add_relative_mean_features(df, column, base_window, windows, features, to_drop)

    # Area over and under latest close price
    features += add_area_ratio(df, is_future=False, column_name="close", windows=area_windows, suffix="_area")

    # Linear trend
    features += add_linear_trends(df, is_future=False, column_name="close", windows=windows[1:], suffix="_trend")  # window 1 excluded

    df.drop(columns=to_drop, inplace=True)
    return features
2021-10-23 21:51:31 +02:00
2020-05-09 20:14:22 +02:00
def generate_features_futur(df, use_differences=False, base_window=360, windows=None, area_windows=None):
    """
    Generate derived features for futures.

    Same relative rolling-aggregation scheme as `generate_features`, applied to the
    futures columns (prefixed with 'f_'). The window parameters default to the values
    previously hard-coded in this function, so existing callers are unaffected.

    :param df: input data frame, modified in place
    :param use_differences: if True, replace f_close/f_volume/f_trades by first differences first
    :param base_window: window size of the long base aggregation (default 360)
    :param windows: window sizes of the short aggregations (default [1, 2, 5, 20, 60, 180])
    :param area_windows: window sizes for the area-ratio features (default [20, 60, 120, 180])
    :return: list of generated feature column names
    """
    # Parameters of moving averages (defaults preserve the original hard-coded values)
    windows = [1, 2, 5, 20, 60, 180] if windows is None else windows
    area_windows = [20, 60, 120, 180] if area_windows is None else area_windows

    features = []
    to_drop = []

    if use_differences:
        df['f_close'] = to_diff(df['f_close'])
        df['f_volume'] = to_diff(df['f_volume'])
        df['f_trades'] = to_diff(df['f_trades'])

    # close mean (volume-weighted)
    weight_column_name = 'f_volume'  # None: no weighting; 'f_volume': volume average
    to_drop += add_past_weighted_aggregations(df, 'f_close', weight_column_name, np.nanmean, base_window, suffix='')  # Base column
    features += add_past_weighted_aggregations(df, 'f_close', weight_column_name, np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_close_1', 'f_close_2', 'f_close_5', ...]

    # close std
    to_drop += add_past_aggregations(df, 'f_close', np.nanstd, base_window)  # Base column
    features += add_past_aggregations(df, 'f_close', np.nanstd, windows[1:], '_std', to_drop[-1], 100.0)  # window 1 excluded
    # ['f_close_std_2', 'f_close_std_5', ...]

    # volume mean
    to_drop += add_past_aggregations(df, 'f_volume', np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, 'f_volume', np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_volume_1', 'f_volume_2', 'f_volume_5', ...]

    # Span: high-low difference
    df['f_span'] = df['f_high'] - df['f_low']
    to_drop.append('f_span')
    to_drop += add_past_aggregations(df, 'f_span', np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, 'f_span', np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_span_1', 'f_span_2', 'f_span_5', ...]

    # Number of trades
    to_drop += add_past_aggregations(df, 'f_trades', np.nanmean, base_window, suffix='')  # Base column
    features += add_past_aggregations(df, 'f_trades', np.nanmean, windows, '', to_drop[-1], 100.0)
    # ['f_trades_1', 'f_trades_2', 'f_trades_5', ...]

    # NOTE: the tb_base (f_tb_base_av / f_volume) and tb_quote (f_tb_quote_av / f_quote_av)
    # ratio features are intentionally disabled for futures data.

    # Area over and under latest close price
    # NOTE(review): is_future=False matches the original code here — presumably these are
    # past-looking features even for futures data; confirm against `generate_features`.
    features += add_area_ratio(df, is_future=False, column_name="f_close", windows=area_windows, suffix="_area")

    # Linear trend
    features += add_linear_trends(df, is_future=False, column_name="f_close", windows=windows[1:], suffix="_trend")  # window 1 excluded

    df.drop(columns=to_drop, inplace=True)
    return features
2021-10-23 21:51:31 +02:00
2020-05-09 20:14:22 +02:00
def generate_features_depth(df, use_differences=False, windows=None, base_window=30):
    """
    Generate derived features from depth data.

    For each original depth column a rolling mean over `base_window` rows is computed
    as a base column, and then, for each window in `windows`, a rolling mean relative
    to that base.

    Original features:
    - gap, price,
    - bids_1, asks_1,
    - bids_2, asks_2,
    - bids_5, asks_5,
    - bids_10, asks_10,
    - bids_20, asks_20

    Generated features (33):
    gap_2, gap_5, gap_10,
    bids_1_2, bids_1_5, bids_1_10, asks_1_2, asks_1_5, asks_1_10,
    ...
    bids_20_2, bids_20_5, bids_20_10, asks_20_2, asks_20_5, asks_20_10

    NOTE: `use_differences` is accepted for interface symmetry with the other
    generators but is currently unused.

    :param df: input data frame, modified in place
    :param windows: window sizes of the short aggregations (default [2, 5, 10])
    :param base_window: window size of the long base aggregation (default 30)
    :return: list of generated feature column names
    """
    # Parameters of moving averages (defaults preserve the original hard-coded values)
    windows = [2, 5, 10] if windows is None else windows

    features = []
    to_drop = []

    # The same relative-mean pattern applies to every depth column; order matters for
    # the resulting feature-name list: gap first, then bids/asks per depth level.
    columns = ['gap'] + [f'{side}_{level}' for level in (1, 2, 5, 10, 20) for side in ('bids', 'asks')]
    for column in columns:
        to_drop += add_past_aggregations(df, column, np.nanmean, base_window, suffix='')  # Base column
        features += add_past_aggregations(df, column, np.nanmean, windows, '', to_drop[-1], 100.0)
        # e.g. for 'gap': ['gap_2', 'gap_5', 'gap_10']

    df.drop(columns=to_drop, inplace=True)
    return features
2021-10-23 21:51:31 +02:00
2020-05-02 15:12:07 +02:00
def depth_to_df(depth: list):
    """
    Convert a list of market-depth records into a data frame of depth features.

    Input is a list of json objects, each representing current market depth with a
    list of bids and asks. The method computes features from the market depth and
    returns a data frame with the corresponding columns.

    NOTE:
    - Important: "timestamp" is the real time of the depth data which corresponds to
      "close_time" in klines. "timestamp" in klines is 1m before current time.
      It has to be taken into account when matching/joining records, e.g., by shifting
      columns (if we match on "timestamp" then the result will be wrong).
    - The data frame index is continuous and may contain gaps; it spans from the first
      record to the last record.

    # TODO Questions:
    # !!! - what is the zone of our timestamps - ensure that it is the same as the Binance server
    # - is it possible to create a data frame with a column containing a json object or string?
    # - how to match json/string values with the data frame index?

    :param depth: list of depth records; an empty list yields an empty data frame
    :return: data frame indexed by a continuous 1-minute DatetimeIndex
    """
    bin_size = 1.0  # In USDT
    windows = [1, 2, 5, 10, 20]  # No of price bins for aggregate/smoothing

    # Robustness: previously an empty input raised IndexError on depth[0]
    if not depth:
        return pd.DataFrame()

    #
    # Generate a table with one feature record per depth entry
    #
    table = [depth_to_features(entry, windows, bin_size) for entry in depth]

    #
    # Convert json table to data frame
    #
    df = pd.DataFrame.from_dict(table)  # Alternatively, from_records() or json_normalize()

    # Timestamp is an index
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit='ms')
    df = df.set_index("timestamp")
    df = df.sort_index()

    #
    # Find start and end dates
    #
    # NOTE: timestamp is request time (in our implementation) and hence it is the end of the
    # 1m interval while kline id is the start of the 1m interval.
    # It is important for matching, so maybe align this difference here by shifting data.
    start_ts = depth[0].get("timestamp")
    # start_ts -= 60_000  # To ensure that we do not lose any data
    end_ts = depth[-1].get("timestamp")
    # end_ts += 60_000  # To ensure that we do not lose any data

    #
    # Create index for this interval of timestamps
    #
    # NOTE: Add utc=True to get a tz-aware object (tz="UTC") instead of a tz-naive one,
    # so it seems that no tz means UTC here.
    start = pd.to_datetime(start_ts, unit='ms')
    end = pd.to_datetime(end_ts, unit='ms')
    # NOTE: if tz is not specified then the index is tz-naive.
    # "min" is the minutely frequency alias (the older "T" alias is deprecated).
    index = pd.date_range(start, end, freq="min")
    df_out = pd.DataFrame(index=index)

    #
    # Join data with this empty index (to ensure a continuous range of timestamps)
    #
    df_out = df_out.join(df)
    return df_out
2021-10-23 21:51:31 +02:00
2020-05-02 15:12:07 +02:00
def depth_to_features(entry: list, windows: list, bin_size: float):
    """Convert one record of market depth to a dict of features.

    The record contains the entry timestamp, the bid/ask gap (clamped at zero),
    the mid price, and volume-density features per window.
    """
    bids = entry.get("bids")
    asks = entry.get("asks")

    best_bid = bids[0][0]
    best_ask = asks[0][0]

    # Gap feature: best ask minus best bid, never negative
    gap = max(best_ask - best_bid, 0)

    # Price feature: midpoint between best bid and best ask
    price = best_bid + (gap / 2)

    # Densities for bids and asks (volume per price unit)
    densities = mean_volumes(depth=entry, windows=windows, bin_size=bin_size)

    record = {"timestamp": entry.get("timestamp"), "gap": gap, "price": price}
    record.update(densities)
    return record
2020-02-23 20:45:50 +01:00
if __name__ == " __main__ " :
pass