intelligent-trading-bot/common/gen_labels_highlow.py
2026-04-21 12:21:32 +02:00

162 lines
7.6 KiB
Python

import os
from datetime import datetime, timezone, timedelta
from typing import Union
import json
import numpy as np
import pandas as pd
from common.utils import *
from common.gen_features import *
from common.gen_features_rolling_agg import *
"""
Label generation. Labels are features which are used for training.
In forecasting, they are typically computed from future values as
opposed to normal features computed from past values.
"""
def generate_labels_highlow(df, horizon):
    """
    Generate high/low labels for one horizon using future data.

    Labels are derived like ordinary features but look forward in time, so this
    function is meant to be used before training to produce the true labels.
    All columns are added to ``df`` in place.

    Steps:
    1. ``add_future_aggregations`` is asked for ``high_max_<horizon>`` ("high"
       aggregated with np.max) and ``low_min_<horizon>`` ("low" aggregated with
       np.min), both relative to "close" with factor 100.0
       (presumably percent deviation from close - confirm in the helper).
    2. ``add_threshold_feature`` derives binary labels from these two columns:
       - high_10 ... high_30: thresholds 1.0, 1.5, 2.0, 2.5, 3.0
       - high_01 ... high_05: thresholds 0.1, 0.2, 0.3, 0.4, 0.5
       - low_01 ... low_05: thresholds -0.1, -0.2, -0.3, -0.4, -0.5
       - low_10 ... low_30: thresholds -1.0, -1.5, -2.0, -2.5, -3.0
       Intended convention (implemented by the helper, not here): for large
       thresholds the label means "at least one value crosses the threshold",
       for small thresholds it means "all values stay within the threshold".
    3. The two aggregate columns are then overwritten: high is clipped to be
       non-negative, low is clipped to be non-positive and its sign is flipped.
       From them ``high_to_low_<horizon>`` = 2 * high / (high + low) - 1 is
       added, a ratio in [-1, +1] (+1 means low is 0, -1 means high is 0; when
       both are 0 the division is 0/0, which pandas evaluates to NaN).

    Note that ``high_to_low_<horizon>`` is NOT part of the returned list.

    :param df: data frame with "high", "low" and "close" columns; modified in place
    :param horizon: size of the future window, also used as column name suffix
    :return: list of column names reported by the two helper functions
    """
    window_list = [horizon]
    high_col = f"high_max_{horizon}"  # Example: high_max_180
    low_col = f"low_min_{horizon}"  # Example: low_min_180

    out_labels = []

    # Future maximum of high relative to close (normally positive but can be negative)
    out_labels.extend(
        add_future_aggregations(
            df, "high", np.max, windows=window_list, suffix='_max', rel_column_name="close", rel_factor=100.0
        )
    )
    # Large thresholds first (max high is over the level), then small ones (max high stays under the level)
    high_specs = (
        ([1.0, 1.5, 2.0, 2.5, 3.0], ["high_10", "high_15", "high_20", "high_25", "high_30"]),
        ([0.1, 0.2, 0.3, 0.4, 0.5], ["high_01", "high_02", "high_03", "high_04", "high_05"]),
    )
    for levels, level_names in high_specs:
        out_labels.extend(add_threshold_feature(df, high_col, thresholds=levels, out_names=level_names))

    # Future minimum of low relative to close (normally negative but can be positive)
    out_labels.extend(
        add_future_aggregations(
            df, "low", np.min, windows=window_list, suffix='_min', rel_column_name="close", rel_factor=100.0
        )
    )
    # Small thresholds first (min low stays over the negative level), then large ones (min low is under it)
    low_specs = (
        ([-0.1, -0.2, -0.3, -0.4, -0.5], ["low_01", "low_02", "low_03", "low_04", "low_05"]),
        ([-1.0, -1.5, -2.0, -2.5, -3.0], ["low_10", "low_15", "low_20", "low_25", "low_30"]),
    )
    for levels, level_names in low_specs:
        out_labels.extend(add_threshold_feature(df, low_col, thresholds=levels, out_names=level_names))

    #
    # Ratio high_to_low_<horizon>
    #

    # Negative highs become 0
    upside = df[high_col].clip(lower=0)
    df[high_col] = upside

    # Positive lows become 0, then the sign is flipped so that the magnitude is stored
    downside = df[low_col].clip(upper=0) * -1
    df[low_col] = downside

    # Share of the upside in the total range is in [0,1]; rescale it to [-1,+1]
    total_range = upside + downside
    upside_share = upside / total_range
    df[f"high_to_low_{horizon}"] = (upside_share * 2) - 1

    return out_labels
def generate_labels_highlow2(df, config: dict):
    """
    Generate multiple increase/decrease labels which are typically used for training.

    For every threshold one boolean label column is produced by
    ``first_cross_labels`` (which writes the column into ``df``). The sign of
    the thresholds is normalized from the ``function`` parameter: positive for
    'high', negative for 'low'. The opposite-direction tolerance level passed
    along with each threshold is ``-threshold * tolerance`` rounded to 6 digits.

    :param df: data frame handed over to ``first_cross_labels``
    :param config: dictionary with the following keys:
        - columns: sequence of three column names in the order close, high, low
        - function: either 'high' or 'low'
        - tolerance: fraction of the threshold allowed in the opposite direction
        - thresholds: one number or a list/tuple of numbers (growth/drop in percent)
        - horizon: passed through to ``first_cross_labels``
        - names: one output column name per threshold; a single string is
          accepted as a one-element list
    :return: tuple ``(df, labels)`` where labels is the list of generated column names
    :raises ValueError: if a required parameter is missing or malformed, if
        ``function`` is not 'high' or 'low', or if the number of names differs
        from the number of thresholds
    """
    column_names = config.get('columns')
    if column_names is None or isinstance(column_names, str) or len(column_names) < 3:
        raise ValueError(
            f"'highlow2' Label generator: 'columns' must list three column names (close, high, low). "
            f"Found: {column_names}"
        )
    close_column = column_names[0]
    high_column = column_names[1]
    low_column = column_names[2]

    function = config.get('function')
    if not isinstance(function, str):
        raise ValueError(f"Wrong type of the 'function' parameter: {type(function)}")
    if function not in ['high', 'low']:
        raise ValueError(f"Unknown function name {function}. Only 'high' or 'low' are possible")

    tolerance = config.get('tolerance')  # Fraction of the level/threshold
    if tolerance is None:
        raise ValueError("'highlow2' Label generator: the 'tolerance' parameter is required.")

    thresholds = config.get('thresholds')  # List of thresholds which are growth/drop in percent
    if thresholds is None:
        raise ValueError("'highlow2' Label generator: the 'thresholds' parameter is required.")
    if isinstance(thresholds, tuple):
        thresholds = list(thresholds)
    elif not isinstance(thresholds, list):
        thresholds = [thresholds]

    # The sign of the thresholds and the order of the price columns follow from the direction
    if function == 'high':
        thresholds = [abs(t) for t in thresholds]
        price_columns = [high_column, low_column]
    else:  # function == 'low' (validated above)
        thresholds = [-abs(t) for t in thresholds]
        price_columns = [low_column, high_column]

    tolerances = [round(-t * tolerance, 6) for t in thresholds]  # Tolerance has the opposite sign

    horizon = config.get('horizon')  # Length of history to be analyzed

    names = config.get('names')  # For example, ['first_high_10', 'first_high_15'] for two tolerances
    if names is None:
        raise ValueError("'highlow2' Label generator: the 'names' parameter is required.")
    if isinstance(names, str):
        # A bare string is one name. Without this, len() would count its characters
        # and the string could be silently split into one-letter column names.
        names = [names]
    if len(names) != len(thresholds):
        raise ValueError("'highlow2' Label generator: for each threshold value one name has to be provided.")

    labels = []
    for threshold, opposite_level, name in zip(thresholds, tolerances, names):
        first_cross_labels(df, horizon, [threshold, opposite_level], close_column, price_columns, name)
        labels.append(name)

    print(f"Highlow2 labels computed: {labels}")

    return df, labels
def first_cross_labels(df, horizon, thresholds, close_column, price_columns, out_column):
    """
    Produce one boolean column which is true if the price crosses the first threshold
    but does not cross the second threshold in the opposite direction before that.

    For example, if columns are (high, low) and thresholds are [5.0, -1.0]
    then the result is true if price increases by 5% but never decreases lower than 1% during this growth.
    If columns are (low, high) and thresholds are [-5.0, 1.0]
    the result is true if price decreases by 5% but never increases higher than 1% before that.

    The crossing locations are obtained from ``first_location_of_crossing_threshold``
    (presumably a forward offset per row, NaN when there is no crossing within
    the horizon - confirm against the helper). The label is:
    - False if the first threshold is never crossed (location is NaN)
    - True if the first threshold is crossed and the second one never is
    - otherwise True only if the first crossing is not later than the second (ties count as True)

    Two temporary columns, "first_idx_column" and "second_idx_column", are written
    to ``df`` and dropped again before returning.

    :param df: data frame; ``out_column`` is added in place
    :param horizon: passed through to ``first_location_of_crossing_threshold``
    :param thresholds: pair [main threshold, opposite-direction threshold]
    :param close_column: name of the close column passed to the helper
    :param price_columns: pair of column names matching ``thresholds`` position-wise
    :param out_column: name of the boolean column to create
    :return: ``out_column``
    """
    # Location of the first crossing of the main threshold
    df["first_idx_column"] = first_location_of_crossing_threshold(
        df, horizon, thresholds[0], close_column, price_columns[0]
    )
    # Location of the first crossing of the opposite-direction threshold
    df["second_idx_column"] = first_location_of_crossing_threshold(
        df, horizon, thresholds[1], close_column, price_columns[1]
    )

    first_idx = df["first_idx_column"]
    second_idx = df["second_idx_column"]

    # Vectorized form of the per-row rule: NaN in first -> False, NaN in second -> True,
    # otherwise first <= second. Comparisons with NaN are False, so the masks cover those cases.
    df[out_column] = first_idx.notna() & (second_idx.isna() | (first_idx <= second_idx))

    # Indexes are not needed anymore
    df.drop(columns=['first_idx_column', 'second_idx_column'], inplace=True)

    return out_column
if __name__ == "__main__":
    # This module only provides label generators; nothing is executed when it is run as a script.
    pass