from pathlib import Path
from datetime import datetime  # used for timing the mining loop below
import itertools

import numpy as np
import pandas as pd
from scipy.stats import norm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

from service.App import App
from common.signal_generation import *


#
# Parameters
#
class P:
    feature_sets = ["kline", ]  # "futur"

    labels = App.config["labels"]
    features_kline = App.config["features_kline"]
    features_futur = App.config["features_futur"]
    features_depth = App.config["features_depth"]

    in_path_name = r"C:\DATA2\BITCOIN\GENERATED"
    # in_file_name = r"_BTCUSDT-1m-rolling-predictions-no-weights.csv"
    # in_file_name = r"_BTCUSDT-1m-rolling-predictions-with-weights.csv"
    in_file_name = r"BTCUSDT-1m-features-rolling.csv"
    in_nrows = 1_500_000

    out_path_name = r"_TEMP_FEATURES"
    out_file_name = r"_BTCUSDT-1m-signals"

    simulation_start = 263519  # Good start is 2019-06-01 - after it we have stable movement
    simulation_end = -0  # After 2020-11-01 there is sharp growth which we might want to exclude


def partition_column(sr: pd.Series):
    """Find conditions for partitioning the specified column and return binary columns with true
    values in the rows that belong to each partition.
    """
    # Find mean value
    f_mean = np.nanmean(sr)
    # Return two boolean series: below the mean and greater or equal to the mean
    return (sr < f_mean), (sr >= f_mean)
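
# Example usage of partition_column (a sketch; the function is not called elsewhere in this script):
#   below_mean, above_mean = partition_column(df['some_feature'])  # 'some_feature' is a hypothetical column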


def confidence_mining():
    """Enumerate combinations of feature partitions (itemsets) and rank the resulting row subsets
    by how well the prediction score separates the binary label within each subset
    (ROC AUC and average precision).
    """
    min_rank = 3
    max_rank = 3

    features = P.features_kline  # Input features to be mined
    feature_true = P.labels[0]  # Label (binary)
    feature_score = 'score'  # Prediction score

    #
    # Load data, partition columns
    #
    in_path = Path(P.in_path_name).joinpath(P.in_file_name)
    in_df = pd.read_csv(in_path, parse_dates=['timestamp'], nrows=P.in_nrows)

    in_df = generate_score(in_df, P.feature_sets)  # "score" column is added
    # TODO: Our predictions are scores, therefore generate a binary prediction (with some threshold, maybe derived from the average score)
    feature_pred = ''  # Prediction (binary)
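
    # A minimal sketch of the TODO above (an assumption, not the project's chosen rule):
    # use the mean score as the threshold and store the resulting binary prediction.
    # score_threshold = in_df[feature_score].mean()
    # in_df[feature_score + '_class'] = (in_df[feature_score] >= score_threshold).astype(int)
    # feature_pred = feature_score + '_class'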

    # in_df = in_df.iloc[100_000:200_000]

    # Load feature file (because it is not stored in rolling predictions)
    f_df = pd.read_csv(r"C:\DATA2\BITCOIN\GENERATED\BTCUSDT-1m-features.csv", parse_dates=['timestamp'],
                       nrows=500_000_000)
    f_df = f_df[['timestamp'] + P.features_kline]

    df = in_df.merge(f_df, left_on='timestamp', right_on='timestamp')
    pd.set_option('use_inf_as_na', True)
    df = df.dropna(subset=features + [feature_true, feature_score])
    N = len(df)
    print(f"Size of the data set {N}. Features: {len(features)}")

    auc_mean = roc_auc_score(df[feature_true], df[feature_score])
    print(f"Mean AUC: {auc_mean}")

    #
    # Partition individual features
    #
    feature_partitions = dict()  # Key is feature, value is a list of binary masks
    for f in features:
        sr = df[f]
        f_mean = np.nanmean(sr)
        f_std = np.nanstd(sr)

        # Breakpoints from a normal approximation (overridden by the empirical quantiles below)
        p1 = norm.ppf(0.3333333333, loc=f_mean, scale=f_std)
        p2 = norm.ppf(0.6666666666, loc=f_mean, scale=f_std)

        p1, p2 = list(np.quantile(sr, [0.3333333333, 0.6666666666]))

        # Two partitions
        #f_0 = (sr < f_mean)
        #f_1 = (sr >= f_mean)

        # Three partitions
        f_0 = (sr <= p1)
        f_1 = ((p1 < sr) & (sr < p2))
        f_2 = (sr >= p2)

        feature_partitions[f] = [f_0, f_1, f_2]
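
    # Optional sanity check (a sketch, not part of the original procedure): with empirical
    # tercile breakpoints each of the three masks should cover roughly one third of the rows.
    # for f, masks in feature_partitions.items():
    #     print(f, [int(m.sum()) for m in masks])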

    #
    # For each combination of variable values, find the subset (using AND)
    #
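    # How itemsets are read below: for a feature combination (a, b, c), the itemset (0, 2, 1)
    # selects the rows where a falls in its low tercile, b in its high tercile and c in its
    # middle tercile (the indexes 0/1/2 refer to the three partitions built above).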
    start_dt = datetime.now()

    scores = list()
    for r in range(min_rank, max_rank + 1):
        feature_combinations = list(itertools.combinations(features, r))
        print(f"Rank {r}. Feature combinations: {len(feature_combinations)}")
        itemsets = list(itertools.product([0, 1, 2], repeat=r))
        for fset_no, fset in enumerate(feature_combinations):
            if fset_no % 100 == 0:
                print(f"Feature set no: {fset_no}/{len(feature_combinations)}")

            # For this feature combination, try all combinations of partition values (itemsets)
            for iset in itemsets:
                subset = np.ones(N, dtype=bool)  # All 1s
                for i in range(r):
                    f = fset[i]
                    if iset[i] == 0:
                        np.logical_and(subset, feature_partitions[f][0], out=subset)
                    elif iset[i] == 1:
                        np.logical_and(subset, feature_partitions[f][1], out=subset)
                    elif iset[i] == 2:
                        np.logical_and(subset, feature_partitions[f][2], out=subset)
                    else:
                        raise ValueError()
                # Select data using the subset
                subset_df = df[subset]

                auc = 0.0
                try:
                    auc = roc_auc_score(subset_df[feature_true], subset_df[feature_score])
                except Exception as ve:  # e.g., the subset contains only one class
                    pass

                precision = 0.0
                try:
                    precision = average_precision_score(subset_df[feature_true], subset_df[feature_score])
                except Exception as e:
                    pass
                scores.append([auc, precision, fset, iset, len(subset_df)])

                # Compute metrics: false positives, false negatives
                # tn, fp, fn, tp = confusion_matrix(subset_df[], subset_df[]).ravel()

                # high precision -> low false positive rate, high recall -> low false negative rate
                # We can use: average precision, average recall etc.
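                # A hedged sketch of the confusion-matrix metrics above (assumes a binary
                # prediction column named by feature_pred has been added, e.g. via the
                # thresholding sketch near the top of this function):
                # tn, fp, fn, tp = confusion_matrix(subset_df[feature_true], subset_df[feature_pred]).ravel()
                # fpr = fp / (fp + tn) if (fp + tn) else 0.0  # false positive rate
                # fnr = fn / (fn + tp) if (fn + tp) else 0.0  # false negative rate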

            # For the subset, compute metrics of labels: false negatives, false positives

    elapsed = datetime.now() - start_dt
    print(f"Finished feature prediction in {str(elapsed)}.")

    # Store them and report intervals (combinations) with minimum false negatives/positives
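    # Each record in scores: [auc, average_precision, feature_combination, partition_itemset, subset_size]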
    scores = sorted(scores, key=lambda x: x[0], reverse=True)
    print(scores[:20])
    with open("confidence_mining_by_auc.txt", "w") as textfile:
        for element in scores[:100]:
            textfile.write(str(element) + "\n")

    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    print(scores[:20])
    with open("confidence_mining_by_precision.txt", "w") as textfile:
        for element in scores[:100]:
            textfile.write(str(element) + "\n")

    pass


if __name__ == '__main__':
    confidence_mining()