from pathlib import Path
from datetime import datetime  # used for timing the mining loop below
import itertools

import numpy as np
import pandas as pd
from scipy.stats import norm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

from service.App import App
from common.signal_generation import *


#
# Parameters
#
class P:
    feature_sets = ["kline", ]  # "futur"

    labels = App.config["labels"]
    features_kline = App.config["features_kline"]
    features_futur = App.config["features_futur"]
    features_depth = App.config["features_depth"]

    in_path_name = r"C:\DATA2\BITCOIN\GENERATED"
    # in_file_name = r"_BTCUSDT-1m-rolling-predictions-no-weights.csv"
    # in_file_name = r"_BTCUSDT-1m-rolling-predictions-with-weights.csv"
    in_file_name = r"BTCUSDT-1m-features-rolling.csv"
    in_nrows = 1_500_000

    out_path_name = r"_TEMP_FEATURES"
    out_file_name = r"_BTCUSDT-1m-signals"

    simulation_start = 263519  # Good start is 2019-06-01 - after it we have stable movement
    simulation_end = -0  # After 2020-11-01 there is sharp growth which we might want to exclude


def partition_column(sr: pd.Series):
    """Find conditions for partitioning the specified column and return binary columns with true
    values in the rows that belong to each partition.
    """
    # Find mean value
    f_mean = np.nanmean(sr)
    # Return two boolean series: below the mean and greater or equal to the mean
    return (sr < f_mean), (sr >= f_mean)
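
# Example usage of partition_column (a sketch; the function is not called elsewhere in this script):
#   below_mean, above_mean = partition_column(df['some_feature'])  # 'some_feature' is a hypothetical column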


def confidence_mining():
    """Enumerate combinations of feature partitions (itemsets) and rank the resulting row subsets
    by how well the prediction score separates the binary label within each subset
    (ROC AUC and average precision).
    """
    min_rank = 3
    max_rank = 3

    features = P.features_kline  # Input features to be mined
    feature_true = P.labels[0]  # Label (binary)
    feature_score = 'score'  # Prediction score

    #
    # Load data, partition columns
    #
    in_path = Path(P.in_path_name).joinpath(P.in_file_name)
    in_df = pd.read_csv(in_path, parse_dates=['timestamp'], nrows=P.in_nrows)

    in_df = generate_score(in_df, P.feature_sets)  # "score" column is added
    # TODO: Our predictions are scores, therefore generate a binary prediction (with some threshold, maybe derived from the average score)
    feature_pred = ''  # Prediction (binary)
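
    # A minimal sketch of the TODO above (an assumption, not the project's chosen rule):
    # use the mean score as the threshold and store the resulting binary prediction.
    # score_threshold = in_df[feature_score].mean()
    # in_df[feature_score + '_class'] = (in_df[feature_score] >= score_threshold).astype(int)
    # feature_pred = feature_score + '_class'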

    # in_df = in_df.iloc[100_000:200_000]

    # Load feature file (because it is not stored in rolling predictions)
    f_df = pd.read_csv(r"C:\DATA2\BITCOIN\GENERATED\BTCUSDT-1m-features.csv", parse_dates=['timestamp'],
                       nrows=500_000_000)
    f_df = f_df[['timestamp'] + P.features_kline]

    df = in_df.merge(f_df, left_on='timestamp', right_on='timestamp')
    pd.set_option('use_inf_as_na', True)
    df = df.dropna(subset=features + [feature_true, feature_score])
    N = len(df)
    print(f"Size of the data set {N}. Features: {len(features)}")

    auc_mean = roc_auc_score(df[feature_true], df[feature_score])
    print(f"Mean AUC: {auc_mean}")

    #
    # Partition individual features
    #
    feature_partitions = dict()  # Key is feature, value is a list of binary masks
    for f in features:
        sr = df[f]
        f_mean = np.nanmean(sr)
        f_std = np.nanstd(sr)

        # Breakpoints from a normal approximation (overridden by the empirical quantiles below)
        p1 = norm.ppf(0.3333333333, loc=f_mean, scale=f_std)
        p2 = norm.ppf(0.6666666666, loc=f_mean, scale=f_std)

        p1, p2 = list(np.quantile(sr, [0.3333333333, 0.6666666666]))

        # Two partitions
        #f_0 = (sr < f_mean)
        #f_1 = (sr >= f_mean)

        # Three partitions
        f_0 = (sr <= p1)
        f_1 = ((p1 < sr) & (sr < p2))
        f_2 = (sr >= p2)

        feature_partitions[f] = [f_0, f_1, f_2]
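
    # Optional sanity check (a sketch, not part of the original procedure): with empirical
    # tercile breakpoints each of the three masks should cover roughly one third of the rows.
    # for f, masks in feature_partitions.items():
    #     print(f, [int(m.sum()) for m in masks])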

    #
    # For each combination of variable values, find the subset (using AND)
    #
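    # How itemsets are read below: for a feature combination (a, b, c), the itemset (0, 2, 1)
    # selects the rows where a falls in its low tercile, b in its high tercile and c in its
    # middle tercile (the indexes 0/1/2 refer to the three partitions built above).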
    start_dt = datetime.now()

    scores = list()
    for r in range(min_rank, max_rank + 1):
        feature_combinations = list(itertools.combinations(features, r))
        print(f"Rank {r}. Feature combinations: {len(feature_combinations)}")
        itemsets = list(itertools.product([0, 1, 2], repeat=r))
        for fset_no, fset in enumerate(feature_combinations):
            if fset_no % 100 == 0:
                print(f"Feature set no: {fset_no}/{len(feature_combinations)}")

            # For this feature combination, try all combinations of partition values (itemsets)
            for iset in itemsets:
                subset = np.ones(N, dtype=bool)  # All 1s
                for i in range(r):
                    f = fset[i]
                    if iset[i] == 0:
                        np.logical_and(subset, feature_partitions[f][0], out=subset)
                    elif iset[i] == 1:
                        np.logical_and(subset, feature_partitions[f][1], out=subset)
                    elif iset[i] == 2:
                        np.logical_and(subset, feature_partitions[f][2], out=subset)
                    else:
                        raise ValueError()
                # Select data using the subset
                subset_df = df[subset]

                auc = 0.0
                try:
                    auc = roc_auc_score(subset_df[feature_true], subset_df[feature_score])
                except Exception as ve:  # e.g., the subset contains only one class
                    pass

                precision = 0.0
                try:
                    precision = average_precision_score(subset_df[feature_true], subset_df[feature_score])
                except Exception as e:
                    pass
                scores.append([auc, precision, fset, iset, len(subset_df)])

                # Compute metrics: false positives, false negatives
                # tn, fp, fn, tp = confusion_matrix(subset_df[], subset_df[]).ravel()

                # high precision -> low false positive rate, high recall -> low false negative rate
                # We can use: average precision, average recall etc.
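                # A hedged sketch of the confusion-matrix metrics above (assumes a binary
                # prediction column named by feature_pred has been added, e.g. via the
                # thresholding sketch near the top of this function):
                # tn, fp, fn, tp = confusion_matrix(subset_df[feature_true], subset_df[feature_pred]).ravel()
                # fpr = fp / (fp + tn) if (fp + tn) else 0.0  # false positive rate
                # fnr = fn / (fn + tp) if (fn + tp) else 0.0  # false negative rate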

            # For the subset, compute metrics of labels: false negatives, false positives

    elapsed = datetime.now() - start_dt
    print(f"Finished feature prediction in {str(elapsed)}.")

    # Store them and report intervals (combinations) with minimum false negatives/positives
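    # Each record in scores: [auc, average_precision, feature_combination, partition_itemset, subset_size]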
    scores = sorted(scores, key=lambda x: x[0], reverse=True)
    print(scores[:20])
    with open("confidence_mining_by_auc.txt", "w") as textfile:
        for element in scores[:100]:
            textfile.write(str(element) + "\n")

    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    print(scores[:20])
    with open("confidence_mining_by_precision.txt", "w") as textfile:
        for element in scores[:100]:
            textfile.write(str(element) + "\n")

    pass


if __name__ == '__main__':
    confidence_mining()