# intelligent-trading-bot/scripts/confidence_mining.py
from pathlib import Path
import itertools
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from service.App import App
from common.signal_generation import *
#
# Parameters
#
class P:
feature_sets = ["kline", ] # "futur"
labels = App.config["labels"]
features_kline = App.config["features_kline"]
features_futur = App.config["features_futur"]
features_depth = App.config["features_depth"]
in_path_name = r"C:\DATA2\BITCOIN\GENERATED"
# in_file_name = r"_BTCUSDT-1m-rolling-predictions-no-weights.csv"
# in_file_name = r"_BTCUSDT-1m-rolling-predictions-with-weights.csv"
in_file_name = r"BTCUSDT-1m-features-rolling.csv"
in_nrows = 1_500_000
out_path_name = r"_TEMP_FEATURES"
out_file_name = r"_BTCUSDT-1m-signals"
simulation_start = 263519 # Good start is 2019-06-01 - after it we have stable movement
simulation_end = -0 # After 2020-11-01 there is sharp growth which we might want to exclude
def partition_column(sr: pd.Series):
"""Find conditions for partitioning the specified column and return binary columns with true
values with rows included in one partition.
"""
# Find mean value
# Return two boolean series
pass
def confidence_mining():
"""
"""
min_rank = 3
max_rank = 3
features = P.features_kline # Input features to be mined
feature_true = P.labels[0] # Label (binary)
feature_score = 'score' # Prediction score
#
# Load data, partition columns,
#
in_path = Path(P.in_path_name).joinpath(P.in_file_name)
in_df = pd.read_csv(in_path, parse_dates=['timestamp'], nrows=P.in_nrows)
in_df = generate_score(in_df, P.feature_sets) # "score" columns is added
# TODO: Our predictions are scores, therefore generate binary prediction (with some threshold, maybe derived from average score)
feature_pred = '' # Prediction (binary)
# in_df = in_df.iloc[100_000:200_000]
# Load feature file (because it is not stored in rolling predictions)
f_df = pd.read_csv(r"C:\DATA2\BITCOIN\GENERATED\BTCUSDT-1m-features.csv", parse_dates=['timestamp'],
nrows=500_000_000)
f_df = f_df[['timestamp'] + P.features_kline]
df = in_df.merge(f_df, left_on='timestamp', right_on='timestamp')
pd.set_option('use_inf_as_na', True)
df = df.dropna(subset=features + [feature_true, feature_score])
N = len(df)
print(f"Size of the data set {N}. Features: {len(features)}")
auc_mean = roc_auc_score(df[feature_true], df[feature_score])
print(f"Mean AUC: {auc_mean}")
#
# Partition individual features
#
feature_partitions = dict() # Key is feature, value is a list of binary maps
for f in features:
sr = df[f]
f_mean = np.nanmean(sr)
f_std = np.nanstd(sr)
p1 = norm.ppf(0.3333333333, loc=f_mean, scale=f_std)
p2 = norm.ppf(0.6666666666, loc=f_mean, scale=f_std)
p1, p2 = list(np.quantile(sr, [0.3333333333, 0.6666666666]))
# Two partitions
#f_0 = (sr < f_mean)
#f_1 = (sr >= f_mean)
# Three partitions
f_0 = (sr <= p1)
f_1 = ((p1 < sr) & (sr < p2))
f_2 = (sr >= p2)
feature_partitions[f] = [f_0, f_1, f_2]
#
# For each combination of variable values, find the subset (using AND)
#
start_dt = datetime.now()
scores = list()
for r in range(min_rank, max_rank + 1):
feature_combinations = list(itertools.combinations(features, r))
print(f"Rank {r}. Feature combinations: {len(feature_combinations)}")
itemsets = list(itertools.product([0, 1, 2], repeat=r))
for fset_no, fset in enumerate(feature_combinations):
if fset_no % 100 == 0:
print(f"Feature set no: {fset_no}/{len(feature_combinations)}")
# For this itemset, find all combinations of partitions
for iset in itemsets:
subset = np.ones(N, dtype=bool) # All 1s
for i in range(r):
f = fset[i]
if iset[i] == 0:
np.logical_and(subset, feature_partitions[f][0], out=subset)
elif iset[i] == 1:
np.logical_and(subset, feature_partitions[f][1], out=subset)
elif iset[i] == 2:
np.logical_and(subset, feature_partitions[f][2], out=subset)
else:
raise ValueError()
# Select data using the subset
subset_df = df[subset]
auc = 0.0
try:
auc = roc_auc_score(subset_df[feature_true], subset_df[feature_score])
except Exception as ve:
pass
precision = 0.0
try:
precision = average_precision_score(subset_df[feature_true], subset_df[feature_score])
except Exception as e:
pass
scores.append([auc, precision, fset, iset, len(subset_df)])
# Compute metrics: false positives, false negatives
# tn, fp, fn, tp = confusion_matrix(subset_df[], subset_df[]).ravel()
# high precision -> low false positive rate, high recall -> low false negative rate
# We can use: average precision, average recall etc.
# For the subset, compute metrics of labels: false negative, false positives
elapsed = datetime.now() - start_dt
print(f"Finished feature prediction in {str(elapsed)}.")
# Store them and report intervals (combinations) with minimum false negative/positives
scores = sorted(scores, key=lambda x: x[0], reverse=True)
print(scores[:20])
textfile = open("confidence_mining_by_auc.txt", "w")
for element in scores[:100]:
textfile.write(str(element) + "\n")
textfile.close()
scores = sorted(scores, key=lambda x: x[1], reverse=True)
print(scores[:20])
textfile = open("confidence_mining_by_precision.txt", "w")
for element in scores[:100]:
textfile.write(str(element) + "\n")
textfile.close()
pass
if __name__ == '__main__':
confidence_mining()