import sys
import os
from datetime import datetime, timezone, timedelta
from typing import Union
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from service.App import App
from common.utils import *
from common.classifiers import *

"""
Find good hyper-parameters of algorithms applied to generated features.
"""

# Location of the generated feature matrix (CSV file) which is loaded in main()
data_path = r"C:\DATA2\BITCOIN\GENERATED"
data_file = r"BTCUSDT-1m-features.csv"

# Column names (labels and two groups of features) are taken from the application configuration
labels = App.config["labels"]
features_kline = App.config["features_kline"]
features_futur = App.config["features_futur"]

# features_horizon = 720  # Features are generated using this past window length (max feature window)
labels_horizon = 180  # Labels are generated using this number of steps ahead (max label window)

#
# Parameters of rolling predict
#

nrows = 10_000_000  # Maximum number of rows read from the file (reduce it for debug)
# Columns
train_features = features_kline  # Input columns used for training: features_futur or features_kline
predict_label = "high_15"  # Target column which is predicted and scored
# Rows
prediction_start_str = "2020-09-01 00:00:00"  # Use it when rolling prediction will work (2020-02-01 00:00:00 - for futur)
#prediction_start_str = "2020-06-01 00:00:00"
train_length = int(1.5 * 525_600)  # Max number of rows in one train window: 1.5 * 525_600 for long/spot, 4 * 43_800 for short/futur
# Length of one rolling prediction step in rows. Presumably rows are 1-minute records (file name has "1m") - TODO confirm
# mid: 4 weeks 4*7*1440=40_320 (a 1/12 of year would be 43_800), long: 6 weeks 6*7*1440
stride = 4*7*1440
steps = 2  # How many rolling prediction steps. ~40 weeks in [1.2-1.11]

algorithm = "nn"  # Which algorithm is tuned: gb nn lc

#
# Parameters for algorithms
#

# Hyper-parameter grid used when algorithm is "gb". First parameter is the slowest
params_grid_gb = dict(
    # Possible objectives: "cross_entropy", "cross_entropy_lambda", "binary" (logloss - logistic regression)
    objective=["cross_entropy"],
    max_depth=[1],
    learning_rate=[0.01],
    num_boost_round=[1_500],
    # (reg_alpha) Candidate values: [1e-5, 1e-2, 0.1, 1, 100], [0, 0.001, 0.005, 0.01, 0.05], 1.5
    lambda_l1=[1.0],
    # (reg_lambda) Candidate values: [0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
    lambda_l2=[1.0],
)
def params_to_line_gb(params):
    """Return the values of the "gb" hyper-parameters as a list with a fixed order of columns.

    The order is: objective, max_depth, learning_rate, num_boost_round, lambda_l1, lambda_l2.
    A parameter which is absent in `params` is represented by None.
    """
    columns = (
        "objective",
        "max_depth",
        "learning_rate",
        "num_boost_round",
        "lambda_l1",
        "lambda_l2",
    )
    return [params.get(column) for column in columns]

# TODO: Implement and run GB with scaling (by default it uses NO scaling)
# 1-0.01-1500
# Futur
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.625, 0.167, 0.602, 0.097
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.641, 0.121, 0.609, 0.067
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.652, 0.066, 0.524, 0.035
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.632, 0.131, 0.684, 0.073
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.649, 0.075, 0.634, 0.040
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.653, 0.074, 0.661, 0.039
# klines
#high_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.680, 0.171, 0.695, 0.098
#high_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.720, 0.075, 0.622, 0.040
#high_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.745, 0.001, 0.577, 0.001
#low_10, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.688, 0.200, 0.642, 0.118
#low_15, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.718, 0.072, 0.620, 0.038
#low_20, cross_entropy, 1, 0.01, 1500, 1.0, 1.0, 0.734, 0.011, 0.506, 0.006

# Hyper-parameter grid used when algorithm is "nn". First parameter is the slowest
params_grid_nn = dict(
    # Number of layers depends on the number of input features
    layers=[[33]],
    learning_rate=[0.001],
    n_epochs=[20],
    bs=[64],
)
def params_to_line_nn(params):
    """Return the values of the "nn" hyper-parameters as a list with a fixed order of columns.

    The order is: layers, learning_rate, n_epochs, bs.
    A parameter which is absent in `params` is represented by None.
    """
    columns = ("layers", "learning_rate", "n_epochs", "bs")
    return [params.get(column) for column in columns]

# TODO: Implement and run NN without scaling (by default it uses scaling)
# Futur
#high_10, [33], 0.001, 20, 64, 0.626, 0.171, 0.606, 0.099
#high_15, [33], 0.001, 20, 64, 0.623, 0.118, 0.621, 0.065
#high_20, [33], 0.001, 20, 64, 0.646, 0.061, 0.507, 0.033
#low_10, [33], 0.001, 20, 64, 0.634, 0.179, 0.625, 0.104
#low_15, [33], 0.001, 20, 64, 0.623, 0.106, 0.665, 0.058
#low_20, [33], 0.001, 20, 64, 0.632, 0.058, 0.588, 0.031
# Spot/klines
#high_10, [46], 0.001, 20, 64, 0.675, 0.269, 0.653, 0.169
#high_15, [46], 0.001, 20, 64, 0.707, 0.207, 0.674, 0.122
#high_20, [46], 0.001, 20, 64, 0.717, 0.123, 0.610, 0.068
#low_10, [46], 0.001, 20, 64, 0.696, 0.283, 0.646, 0.181
#low_15, [46], 0.001, 20, 64, 0.713, 0.190, 0.628, 0.112
#low_20, [46], 0.001, 20, 64, 0.729, 0.150, 0.617, 0.085

# Best liblinear: is_scale=False, balance=False, penalty=l2, max_iter=100
# Hyper-parameter grid used when algorithm is "lc". First parameter is the slowest
params_grid_lc = dict(
    # False was the best, although True can give almost the same result (under different conditions)
    is_scale=[False],
    # Possible penalties: "l2" "l1" "elasticnet" "none"
    penalty=["l2"],
    # Smaller values mean stronger regularization
    C=[1.0],
    # No balancing (None) was always better than "balanced"
    class_weight=[None],
    # liblinear - fast convergence (100 is enough), lbfgs (l2 or none) - good convergence, others: "newton-cg" "sag" "saga"
    solver=["liblinear"],
    max_iter=[200],
)
def params_to_line_lc(params):
    """Return the values of the "lc" hyper-parameters as a list with a fixed order of columns.

    The order is: is_scale, penalty, C, class_weight, solver, max_iter.
    A parameter which is absent in `params` is represented by None.
    """
    columns = (
        "is_scale",
        "penalty",
        "C",
        "class_weight",
        "solver",
        "max_iter",
    )
    return [params.get(column) for column in columns]

# Results for futur:
#high_10, False, l2, 1.0, None, liblinear, 100, 0.458, 0.032, 0.553, 0.016
#high_15, False, l2, 1.0, None, liblinear, 100, 0.470, 0.029, 0.652, 0.015
#high_20, False, l2, 1.0, None, liblinear, 100, 0.472, 0.030, 0.623, 0.015
#low_10, False, l2, 1.0, None, liblinear, 100, 0.471, 0.017, 0.367, 0.009
#low_15, False, l2, 1.0, None, liblinear, 100, 0.468, 0.005, 0.399, 0.002
#low_20, False, l2, 1.0, None, liblinear, 100, 0.473, 0.010, 0.572, 0.005
# Results for klines (spot):
#high_10, False, l2, 1.0, None, liblinear, 200, 0.551, 0.050, 0.522, 0.026
#high_15, False, l2, 1.0, None, liblinear, 200, 0.558, 0.023, 0.484, 0.012
#high_20, False, l2, 1.0, None, liblinear, 200, 0.564, 0.017, 0.440, 0.009
#low_10, False, l2, 1.0, None, liblinear, 200, 0.565, 0.044, 0.538, 0.023
#low_15, False, l2, 1.0, None, liblinear, 200, 0.580, 0.015, 0.556, 0.008
#low_20, False, l2, 1.0, None, liblinear, 200, 0.584, 0.019, 0.784, 0.010


#
# Grid search
#

def _concat_series(parts):
    """Concatenate a list of series into one series; an empty list produces an empty float series."""
    return pd.concat(parts) if parts else pd.Series(dtype=float)


def main():
    """Grid search over hyper-parameters of the selected algorithm using rolling train/predict.

    The feature matrix is loaded from the configured CSV file. For each combination of
    hyper-parameters, `steps` rolling steps are executed: a model is trained on the rows
    which precede the prediction range (minus the label horizon) and then applied to
    the next `stride` rows. True and predicted values of all steps are combined and passed
    to `compute_scores`. One line per combination (label, parameters, auc, f1, precision,
    recall) is appended to the file metrics.txt.

    Raises:
        ValueError: if the algorithm name is unknown, if there are not enough rows after
            the prediction start for all steps, or if there are no rows available
            for training before a prediction step.
    """
    #
    # Load and prepare all data
    #

    # Load all data
    df_all = pd.read_csv(os.path.join(data_path, data_file), parse_dates=['timestamp'], date_format="ISO8601", nrows=nrows)

    print(f"Feature matrix loaded. Length: {len(df_all)}. Width: {len(df_all.columns)}")

    for label in labels:
        df_all[label] = df_all[label].astype(int)  # "category" NN does not work without this

    # Select necessary features and label
    df_all = df_all[["timestamp"] + features_kline + features_futur + labels]

    # Spot and futures have different available histories. If we drop nans in all of them, then we get a very short data frame (corresponding to futures which have little data)
    # So we do not drop data here but rather when we select necessary input features
    # Nans result in constant accuracy and nan loss. MissingValues procedure does not work and produces exceptions
    # Infinite values are treated like NaN. They are replaced explicitly because the global option 'use_inf_as_na' is deprecated in pandas
    df_all = df_all.replace([np.inf, -np.inf], np.nan)
    #df_all = df_all.dropna(subset=labels)
    df_all = df_all.reset_index(drop=True)  # We must reset index after removing rows to remove gaps

    prediction_start = find_index(df_all, prediction_start_str)
    print(f"Start index: {prediction_start}")

    if len(df_all) - prediction_start < steps * stride:
        raise ValueError(f"Number of steps {steps} is too high (not enough data after start). Data available for prediction: {len(df_all) - prediction_start}. Data to be predicted: {steps * stride} ")

    del df_all["timestamp"]

    #
    # Prepare params and train by getting precision
    #
    # The algorithm-specific grid, train function and line formatter are selected once.
    # Names are referenced only inside the chosen branch, so that only the selected algorithm must be available
    if algorithm == "gb":
        params_grid, train_predict, params_to_line = params_grid_gb, train_predict_gb, params_to_line_gb
    elif algorithm == "nn":
        params_grid, train_predict, params_to_line = params_grid_nn, train_predict_nn, params_to_line_nn
    elif algorithm == "lc":
        params_grid, train_predict, params_to_line = params_grid_lc, train_predict_lc, params_to_line_lc
    else:
        raise ValueError(f"Unknown algorithm value {algorithm}.")

    metrics = []  # One record with metrics for one param object
    params_list = list(ParameterGrid(params_grid))  # List of hyper-param dicts
    for i, params in enumerate(params_list):

        print("\n{}/{} rolling train start...".format(i+1, len(params_list)))

        # Here we will collect true and predicted values for one label (one series per step)
        # These series must have the same indexes and these indexes should correspond to main input indexes even if some rows are dropped (for them we store Null)
        y_true_parts = []
        y_predicted_parts = []

        for step in range(steps):

            print(f"\nStart step {step}/{steps}")

            # Predict data

            predict_start = prediction_start + (step * stride)
            predict_end = predict_start + stride

            df_test = df_all.iloc[predict_start:predict_end]
            #df_test = df_test.dropna(subset=train_features)  # Nans will be dropped by the algorithms themselves

            df_X_test = df_test[train_features]
            df_y_test = df_test[predict_label]

            # Train data

            # We exclude recent objects from training, because they do not have labels yet - the labels are in future
            # In real (stream) data, we will have null labels for recent objects. During simulation, labels are available and hence we need to ignore/exclude them manually
            train_end = predict_start - labels_horizon - 1
            if train_end <= 0:
                # A negative end index would be interpreted by iloc as counted from the end of the data
                # and the train set would then include the rows to be predicted and later rows
                raise ValueError(f"Not enough data for training before prediction step {step}: train end index is {train_end}. Prediction range starts from {predict_start} and labels horizon is {labels_horizon}.")
            train_start = max(0, train_end - train_length)

            df_train = df_all.iloc[train_start:train_end]
            df_train = df_train.dropna(subset=train_features)

            df_X = df_train[train_features]
            df_y = df_train[predict_label]

            print(f"Train range: [{train_start}, {train_end}]={train_end-train_start}. Prediction range: [{predict_start}, {predict_end}]={predict_end-predict_start}. ")

            y_test_hat = train_predict(df_X, df_y, df_X_test, params)

            # Store true and predicted series of this step (Series.append does not exist in pandas 2)
            y_true_parts.append(df_y_test)
            y_predicted_parts.append(y_test_hat)

            print(f"End step {step}/{steps}. ")

        y_true = _concat_series(y_true_parts)
        y_predicted = _concat_series(y_predicted_parts)

        print("")
        print("Finished {} steps of train with {} true and {} predicted results.".format(steps, len(y_true), len(y_predicted)))

        # y_true and y_predicted might have nans (or infs) which can confuse some scoring functions
        df_scores = pd.DataFrame({"y_true": y_true, "y_predicted": y_predicted})
        num_scores = len(df_scores)
        df_scores = df_scores.replace([np.inf, -np.inf], np.nan).dropna()
        print(f"Total number of collected predictions: {num_scores}. After dropping NaNs: {len(df_scores)}")

        score = compute_scores(df_scores["y_true"], df_scores["y_predicted"])

        metrics.append(score)

    #
    # Process all collected results and save
    #
    score_names = ("auc", "f1", "precision", "recall")
    lines = []
    for params, rec in zip(params_list, metrics):
        # Label, then parameters, then scores
        line = [predict_label] + params_to_line(params)
        line += ["{:.3f}".format(rec[name]) for name in score_names]

        lines.append(", ".join(str(x) for x in line))

    with open('metrics.txt', 'a', encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")


# Run the grid search only if this file is executed as a script (and not imported as a module)
if __name__ == '__main__':
    main()