2022-03-20 13:58:58 +01:00
from pathlib import Path
import pandas as pd
2022-07-16 11:24:04 +02:00
import click
2022-03-20 13:58:58 +01:00
from service . App import *
2022-08-27 13:05:49 +02:00
from scripts . features import generate_feature_set
2022-07-16 11:24:04 +02:00
from common . label_generation_highlow import *
from common . label_generation_topbot import *
2022-03-20 13:58:58 +01:00
"""
This script will load a feature file ( or any file with close price ) , and add
top - bot columns according to the label parameter , by finally storing both input
data and the labels in the output file ( can be the same file as input ) .
"""
#
# Parameters
#
class P:
    """Script-level parameters limiting how much input data is read and processed."""

    # Maximum number of rows read from the input CSV file
    in_nrows = 100_000_000

    # Process only this number of last rows.
    # 525_600 == 365 * 24 * 60, i.e. presumably one year of 1-minute records
    # (TODO confirm the data frequency), so this keeps about 3 years of data.
    tail_rows = int(525_600 * 3.0)
2022-03-25 22:49:33 +01:00
2022-03-20 13:58:58 +01:00
@click.command()
@click.option('--config_file', '-c', type=click.Path(), default='', help='Configuration file name')
def main(config_file):
    """
    Load a file with close price (typically feature matrix),
    compute top-bottom labels, add them to the data, and store to output file.

    The following configuration keys are read from ``App.config`` after
    ``load_config(config_file)`` has been called: ``time_column``, ``symbol``,
    ``data_folder``, ``feature_file_name``, ``label_sets`` and
    ``matrix_file_name``.

    Side effects: writes ``<matrix_file_name>.csv`` into the symbol data folder
    and appends the list of generated label names to the sibling ``.txt`` file.
    Problems (missing file, missing configuration) are reported with ``print``
    and the function returns without raising.

    :param config_file: Configuration file name passed to ``load_config``.
    """
    load_config(config_file)

    time_column = App.config["time_column"]

    now = datetime.now()

    #
    # Load merged data with regular time series
    #
    symbol = App.config["symbol"]
    data_path = Path(App.config["data_folder"]) / symbol

    feature_file_name = App.config.get("feature_file_name")
    if not feature_file_name:
        # Without this guard a missing key fails with an opaque TypeError on "data_path / None"
        print("ERROR: 'feature_file_name' is not defined in the configuration.")
        return

    # with_suffix() replaces an existing extension (if any) with ".csv"
    file_path = (data_path / feature_file_name).with_suffix(".csv")
    if not file_path.is_file():
        print(f"Data file does not exist: {file_path}")
        return

    print(f"Loading data from source data file {file_path}...")
    df = pd.read_csv(file_path, parse_dates=[time_column], nrows=P.in_nrows)
    print(f"Finished loading {len(df)} records with {len(df.columns)} columns.")

    # Keep only the last P.tail_rows records and renumber them from 0
    df = df.iloc[-P.tail_rows:]
    df = df.reset_index(drop=True)

    #
    # Generate derived features
    #
    label_sets = App.config.get("label_sets", [])
    if not label_sets:
        print("ERROR: no label sets defined. Nothing to process.")
        return

    # Apply all feature generators to the data frame which get accordingly new derived columns
    # The feature parameters will be taken from App.config (depending on generator)
    print(f"Start generating labels for {len(df)} input records.")

    all_features = []
    for label_set in label_sets:
        df, new_features = generate_feature_set(df, label_set, last_rows=0)
        all_features.extend(new_features)

    print("Finished generating labels.")

    print("Number of NULL values:")
    print(df[all_features].isnull().sum().sort_values(ascending=False))

    #
    # Store feature matrix in output file
    #
    out_file_name = App.config.get("matrix_file_name")
    if not out_file_name:
        # Same guard as for the input file name: avoid "data_path / None"
        print("ERROR: 'matrix_file_name' is not defined in the configuration.")
        return

    out_path = (data_path / out_file_name).with_suffix(".csv").resolve()
    print(f"Storing file with labels. {len(df)} records and {len(df.columns)} columns in output file...")
    df.to_csv(out_path, index=False, float_format="%.4f")

    #
    # Store labels
    #
    # The label names are appended (not overwritten), so the text file
    # accumulates one line per run. The comprehension variable is named
    # differently from the file handle to avoid shadowing it.
    quoted_labels = ", ".join(f'"{label}"' for label in all_features)
    with open(out_path.with_suffix('.txt'), "a", encoding="utf-8") as labels_file:
        labels_file.write(quoted_labels + "\n\n")

    print(f"Stored {len(all_features)} labels in output file {out_path}")

    elapsed = datetime.now() - now
    # Guard against division by zero when the generators produced no labels
    if all_features:
        time_per_label = str(elapsed / len(all_features)).split('.')[0]
    else:
        time_per_label = "n/a"
    print(f"Finished generating {len(all_features)} labels in {str(elapsed).split('.')[0]}. Time per label: {time_per_label}")

    print(f"Output file location: {out_path}")
2022-03-20 13:58:58 +01:00
# Run the command-line entry point only when the file is executed as a script.
# The compared literal must be exactly "__main__" (no surrounding spaces),
# otherwise the condition can never be true.
if __name__ == '__main__':
    main()