Quick Start#

import onetick.py as otp
from onetick import ml
from datetime import datetime


start = otp.dt(2022, 5, 10, 9, 30)
end = otp.dt(2022, 8, 10, 16, 0)

symbols = ['AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'BRKB', 'TSLA', 'XOM', 'UNH', 'FB']

otp.__version__, ml.__version__
('1.81.0', '1.0.6')

Fetch the data for use in feature creation#

We create a class that inherits from ml.BaseOnetickLoader, rather than calling onetick-py directly, so that the data-loading code can be saved as part of the experiment.

class Bars(ml.BaseOnetickLoader):    
    def get_source(self):
        data = otp.DataSource(db=self.schema.db, tick_type=self.schema.tick_type)
        
        # per bucket: total traded volume and the standard deviation of VWAP
        data = data.agg(
            {
                'VOLUME': otp.agg.sum(data['VOLUME']),
                'PRICE_STDEV': otp.agg.stddev(data['VWAP'])
            },
            bucket_interval=self.bucket,
            bucket_time='start'
        )
        
        # keep only ticks within the configured time-of-day window
        data = data.time_filter(start_time=self.start.strftime('%H%M%S%f')[:-3],
                                end_time=self.end.strftime('%H%M%S%f')[:-3],
                                timezone='EST5EDT')
        data, _ = data[data['VOLUME'] > 0]  # drop buckets with no trades
        
        # adjust volumes for corporate actions (e.g., splits) as of the end date
        data = otp.functions.corp_actions(data,
                                          adjustment_date=int(self.end.strftime('%Y%m%d')),
                                          adjust_rule='SIZE',
                                          fields='VOLUME')
                                        
        return data

Describe the experiment#

The code below defines a complete experiment. The data for the features comes from the Bars loader defined above. splitters provide an easy way to define the train/validation/test datasets by splitting the data by time, the most natural approach for time series. pipeline defines the operations that turn the data into features. evaluators specify which metrics we'd like to track.

class VolumePrediction(ml.Experiment):
    datafeeds = [
        Bars(db="NYSE_TAQ_BARS", tick_type="TRD_1M", symbols=["SPY"],
            start=start, end=end, bucket=600)  # 600-second (10-minute) buckets
    ]
    
    splitters = [
        ml.TimeSplitter(
            val_time_range=(datetime(2022, 6, 10), datetime(2022, 7, 9)), 
            test_time_range=(datetime(2022, 7, 10), datetime(2022, 8, 11))
        )
    ]
    
    pipeline = [
        # lags 39 and 40 reach back roughly one trading day (39 ten-minute bars per 09:30-16:00 session)
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.FilterValues(),
    ]

    target_columns = ["VOLUME"]
    features_columns = ["VOLUME_LAG_.*"]

    models = [
        ml.LGBMRegressor()
    ]

    train_params = {
        "verbose": 0,
        "n_jobs": 1,
    }
    
    evaluators = [ml.SMAPEEvaluator(), ml.MAEEvaluator(), ml.RMSEEvaluator(), ml.R2Evaluator()]

Run the experiment#

Running the experiment takes just two lines. The run method returns the metrics and the predictions.

exp = VolumePrediction()
exp.run()
({'VOLUME_SMAPE': 0.26471264317834914,
  'VOLUME_MAE': 307707.07801606023,
  'VOLUME_RMSE': 455802.07978705765,
  'VOLUME_R2': 0.5747734343435598},
             VOLUME
 1559  2.347078e+06
 1560  1.599208e+06
 1561  1.938340e+06
 1562  1.711006e+06
 1563  1.733074e+06
 ...            ...
 2451  7.312552e+05
 2452  7.325196e+05
 2453  9.725122e+05
 2454  1.272601e+06
 2455  3.620730e+06
 
 [897 rows x 1 columns])

Benchmark#

We encourage specifying a baseline for every experiment. The default implementation uses the value of the target column from the previous period as the prediction. calc_baseline() returns the metrics achieved by this baseline.

exp.calc_baseline()
{'VOLUME_SMAPE': 0.27216642202147734,
 'VOLUME_MAE': 340701.671875,
 'VOLUME_RMSE': 579629.4488814118,
 'VOLUME_R2': 0.3116398029678321}
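To make the baseline concrete, here is a minimal pandas sketch of what a previous-period prediction and the MAE / SMAPE calculations amount to. The numbers are made up, and the SMAPE formula shown is one common definition; the exact formula used by the library's evaluators may differ.

import pandas as pd

# Toy stand-in for a series of 10-minute volumes (illustrative numbers only).
volume = pd.Series([2.3e6, 1.6e6, 1.9e6, 1.7e6, 1.8e6])

baseline = volume.shift(1)                      # previous period's value as the prediction
actual, predicted = volume.iloc[1:], baseline.iloc[1:]

mae = (actual - predicted).abs().mean()
smape = ((actual - predicted).abs() / ((actual.abs() + predicted.abs()) / 2)).mean()
print(mae, smape)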

Hyperparameter Tuning and Cross Validation#

Trying different variants of the experiment is easy with the class-based implementation. To see whether the LightGBM parameters need tuning, we inherit from VolumePrediction, specify which parameters to try, and define how to do cross-validation.

class TuneHP(VolumePrediction):
    models = [
        ml.LGBMRegressor(
                init_params={'n_estimators': [50, 100, 250],
                     'num_leaves': [5, 15, 30],              
                     'max_depth': [2, 4, 6],                    
                     'boosting_type': ['dart'],
                     'random_state': [0],
                     }
        )
    ]

    train_params = {
        "search_cv": {
            "val_type": "WalkForward",  # Simple, Cross, WalkForward
            "folds": 5,
            "eval_metric": "MAPE",
            "search_optimization": "grid", # grid, random
        },
        "verbose": 0,
        "n_jobs": 1,        
    }
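The WalkForward option refers to walk-forward validation: each fold trains on an earlier stretch of the data and validates on the period that immediately follows it. The sketch below uses scikit-learn's TimeSeriesSplit only to illustrate that expanding-window idea; it is not how onetick-ml builds its folds.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Ten time-ordered samples split into 5 walk-forward folds:
# each fold trains on everything before its validation window.
X = np.arange(10).reshape(-1, 1)
for train_idx, val_idx in TimeSeriesSplit(n_splits=5).split(X):
    print(f'train={list(train_idx)} validate={list(val_idx)}')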

When we run the experiment, the best model is chosen and its metrics and predictions are returned. Metrics for all hyperparameter configurations are saved in the models_cv_results attribute; the search scores are negated error values, so values closer to zero are better.

exp = TuneHP()
metrics, predictions = exp.run()
print(f'best params: {exp.current_model_params}')
metrics
best params: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0}
{'VOLUME_SMAPE': 0.24658693540021667,
 'VOLUME_MAE': 284951.7528798562,
 'VOLUME_RMSE': 465251.7319059286,
 'VOLUME_R2': 0.556959140622568}
exp.models_cv_results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
params mean_test_score std_test_score
21 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.213542 0.014722
12 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.213542 0.014722
13 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.213653 0.011747
14 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 30, 'random_state': 0} -0.213653 0.011747
10 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.215427 0.012646
11 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 30, 'random_state': 0} -0.215427 0.012646
3 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.215616 0.015591
4 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.215616 0.015591
5 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 30, 'random_state': 0} -0.215616 0.015591
23 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 30, 'random_state': 0} -0.216063 0.011685
22 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.216616 0.011338
19 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.216902 0.012290
20 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 30, 'random_state': 0} -0.217038 0.012293
18 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.220503 0.016411
9 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.220503 0.016411
0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.226016 0.022466
1 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.226016 0.022466
2 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 30, 'random_state': 0} -0.226016 0.022466
16 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 250, 'num_leaves': 15, 'random_state': 0} -0.235854 0.020886
17 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 250, 'num_leaves': 30, 'random_state': 0} -0.235854 0.020886
15 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 250, 'num_leaves': 5, 'random_state': 0} -0.236155 0.024035
24 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 250, 'num_leaves': 5, 'random_state': 0} -0.236155 0.024035
26 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 250, 'num_leaves': 30, 'random_state': 0} -0.239007 0.021650
8 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 250, 'num_leaves': 30, 'random_state': 0} -0.239309 0.022139
7 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 250, 'num_leaves': 15, 'random_state': 0} -0.239309 0.022139
6 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 250, 'num_leaves': 5, 'random_state': 0} -0.239309 0.022139
25 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 250, 'num_leaves': 15, 'random_state': 0} -0.240061 0.022762

Choosing the optimal model#

We can extend the experiment above to iterate over different models and different sets of hyperparameters.

class TuneHPMultipleModels(TuneHP):
    models = [
        ml.LGBMRegressor(
                init_params={'n_estimators': [50, 100, 250],
                     'num_leaves': [5, 15, 30],              
                     'max_depth': [2, 4, 6],                    
                     'boosting_type': ['dart'],
                     'random_state': [0],
                     }
        ),
        ml.XGBRegressor(
            init_params={
                "n_estimators": [100],
                "learning_rate": [.03, .01, .1],
                "max_depth": [3, 4],
                "min_child_weight": [2, 4],
                "max_delta_step": [0],
                "subsample": [0.9],
                "nthread": [2],
            }
        ),
        ml.DecisionTreeRegressor(),
    ]
exp = TuneHPMultipleModels()

exp.run()

print(f'optimal model: {exp.native_model.__class__.__name__}')
print(f'model hyperparameters: {exp.current_model_params}.')
exp.calc_metrics()
optimal model: LGBMRegressor
model hyperparameters: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0}.
{'VOLUME_SMAPE': 0.24658693540021667,
 'VOLUME_MAE': 284951.7528798562,
 'VOLUME_RMSE': 465251.7319059286,
 'VOLUME_R2': 0.556959140622568}
exp.models_cv_results[['model', 'params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
model params mean_test_score std_test_score
21 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.213542 0.014722
12 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.213542 0.014722
14 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 30, 'random_state': 0} -0.213653 0.011747
13 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.213653 0.011747
10 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.215427 0.012646
11 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 30, 'random_state': 0} -0.215427 0.012646
3 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.215616 0.015591
4 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.215616 0.015591
5 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 30, 'random_state': 0} -0.215616 0.015591
23 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 30, 'random_state': 0} -0.216063 0.011685
22 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.216616 0.011338
19 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.216902 0.012290
20 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 30, 'random_state': 0} -0.217038 0.012293
9 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.220503 0.016411
18 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.220503 0.016411
29 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.222627 0.019210
30 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.222948 0.019025
27 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.223614 0.022528
28 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.224695 0.022254
1 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.226016 0.022466
0 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.226016 0.022466
2 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 30, 'random_state': 0} -0.226016 0.022466
16 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 250, 'num_leaves': 15, 'random_state': 0} -0.235854 0.020886
17 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 250, 'num_leaves': 30, 'random_state': 0} -0.235854 0.020886
24 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 250, 'num_leaves': 5, 'random_state': 0} -0.236155 0.024035
15 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 250, 'num_leaves': 5, 'random_state': 0} -0.236155 0.024035
26 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 250, 'num_leaves': 30, 'random_state': 0} -0.239007 0.021650
8 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 250, 'num_leaves': 30, 'random_state': 0} -0.239309 0.022139
7 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 250, 'num_leaves': 15, 'random_state': 0} -0.239309 0.022139
6 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 250, 'num_leaves': 5, 'random_state': 0} -0.239309 0.022139
36 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.240028 0.021578
25 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 250, 'num_leaves': 15, 'random_state': 0} -0.240061 0.022762
37 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.240857 0.020447
38 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.241585 0.021174
35 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.241838 0.023972
31 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.318247 0.025609
32 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.318805 0.025066
34 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.325197 0.022340
33 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.325706 0.023528
39 DecisionTreeRegressor_id2 {} -0.335602 0.014060
exp.native_model
LGBMRegressor(boosting_type='dart', max_depth=4, num_leaves=5,
              objective='regression', random_state=0)
exp.current_model_params
{'boosting_type': 'dart',
 'max_depth': 4,
 'n_estimators': 100,
 'num_leaves': 5,
 'random_state': 0}
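Since native_model is the underlying fitted estimator, it can be used directly, for example to inspect feature importances. A short sketch, assuming the chosen LGBMRegressor exposes the standard feature_importances_ attribute and that exp.x_train holds the feature matrix:

import pandas as pd

# Map the booster's importances back to the lag-feature names (assumes exp was run above).
importances = pd.Series(exp.native_model.feature_importances_, index=exp.x_train.columns)
importances.sort_values(ascending=False)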

Feature Engineering#

Experimenting with features is also easy: first we define the set of candidate features, then run experiments that include different subsets of them.

class Features(VolumePrediction):
    pipeline = [
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.CalcLags(periods=[1], columns=["PRICE_STDEV"]),
        ml.ExpressionOperator(
            new_column_name='HHMM',  
            expression=lambda tick: tick['Time'].dt.strftime('%H%M').apply(int)),
    ]
Features.features_columns = ['VOLUME_LAG_.*', 'PRICE_STDEV.*']
exp_vwap = Features()
metrics, predictions = exp_vwap.run()
metrics
{'VOLUME_SMAPE': 0.2211715857044462,
 'VOLUME_MAE': 255715.34464622938,
 'VOLUME_RMSE': 399746.18853642343,
 'VOLUME_R2': 0.6729332111907642}
Features.features_columns = ['VOLUME_LAG_.*', 'HHMM']
exp_hhmm = Features()
metrics, predictions = exp_hhmm.run()
metrics
{'VOLUME_SMAPE': 0.2110676978539607,
 'VOLUME_MAE': 245519.38536618612,
 'VOLUME_RMSE': 379135.9675306561,
 'VOLUME_R2': 0.7057897783888781}

Adding a feature in pandas#

We can also add features in pandas: execute the onetick-py part of the query to get a DataFrame, compute new columns on it, and pass the result to prepare_data.

import pandas as pd

exp = VolumePrediction()

exp.pipeline = [
    ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
    ml.SelectFeatures(columns=["feature_hhmm"]),
    ml.FilterValues(),
]

src = exp.get_data()
df = VolumePrediction().datafeeds[0].run(src) 

df["feature_hhmm"] = pd.to_datetime(df["Time"]).dt.strftime("%H%M")

exp.prepare_data(df)

exp.x_train
feature_hhmm VOLUME_LAG_1 VOLUME_LAG_2 VOLUME_LAG_3 VOLUME_LAG_39 VOLUME_LAG_40
40 0940 5468711.0 7394857.0 3041526.0 2518335.0 4408252.0
41 0950 4173901.0 5468711.0 7394857.0 2591760.0 2518335.0
42 1000 3621114.0 4173901.0 5468711.0 2554192.0 2591760.0
43 1010 2894120.0 3621114.0 4173901.0 1932529.0 2554192.0
44 1020 3361844.0 2894120.0 3621114.0 1877159.0 1932529.0
... ... ... ... ... ... ...
853 1510 2584547.0 1260647.0 1235997.0 1446482.0 1232860.0
854 1520 1955293.0 2584547.0 1260647.0 1222981.0 1446482.0
855 1530 1636549.0 1955293.0 2584547.0 1201792.0 1222981.0
856 1540 2128597.0 1636549.0 1955293.0 2090350.0 1201792.0
857 1550 3849834.0 2128597.0 1636549.0 3866129.0 2090350.0

818 rows × 6 columns

Choosing data preprocessing#

Preprocessors are automatically applied in reverse before returning predictions and computing metrics, so the metrics for different preprocessing pipelines are computed on the original scale of the target and remain directly comparable. Below we compare several combinations of preprocessors.

feature = ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"])
preprocessors = [
    ml.LimitOutliers(std_num=4, columns=['VOLUME']),
    ml.ToPandas(),  # transitions the pipeline from onetick-py to pandas
    ml.IntradayAveraging(window_days=5, datetime_column='Time', columns=['VOLUME'])
]

pipeline_combinations = [
    [feature],
    [feature, preprocessors[0]],
    [feature, preprocessors[1], preprocessors[2]],
    [feature, preprocessors[0], preprocessors[1], preprocessors[2]],
]

for pipeline_combination in pipeline_combinations:
    pipeline_combination += [ml.FilterValues()]
    exp = VolumePrediction()
    exp.pipeline = pipeline_combination
    exp.run()
    preprocessors_names = str(
        list(map(lambda x: x.__class__.__name__, exp.pipeline[1:]))
    )
    print(f'{preprocessors_names}: {exp.calc_metrics()}')
['FilterValues']: {'VOLUME_SMAPE': 0.26471264317834914, 'VOLUME_MAE': 307707.07801606023, 'VOLUME_RMSE': 455802.07978705765, 'VOLUME_R2': 0.5747734343435598}
['LimitOutliers', 'FilterValues']: {'VOLUME_SMAPE': 0.26209855720664044, 'VOLUME_MAE': 300117.0944934835, 'VOLUME_RMSE': 442689.93339792994, 'VOLUME_R2': 0.5988866868547451}
['ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.324820753423069, 'VOLUME_MAE': 327592.4492867748, 'VOLUME_RMSE': 475749.2168946684, 'VOLUME_R2': 0.5367409117980491}
['LimitOutliers', 'ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.31586031542358867, 'VOLUME_MAE': 319193.35111481807, 'VOLUME_RMSE': 460349.53563005006, 'VOLUME_R2': 0.5662462897714294}
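To make "applied in reverse" concrete, here is a small pandas sketch of the general idea behind a reversible preprocessor: a forward transform is applied to the target before training, and the inverse transform is applied to the model's output before metrics are computed. It illustrates the concept only and is not IntradayAveraging's actual logic.

import pandas as pd

volume = pd.Series([2.3e6, 1.6e6, 1.9e6, 1.7e6, 1.8e6, 2.1e6])

# Forward transform: model the deviation from a trailing average
# (an illustrative choice, not the library's implementation).
reference = volume.rolling(3, min_periods=1).mean()
deviation = volume - reference

# ... a model would be trained on `deviation`; use a naive stand-in here ...
predicted_deviation = deviation.shift(1).fillna(0.0)

# Reverse transform: add the reference level back, so predictions and
# metrics are expressed on the original volume scale.
predicted_volume = predicted_deviation + reference
print(predicted_volume)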

Custom preprocessing (and reverse processing)#

Preprocessing can also be done manually: transform the target before training and undo the transformation on the predictions before computing metrics. Below we model the period-over-period change in volume, then add the previous period's volume back to recover predictions of the actual volume.

exp = VolumePrediction()

src = exp.get_data()
df = VolumePrediction().datafeeds[0].run(src) 

df["VOLUME_orig"] = df["VOLUME"]
df["VOLUME_shifted1"] = df["VOLUME"].shift(1)

# Apply custom data preprocessing
df["VOLUME"] -= df["VOLUME_shifted1"]
df = df.dropna().reset_index(drop=True)

exp.prepare_data(df)
exp.init_fit()
prediction = exp.predict(x=exp.x_test)

# Apply the reverse transformation to recover the prediction of the actual volume.
prediction["VOLUME_orig"] = (
    prediction["VOLUME"] + df.loc[exp.x_test.index]["VOLUME_shifted1"]
)
exp.calc_metrics(df[df.index.isin(prediction.index)][['VOLUME_orig']], prediction[['VOLUME_orig']])
{'VOLUME_orig_SMAPE': 0.2926440236803514,
 'VOLUME_orig_MAE': 328482.3277260032,
 'VOLUME_orig_RMSE': 471798.54541968176,
 'VOLUME_orig_R2': 0.5444028705718461}

Custom models#

Any model can be added by subclassing ml.RegressorModel and implementing the init_model and fit methods.

import numpy as np
from deepforest import CascadeForestRegressor


class DeepForestRegressor(ml.RegressorModel):
    model_class = CascadeForestRegressor

    def init_model(self, dsf_params={}, init_params={}):
        super().init_model(dsf_params=dsf_params, init_params=init_params)
        self.model = self.model_class(**init_params, verbose=0)
    
    def fit(self, x_train, y_train, eval_set=None):
        return super().fit(x_train.values, np.ravel(y_train.values))

VolumePrediction.models = [DeepForestRegressor()]
exp = VolumePrediction()

metrics, predictions = exp.run()
print(metrics)
exp.native_model
{'VOLUME_SMAPE': 0.2618875180456007, 'VOLUME_MAE': 301749.000983835, 'VOLUME_RMSE': 450845.7124683871, 'VOLUME_R2': 0.583970936045849}
CascadeForestRegressor(verbose=0)