Quick Start#

import onetick.py as otp
from onetick import ml
from datetime import datetime

start = otp.dt(2022, 6, 1, 9, 30)
end = otp.dt(2022, 8, 1, 16, 0)

symbols = ['AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'BRKB', 'TSLA', 'XOM', 'UNH', 'FB']

otp.__version__, ml.__version__
('1.131.0', '1.0.8')

Fetch the data for use in feature creation#

We create a class that inherits from ml.BaseOnetickLoader, rather than using onetick-py directly, so that this code can be saved as part of the experiment.

class Bars(ml.BaseOnetickLoader):
    def get_source(self):
        data = otp.DataSource(db=self.schema.db, tick_type=self.schema.tick_type)

        # Aggregate ticks into fixed-size buckets of self.bucket seconds,
        # keeping the total volume and the standard deviation of VWAP.
        data = data.agg(
            {
                'VOLUME': otp.agg.sum(data['VOLUME']),
                'PRICE_STDEV': otp.agg.stddev(data['VWAP'])
            },
            bucket_interval=self.bucket,
            bucket_time='start'
        )

        # Keep only bars between the session start and end times
        # (here 9:30-16:00 New York time).
        data = data.time_filter(start_time=self.start.strftime('%H%M%S%f')[:-3],
                                end_time=self.end.strftime('%H%M%S%f')[:-3],
                                timezone='EST5EDT')
        # Drop buckets with no trading volume.
        data, _ = data[data['VOLUME'] > 0]

        # Adjust volumes for corporate actions (e.g., splits) as of the end date.
        data = otp.functions.corp_actions(data,
                                          adjustment_date=int(self.end.strftime('%Y%m%d')),
                                          adjust_rule='SIZE',
                                          fields='VOLUME')

        return data

Describe the experiment#

The code below defines a complete experiment. The data for the features comes from the Bars loader we defined above. splitters provide an easy way to define the train/validation/test datasets by splitting the data by time: the most natural split for time series data. pipeline defines the operations that turn the data into features (a pandas sketch of what CalcLags produces follows the class). evaluators specify which metrics we'd like to track. Note that features_columns accepts regular expressions, so 'VOLUME_LAG_.*' matches every lag column created by CalcLags.

class VolumePrediction(ml.Experiment):
    datafeeds = [
        Bars(db="NYSE_TAQ_BARS", tick_type="TRD_1M", symbols=["SPY"],
            start=start, end=end, bucket=600)
    ]
    
    splitters = [
        ml.TimeSplitter(
            val_time_range=(datetime(2022, 7, 1), datetime(2022, 7, 15)), 
            test_time_range=(datetime(2022, 7, 16), datetime(2022, 8, 1))
        )
    ]
    
    pipeline = [
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.FilterValues(),
    ]

    target_columns = ["VOLUME"]
    features_columns = ["VOLUME_LAG_.*"]

    models = [
        ml.LGBMRegressor()
    ]

    train_params = {
        "verbose": 0,
        "n_jobs": 1,
    }
    
    evaluators = [ml.SMAPEEvaluator(), ml.MAEEvaluator(), ml.RMSEEvaluator(), ml.R2Evaluator()]
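
As a reference for what the pipeline produces, CalcLags adds one shifted copy of each listed column per period. A minimal pandas sketch of the equivalent transformation (illustrative only; not onetick.ml's actual implementation):

import pandas as pd

def calc_lags_sketch(df: pd.DataFrame, periods, columns) -> pd.DataFrame:
    # For each (column, period) pair, add the value observed `period` bars
    # earlier; with 39 ten-minute bars per trading day, VOLUME_LAG_39 is
    # the same bar one trading day back.
    for column in columns:
        for period in periods:
            df[f'{column}_LAG_{period}'] = df[column].shift(period)
    return df

The first max(periods) rows of such a frame contain NaNs, which is presumably what ml.FilterValues() removes at the end of the pipeline.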

Run the experiment#

Running the experiment takes just two lines. The run method returns the metrics and the predictions for the test set: here 390 rows, i.e. 39 ten-minute bars per day over the 10 trading days in the test range.

exp = VolumePrediction()
exp.run()
({'VOLUME_SMAPE': 0.2743134902590052,
  'VOLUME_MAE': 326747.8456239629,
  'VOLUME_RMSE': 470008.3582745656,
  'VOLUME_R2': 0.6250890432751781},
             VOLUME
 1169  4.010690e+06
 1170  2.057323e+06
 1171  1.831997e+06
 1172  2.054107e+06
 1173  1.319127e+06
 ...            ...
 1554  1.466467e+06
 1555  1.021391e+06
 1556  1.483459e+06
 1557  2.262253e+06
 1558  4.710050e+06
 
 [390 rows x 1 columns])

Benchmark#

We encourage specifying a baseline for all experiments. The default implementation returns the value of the target column from the previous period as the prediction. calc_baseline() returns the metrics achieved by the baseline.

exp.calc_baseline()
{'VOLUME_SMAPE': 0.2896273436579182,
 'VOLUME_MAE': 369453.6503856041,
 'VOLUME_RMSE': 622992.0611725156,
 'VOLUME_R2': 0.3319314929190944}
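
For intuition, the default baseline amounts to a one-period shift of the target; given a DataFrame df of bars, a minimal pandas sketch:

# Naive baseline: predict this bar's volume with the previous bar's volume.
baseline_prediction = df['VOLUME'].shift(1)

The untuned LGBMRegressor above already beats this baseline on every metric; the tuning below improves on it further.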

Hyperparameter Tuning and Cross Validation#

Trying different variants of the experiment is easy with the class-based implementation. To see whether the LightGBM parameters need tuning, we inherit from VolumePrediction, specify which parameter values to try, and define how to do cross-validation.

class TuneHP(VolumePrediction):
    models = [
        ml.LGBMRegressor(
            init_params={
                'n_estimators': [50, 100],
                'num_leaves': [5, 15],
                'max_depth': [4, 6],
                'boosting_type': ['dart'],
                'random_state': [0],
            }
        )
    ]

    train_params = {
        "search_cv": {
            "val_type": "WalkForward",  # Simple, Cross, WalkForward
            "folds": 5,
            "eval_metric": "MAPE",
            "search_optimization": "grid", # grid, random
        },
        "verbose": 0,
        "n_jobs": 1,        
    }
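
The "WalkForward" option validates each configuration on folds that train on the past and evaluate on the period that follows, avoiding look-ahead leakage. Conceptually it resembles scikit-learn's TimeSeriesSplit; a minimal sketch of 5-fold walk-forward splitting (an analogy, not onetick.ml's internal code; x_train stands for the training frame, e.g. exp.x_train):

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(tscv.split(x_train)):
    # Each fold trains on a growing prefix of the history and
    # validates on the window immediately after it.
    print(f'fold {fold}: train rows 0-{train_idx[-1]}, '
          f'validate rows {val_idx[0]}-{val_idx[-1]}')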

When we run the experiment, the best model is chosen and its metrics / predictions are returned. Metrics for all configurations of hyperparameters are saved in the models_cv_results attribute.

exp = TuneHP()
metrics, predictions = exp.run()
print(f'best params: {exp.current_model_params}')
metrics
best params: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0}
{'VOLUME_SMAPE': 0.2478779599136575,
 'VOLUME_MAE': 297712.3940706993,
 'VOLUME_RMSE': 485468.3771650355,
 'VOLUME_R2': 0.6000194604104256}
exp.models_cv_results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
params mean_test_score std_test_score
3 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.240006 0.031134
2 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.242201 0.030909
6 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.242201 0.030909
5 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.244420 0.038989
1 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.244707 0.037736
7 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.245251 0.031519
0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.250377 0.035989
4 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.250377 0.035989

Choosing the optimal model#

We can extend the experiment above to iterate over different models and different sets of hyperparameters.

class TuneHPMultipleModels(TuneHP):
    models = [
        ml.LGBMRegressor(
            init_params={
                'n_estimators': [50, 100],
                'num_leaves': [5, 15],
                'max_depth': [2, 4],
                'boosting_type': ['dart'],
                'random_state': [0],
            }
        ),
        ml.XGBRegressor(
            init_params={
                "n_estimators": [100],
                "learning_rate": [.03, .01, .1],
                "max_depth": [3, 4],
                "min_child_weight": [2, 4],
                "max_delta_step": [0],
                "subsample": [0.9],
                "nthread": [2],
            }
        ),
        ml.DecisionTreeRegressor(),
    ]
exp = TuneHPMultipleModels()

exp.run()

print(f'optimal model: {exp.native_model.__class__.__name__}')
print(f'model hyperparameters: {exp.current_model_params}.')
exp.calc_metrics()
optimal model: LGBMRegressor
model hyperparameters: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0}.
{'VOLUME_SMAPE': 0.2478779599136575,
 'VOLUME_MAE': 297712.3940706993,
 'VOLUME_RMSE': 485468.3771650355,
 'VOLUME_R2': 0.6000194604104256}
exp.models_cv_results[['model', 'params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
model params mean_test_score std_test_score
7 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.240006 0.031134
6 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.242201 0.030909
2 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.243641 0.027961
3 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.243641 0.027961
5 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.244707 0.037736
11 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.248076 0.019661
9 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.248615 0.019109
4 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.250377 0.035989
8 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.252785 0.021762
10 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.252928 0.021975
0 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.255324 0.032334
1 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.255324 0.032334
17 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.267213 0.016335
16 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.274488 0.020649
19 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.276416 0.011191
18 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.280452 0.021100
13 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.345178 0.047891
12 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.345687 0.050165
15 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.350349 0.048117
14 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.353408 0.050002
20 DecisionTreeRegressor_id2 {} -0.367982 0.012032
exp.native_model
LGBMRegressor(boosting_type='dart', max_depth=4, num_leaves=15,
              objective='regression', random_state=0)
exp.current_model_params
{'boosting_type': 'dart',
 'max_depth': 4,
 'n_estimators': 100,
 'num_leaves': 15,
 'random_state': 0}

Feature Engineering#

Choosing features is also easy. First we define the set of candidate features, then run experiments that include different subsets.

class Features(VolumePrediction):
    pipeline = [
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.CalcLags(periods=[1], columns=["PRICE_STDEV"]),
        # Time of day as an integer, e.g. 0940 for the 9:40 bar.
        ml.ExpressionOperator(
            new_column_name='HHMM',
            expression=lambda tick: tick['Time'].dt.strftime('%H%M').apply(int)
        ),
    ]
Features.features_columns = ['VOLUME_LAG_.*', 'PRICE_STDEV.*']
exp_vwap = Features()
metrics, predictions = exp_vwap.run()
metrics
{'VOLUME_SMAPE': 0.23360807060180794,
 'VOLUME_MAE': 281094.7707189861,
 'VOLUME_RMSE': 416791.03960749425,
 'VOLUME_R2': 0.7051821885895728}
Features.features_columns = ['VOLUME_LAG_.*', 'HHMM']
exp_hhmm = Features()
metrics, predictions = exp_hhmm.run()
metrics
{'VOLUME_SMAPE': 0.2216830358322816,
 'VOLUME_MAE': 254899.56849576902,
 'VOLUME_RMSE': 363871.66144811024,
 'VOLUME_R2': 0.7752946368668814}

Adding a feature in pandas#

We can also add features in pandas by first executing the onetick-py part of the pipeline to get a DataFrame and then adding columns to it.

import pandas as pd

exp = VolumePrediction()

exp.pipeline = [
    ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
    ml.SelectFeatures(columns=["feature_hhmm"]),
    ml.FilterValues(),
]

src = exp.get_data()
df = exp.datafeeds[0].run(src)

df["feature_hhmm"] = pd.to_datetime(df["Time"]).dt.strftime("%H%M")

exp.prepare_data(df)

exp.x_train
feature_hhmm VOLUME_LAG_1 VOLUME_LAG_2 VOLUME_LAG_3 VOLUME_LAG_39 VOLUME_LAG_40
40 0940 2508915.0 4144521.0 1652787.0 1907568.0 2342203.0
41 0950 1846909.0 2508915.0 4144521.0 1420362.0 1907568.0
42 1000 1242450.0 1846909.0 2508915.0 2028549.0 1420362.0
43 1010 1307462.0 1242450.0 1846909.0 2497041.0 2028549.0
44 1020 1830357.0 1307462.0 1242450.0 1661678.0 2497041.0
... ... ... ... ... ... ...
1633 1510 1478393.0 1427820.0 921560.0 1308503.0 1680990.0
1634 1520 1046464.0 1478393.0 1427820.0 1272052.0 1308503.0
1635 1530 973020.0 1046464.0 1478393.0 2055839.0 1272052.0
1636 1540 1119713.0 973020.0 1046464.0 2834652.0 2055839.0
1637 1550 1747981.0 1119713.0 973020.0 7462887.0 2834652.0

857 rows × 6 columns

Choosing data preprocessing#

Preprocessors transform the data before training. They are automatically applied in reverse before returning predictions and computing metrics, so results are always reported on the original scale of the target.
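
For example, a preprocessor that transforms the target before training must undo that transformation on the model output. A minimal sketch of this forward/inverse contract (the class and method names are hypothetical, for illustration only):

import numpy as np

class LogVolumeSketch:
    # Hypothetical preprocessor: train on log-volume, report raw volume.
    def transform(self, df):
        df['VOLUME'] = np.log1p(df['VOLUME'])  # compress the heavy right tail
        return df

    def inverse_transform(self, df):
        df['VOLUME'] = np.expm1(df['VOLUME'])  # back to raw share counts
        return df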

feature = ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"])
preprocessors = [
    ml.LimitOutliers(std_num=4, columns=['VOLUME']),
    ml.ToPandas(),  # transitions the pipeline from `onetick-py` to pandas.
    ml.IntradayAveraging(window_days=5, datetime_column='Time', columns=['VOLUME'])
]

pipeline_combinations = [
    [feature],
    [feature, preprocessors[0]],
    [feature, preprocessors[1], preprocessors[2]],
    [feature, preprocessors[0], preprocessors[1], preprocessors[2]],
]

for pipeline_combination in pipeline_combinations:
    pipeline_combination += [ml.FilterValues()]
    exp = VolumePrediction()
    exp.pipeline = pipeline_combination
    exp.run()
    preprocessors_names = str(
        list(map(lambda x: x.__class__.__name__, exp.pipeline[1:]))
    )
    print(f'{preprocessors_names}: {exp.calc_metrics()}')
['FilterValues']: {'VOLUME_SMAPE': 0.2743134902590052, 'VOLUME_MAE': 326747.8456239629, 'VOLUME_RMSE': 470008.3582745656, 'VOLUME_R2': 0.6250890432751781}
['LimitOutliers', 'FilterValues']: {'VOLUME_SMAPE': 0.27119459674343876, 'VOLUME_MAE': 324566.37284875725, 'VOLUME_RMSE': 468368.3097248595, 'VOLUME_R2': 0.627700909177805}
['ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.31568997744798893, 'VOLUME_MAE': 357457.52574017394, 'VOLUME_RMSE': 502357.34866575064, 'VOLUME_R2': 0.5717055074917959}
['LimitOutliers', 'ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.32005699418478273, 'VOLUME_MAE': 361175.59018996696, 'VOLUME_RMSE': 512452.57496893825, 'VOLUME_R2': 0.5543187845039784}
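
On this dataset, clipping outliers improves every metric slightly, while intraday averaging hurts them all; preprocessing choices are worth benchmarking against the plain pipeline rather than taken on faith.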

Custom preprocessing (and reverse processing)#
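
The example below differences the target (predicting the change in volume rather than its level), trains on the differenced series, and then adds the previous bar's volume back to the predictions so that metrics are computed on the original scale.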

exp = VolumePrediction()

src = exp.get_data()
df = exp.datafeeds[0].run(src)

df["VOLUME_orig"] = df["VOLUME"]
df["VOLUME_shifted1"] = df["VOLUME"].shift(1)

# Apply custom data preprocessing
df["VOLUME"] -= df["VOLUME_shifted1"]
df = df.dropna().reset_index(drop=True)

exp.prepare_data(df)
exp.init_fit()
prediction = exp.predict(x=exp.x_test)

# Apply reverse data processing to get the prediction of the real value of the volume.
prediction["VOLUME_orig"] = (
    prediction["VOLUME"] + df.loc[exp.x_test.index]["VOLUME_shifted1"]
)
exp.calc_metrics(df[df.index.isin(prediction.index)][['VOLUME_orig']], prediction[['VOLUME_orig']])
{'VOLUME_orig_SMAPE': 0.2920900770070016,
 'VOLUME_orig_MAE': 345819.30728889885,
 'VOLUME_orig_RMSE': 524730.4546777669,
 'VOLUME_orig_R2': 0.5327067452924743}

Custom models#

Any model can be added by overriding the init_model and fit methods.

import numpy as np
from deepforest import CascadeForestRegressor


class DeepForestRegressor(ml.RegressorModel):
    model_class = CascadeForestRegressor

    def init_model(self, dsf_params={}, init_params={}):
        super().init_model(dsf_params=dsf_params, init_params=init_params)
        # Instantiate the underlying model directly, silencing its logging.
        self.model = self.model_class(**init_params, verbose=0)

    def fit(self, x_train, y_train, eval_set=None):
        # CascadeForestRegressor expects plain numpy arrays and a 1-D target.
        return super().fit(x_train.values, np.ravel(y_train.values))

VolumePrediction.models = [DeepForestRegressor()]
exp = VolumePrediction()

metrics, predictions = exp.run()
print(metrics)
exp.native_model
{'VOLUME_SMAPE': 0.2656213918060064, 'VOLUME_MAE': 322441.92260897433, 'VOLUME_RMSE': 471516.75728255726, 'VOLUME_R2': 0.6226787764784298}
CascadeForestRegressor(verbose=0)