Quick Start#

import onetick.py as otp
from onetick import ml
from datetime import datetime

start = otp.dt(2022, 6, 1, 9, 30)
end = otp.dt(2022, 8, 1, 16, 0)

symbols = ['AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'BRKB', 'TSLA', 'XOM', 'UNH', 'FB']

otp.__version__, ml.__version__
('1.131.0', '1.0.8')

Fetch the data for use in feature creation#

We create a class that inherits from ml.BaseOnetickLoader, rather than using onetick-py directly, so that this code can be saved as part of the experiment.

class Bars(ml.BaseOnetickLoader):
    def get_source(self):
        data = otp.DataSource(db=self.schema.db, tick_type=self.schema.tick_type)

        # Aggregate ticks into fixed-size buckets of self.bucket seconds,
        # keeping the total volume and the standard deviation of VWAP.
        data = data.agg(
            {
                'VOLUME': otp.agg.sum(data['VOLUME']),
                'PRICE_STDEV': otp.agg.stddev(data['VWAP'])
            },
            bucket_interval=self.bucket,
            bucket_time='start'
        )

        # Keep only bars between the session start and end times
        # (here 9:30-16:00 New York time).
        data = data.time_filter(start_time=self.start.strftime('%H%M%S%f')[:-3],
                                end_time=self.end.strftime('%H%M%S%f')[:-3],
                                timezone='EST5EDT')
        # Drop buckets with no trading volume.
        data, _ = data[data['VOLUME'] > 0]

        # Adjust volumes for corporate actions (e.g., splits) as of the end date.
        data = otp.functions.corp_actions(data,
                                          adjustment_date=int(self.end.strftime('%Y%m%d')),
                                          adjust_rule='SIZE',
                                          fields='VOLUME')

        return data

Describe the experiment#

The code below defines a complete experiment. The data for the features comes from the Bars loader we defined above. splitters provide an easy way to define the train/validation/test datasets by splitting the data by time: the most natural split for time series data. pipeline defines the operations that turn the data into features (a pandas sketch of what CalcLags produces follows the class). evaluators specify which metrics we'd like to track. Note that features_columns accepts regular expressions, so 'VOLUME_LAG_.*' matches every lag column created by CalcLags.

class VolumePrediction(ml.Experiment):
    datafeeds = [
        Bars(db="NYSE_TAQ_BARS", tick_type="TRD_1M", symbols=["SPY"],
            start=start, end=end, bucket=600)
    ]
    
    splitters = [
        ml.TimeSplitter(
            val_time_range=(datetime(2022, 7, 1), datetime(2022, 7, 15)), 
            test_time_range=(datetime(2022, 7, 16), datetime(2022, 8, 1))
        )
    ]
    
    pipeline = [
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.FilterValues(),
    ]

    target_columns = ["VOLUME"]
    features_columns = ["VOLUME_LAG_.*"]

    models = [
        ml.LGBMRegressor()
    ]

    train_params = {
        "verbose": 0,
        "n_jobs": 1,
    }
    
    evaluators = [ml.SMAPEEvaluator(), ml.MAEEvaluator(), ml.RMSEEvaluator(), ml.R2Evaluator()]
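
As a reference for what the pipeline produces, CalcLags adds one shifted copy of each listed column per period. A minimal pandas sketch of the equivalent transformation (illustrative only; not onetick.ml's actual implementation):

import pandas as pd

def calc_lags_sketch(df: pd.DataFrame, periods, columns) -> pd.DataFrame:
    # For each (column, period) pair, add the value observed `period` bars
    # earlier; with 39 ten-minute bars per trading day, VOLUME_LAG_39 is
    # the same bar one trading day back.
    for column in columns:
        for period in periods:
            df[f'{column}_LAG_{period}'] = df[column].shift(period)
    return df

The first max(periods) rows of such a frame contain NaNs, which is presumably what ml.FilterValues() removes at the end of the pipeline.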

Run the experiment#

Running the experiment takes just two lines. The run method returns the metrics and the predictions for the test set: here 390 rows, i.e. 39 ten-minute bars per day over the 10 trading days in the test range.

exp = VolumePrediction()
exp.run()
({'VOLUME_SMAPE': 0.2743134902590052,
  'VOLUME_MAE': 326747.8456239629,
  'VOLUME_RMSE': 470008.3582745656,
  'VOLUME_R2': 0.6250890432751781},
             VOLUME
 1169  4.010690e+06
 1170  2.057323e+06
 1171  1.831997e+06
 1172  2.054107e+06
 1173  1.319127e+06
 ...            ...
 1554  1.466467e+06
 1555  1.021391e+06
 1556  1.483459e+06
 1557  2.262253e+06
 1558  4.710050e+06
 
 [390 rows x 1 columns])

Benchmark#

We encourage specifying a baseline for all experiments. The default implementation returns the value of the target column from the previous period as the prediction. calc_baseline() returns the metrics achieved by the baseline.

exp.calc_baseline()
{'VOLUME_SMAPE': 0.2896273436579182,
 'VOLUME_MAE': 369453.6503856041,
 'VOLUME_RMSE': 622992.0611725156,
 'VOLUME_R2': 0.3319314929190944}
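
For intuition, the default baseline amounts to a one-period shift of the target; given a DataFrame df of bars, a minimal pandas sketch:

# Naive baseline: predict this bar's volume with the previous bar's volume.
baseline_prediction = df['VOLUME'].shift(1)

The untuned LGBMRegressor above already beats this baseline on every metric; the tuning below improves on it further.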

Hyperparameter Tuning and Cross Validation#

Trying different variants of the experiment is easy with the class-based implementation. To see whether the LightGBM parameters need tuning, we inherit from VolumePrediction, specify which parameter values to try, and define how to do cross-validation.

class TuneHP(VolumePrediction):
    models = [
        ml.LGBMRegressor(
            init_params={
                'n_estimators': [50, 100],
                'num_leaves': [5, 15],
                'max_depth': [4, 6],
                'boosting_type': ['dart'],
                'random_state': [0],
            }
        )
    ]

    train_params = {
        "search_cv": {
            "val_type": "WalkForward",  # Simple, Cross, WalkForward
            "folds": 5,
            "eval_metric": "MAPE",
            "search_optimization": "grid", # grid, random
        },
        "verbose": 0,
        "n_jobs": 1,        
    }
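
The "WalkForward" option validates each configuration on folds that train on the past and evaluate on the period that follows, avoiding look-ahead leakage. Conceptually it resembles scikit-learn's TimeSeriesSplit; a minimal sketch of 5-fold walk-forward splitting (an analogy, not onetick.ml's internal code; x_train stands for the training frame, e.g. exp.x_train):

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(tscv.split(x_train)):
    # Each fold trains on a growing prefix of the history and
    # validates on the window immediately after it.
    print(f'fold {fold}: train rows 0-{train_idx[-1]}, '
          f'validate rows {val_idx[0]}-{val_idx[-1]}')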

When we run the experiment, the best model is chosen and its metrics / predictions are returned. Metrics for all configurations of hyperparameters are saved in the models_cv_results attribute.

exp = TuneHP()
metrics, predictions = exp.run()
print(f'best params: {exp.current_model_params}')
metrics
best params: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0}
{'VOLUME_SMAPE': 0.2478779599136575,
 'VOLUME_MAE': 297712.3940706993,
 'VOLUME_RMSE': 485468.3771650355,
 'VOLUME_R2': 0.6000194604104256}
exp.models_cv_results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
params mean_test_score std_test_score
3 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.240006 0.031134
2 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.242201 0.030909
6 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.242201 0.030909
5 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.244420 0.038989
1 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.244707 0.037736
7 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.245251 0.031519
0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.250377 0.035989
4 {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.250377 0.035989

Choosing the optimal model#

We can extend the experiment above to iterate over different models and different sets of hyperparameters.

class TuneHPMultipleModels(TuneHP):
    models = [
        ml.LGBMRegressor(
            init_params={
                'n_estimators': [50, 100],
                'num_leaves': [5, 15],
                'max_depth': [2, 4],
                'boosting_type': ['dart'],
                'random_state': [0],
            }
        ),
        ml.XGBRegressor(
            init_params={
                "n_estimators": [100],
                "learning_rate": [.03, .01, .1],
                "max_depth": [3, 4],
                "min_child_weight": [2, 4],
                "max_delta_step": [0],
                "subsample": [0.9],
                "nthread": [2],
            }
        ),
        ml.DecisionTreeRegressor(),
    ]
exp = TuneHPMultipleModels()

exp.run()

print(f'optimal model: {exp.native_model.__class__.__name__}')
print(f'model hyperparameters: {exp.current_model_params}.')
exp.calc_metrics()
optimal model: LGBMRegressor
model hyperparameters: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0}.
{'VOLUME_SMAPE': 0.2478779599136575,
 'VOLUME_MAE': 297712.3940706993,
 'VOLUME_RMSE': 485468.3771650355,
 'VOLUME_R2': 0.6000194604104256}
exp.models_cv_results[['model', 'params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
model params mean_test_score std_test_score
7 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.240006 0.031134
6 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.242201 0.030909
2 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} -0.243641 0.027961
3 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} -0.243641 0.027961
5 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.244707 0.037736
11 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.248076 0.019661
9 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.248615 0.019109
4 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.250377 0.035989
8 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.252785 0.021762
10 XGBRegressor_id1 {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.252928 0.021975
0 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} -0.255324 0.032334
1 LGBMRegressor_id0 {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} -0.255324 0.032334
17 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.267213 0.016335
16 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.274488 0.020649
19 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.276416 0.011191
18 XGBRegressor_id1 {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.280452 0.021100
13 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.345178 0.047891
12 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.345687 0.050165
15 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.350349 0.048117
14 XGBRegressor_id1 {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} -0.353408 0.050002
20 DecisionTreeRegressor_id2 {} -0.367982 0.012032
exp.native_model
LGBMRegressor(boosting_type='dart', max_depth=4, num_leaves=15,
              objective='regression', random_state=0)
exp.current_model_params
{'boosting_type': 'dart',
 'max_depth': 4,
 'n_estimators': 100,
 'num_leaves': 15,
 'random_state': 0}

Feature Engineering#

Choosing features is also easy. First we define the set of candidate features, then run experiments that include different subsets.

class Features(VolumePrediction):
    pipeline = [
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.CalcLags(periods=[1], columns=["PRICE_STDEV"]),
        # Time of day as an integer, e.g. 0940 for the 9:40 bar.
        ml.ExpressionOperator(
            new_column_name='HHMM',
            expression=lambda tick: tick['Time'].dt.strftime('%H%M').apply(int)
        ),
    ]
Features.features_columns = ['VOLUME_LAG_.*', 'PRICE_STDEV.*']
exp_vwap = Features()
metrics, predictions = exp_vwap.run()
metrics
{'VOLUME_SMAPE': 0.23360807060180794,
 'VOLUME_MAE': 281094.7707189861,
 'VOLUME_RMSE': 416791.03960749425,
 'VOLUME_R2': 0.7051821885895728}
Features.features_columns = ['VOLUME_LAG_.*', 'HHMM']
exp_hhmm = Features()
metrics, predictions = exp_hhmm.run()
metrics
{'VOLUME_SMAPE': 0.2216830358322816,
 'VOLUME_MAE': 254899.56849576902,
 'VOLUME_RMSE': 363871.66144811024,
 'VOLUME_R2': 0.7752946368668814}

Adding a feature in pandas#

We can also add features in pandas by first executing the onetick-py part of the pipeline to get a DataFrame and then adding columns to it.

import pandas as pd

exp = VolumePrediction()

exp.pipeline = [
    ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
    ml.SelectFeatures(columns=["feature_hhmm"]),
    ml.FilterValues(),
]

src = exp.get_data()
df = exp.datafeeds[0].run(src)

df["feature_hhmm"] = pd.to_datetime(df["Time"]).dt.strftime("%H%M")

exp.prepare_data(df)

exp.x_train
feature_hhmm VOLUME_LAG_1 VOLUME_LAG_2 VOLUME_LAG_3 VOLUME_LAG_39 VOLUME_LAG_40
40 0940 2508915.0 4144521.0 1652787.0 1907568.0 2342203.0
41 0950 1846909.0 2508915.0 4144521.0 1420362.0 1907568.0
42 1000 1242450.0 1846909.0 2508915.0 2028549.0 1420362.0
43 1010 1307462.0 1242450.0 1846909.0 2497041.0 2028549.0
44 1020 1830357.0 1307462.0 1242450.0 1661678.0 2497041.0
... ... ... ... ... ... ...
1633 1510 1478393.0 1427820.0 921560.0 1308503.0 1680990.0
1634 1520 1046464.0 1478393.0 1427820.0 1272052.0 1308503.0
1635 1530 973020.0 1046464.0 1478393.0 2055839.0 1272052.0
1636 1540 1119713.0 973020.0 1046464.0 2834652.0 2055839.0
1637 1550 1747981.0 1119713.0 973020.0 7462887.0 2834652.0

857 rows × 6 columns

Choosing data preprocessing#

Preprocessors transform the data before training. They are automatically applied in reverse before returning predictions and computing metrics, so results are always reported on the original scale of the target.
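
For example, a preprocessor that transforms the target before training must undo that transformation on the model output. A minimal sketch of this forward/inverse contract (the class and method names are hypothetical, for illustration only):

import numpy as np

class LogVolumeSketch:
    # Hypothetical preprocessor: train on log-volume, report raw volume.
    def transform(self, df):
        df['VOLUME'] = np.log1p(df['VOLUME'])  # compress the heavy right tail
        return df

    def inverse_transform(self, df):
        df['VOLUME'] = np.expm1(df['VOLUME'])  # back to raw share counts
        return df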

feature = ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"])
preprocessors = [
    ml.LimitOutliers(std_num=4, columns=['VOLUME']),
    ml.ToPandas(),  # transitions the pipeline from `onetick-py` to pandas.
    ml.IntradayAveraging(window_days=5, datetime_column='Time', columns=['VOLUME'])
]

pipeline_combinations = [
    [feature],
    [feature, preprocessors[0]],
    [feature, preprocessors[1], preprocessors[2]],
    [feature, preprocessors[0], preprocessors[1], preprocessors[2]],
]

for pipeline_combination in pipeline_combinations:
    pipeline_combination += [ml.FilterValues()]
    exp = VolumePrediction()
    exp.pipeline = pipeline_combination
    exp.run()
    preprocessors_names = str(
        list(map(lambda x: x.__class__.__name__, exp.pipeline[1:]))
    )
    print(f'{preprocessors_names}: {exp.calc_metrics()}')
['FilterValues']: {'VOLUME_SMAPE': 0.2743134902590052, 'VOLUME_MAE': 326747.8456239629, 'VOLUME_RMSE': 470008.3582745656, 'VOLUME_R2': 0.6250890432751781}
['LimitOutliers', 'FilterValues']: {'VOLUME_SMAPE': 0.27119459674343876, 'VOLUME_MAE': 324566.37284875725, 'VOLUME_RMSE': 468368.3097248595, 'VOLUME_R2': 0.627700909177805}
['ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.31568997744798893, 'VOLUME_MAE': 357457.52574017394, 'VOLUME_RMSE': 502357.34866575064, 'VOLUME_R2': 0.5717055074917959}
['LimitOutliers', 'ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.32005699418478273, 'VOLUME_MAE': 361175.59018996696, 'VOLUME_RMSE': 512452.57496893825, 'VOLUME_R2': 0.5543187845039784}
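
On this dataset, clipping outliers improves every metric slightly, while intraday averaging hurts them all; preprocessing choices are worth benchmarking against the plain pipeline rather than taken on faith.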

Custom preprocessing (and reverse processing)#
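
The example below differences the target (predicting the change in volume rather than its level), trains on the differenced series, and then adds the previous bar's volume back to the predictions so that metrics are computed on the original scale.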

exp = VolumePrediction()

src = exp.get_data()
df = exp.datafeeds[0].run(src)

df["VOLUME_orig"] = df["VOLUME"]
df["VOLUME_shifted1"] = df["VOLUME"].shift(1)

# Apply custom data preprocessing
df["VOLUME"] -= df["VOLUME_shifted1"]
df = df.dropna().reset_index(drop=True)

exp.prepare_data(df)
exp.init_fit()
prediction = exp.predict(x=exp.x_test)

# Apply reverse data processing to get the prediction of the real value of the volume.
prediction["VOLUME_orig"] = (
    prediction["VOLUME"] + df.loc[exp.x_test.index]["VOLUME_shifted1"]
)
exp.calc_metrics(df[df.index.isin(prediction.index)][['VOLUME_orig']], prediction[['VOLUME_orig']])
{'VOLUME_orig_SMAPE': 0.2920900770070016,
 'VOLUME_orig_MAE': 345819.30728889885,
 'VOLUME_orig_RMSE': 524730.4546777669,
 'VOLUME_orig_R2': 0.5327067452924743}

Custom models#

Any model can be added by overriding the init_model and fit methods.

import numpy as np
from deepforest import CascadeForestRegressor


class DeepForestRegressor(ml.RegressorModel):
    model_class = CascadeForestRegressor

    def init_model(self, dsf_params={}, init_params={}):
        super().init_model(dsf_params=dsf_params, init_params=init_params)
        # Instantiate the underlying model directly, silencing its logging.
        self.model = self.model_class(**init_params, verbose=0)

    def fit(self, x_train, y_train, eval_set=None):
        # CascadeForestRegressor expects plain numpy arrays and a 1-D target.
        return super().fit(x_train.values, np.ravel(y_train.values))

VolumePrediction.models = [DeepForestRegressor()]
exp = VolumePrediction()

metrics, predictions = exp.run()
print(metrics)
exp.native_model
{'VOLUME_SMAPE': 0.2656213918060064, 'VOLUME_MAE': 322441.92260897433, 'VOLUME_RMSE': 471516.75728255726, 'VOLUME_R2': 0.6226787764784298}
CascadeForestRegressor(verbose=0)