Quick Start
Contents
Quick Start#
import onetick.py as otp
from onetick import ml
from datetime import datetime
start = otp.dt(2022, 6, 1, 9, 30)
end = otp.dt(2022, 8, 1, 16, 0)
symbols = ['AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'BRKB', 'TSLA', 'XOM', 'UNH', 'FB']
otp.__version__, ml.__version__
('1.135.0', '1.0.8')
Fetch the data for use in feature creation#
We create a class that inherits from ml.BaseOnetickLoader
rather than using onetick-py
directly in order to be able to save this code as part of the experiment.
class Bars(ml.BaseOnetickLoader):
    """Loader that turns a tick source into bucketed VOLUME / PRICE_STDEV bars.

    Defined as an ``ml.BaseOnetickLoader`` subclass so the query code can be
    saved together with the experiment that uses it.
    """

    def get_source(self):
        """Build and return the onetick-py source that feeds the experiment.

        Reads ``self.schema.db`` / ``self.schema.tick_type`` and uses
        ``self.bucket``, ``self.start`` and ``self.end`` to shape the query.
        """
        ticks = otp.DataSource(db=self.schema.db, tick_type=self.schema.tick_type)

        # One output row per bucket: total VOLUME and the standard deviation
        # of VWAP inside the bucket, stamped with the bucket start time.
        aggregations = {
            'VOLUME': otp.agg.sum(ticks['VOLUME']),
            'PRICE_STDEV': otp.agg.stddev(ticks['VWAP']),
        }
        bars = ticks.agg(
            aggregations,
            bucket_interval=self.bucket,
            bucket_time='start',
        )

        # '%H%M%S%f' renders HHMMSS followed by six microsecond digits;
        # dropping the last three leaves millisecond precision (HHMMSSmmm).
        time_of_day = '%H%M%S%f'
        window_open = self.start.strftime(time_of_day)[:-3]
        window_close = self.end.strftime(time_of_day)[:-3]
        bars = bars.time_filter(
            start_time=window_open,
            end_time=window_close,
            timezone='EST5EDT',
        )

        # Filtering yields a pair; only the branch matching VOLUME > 0 is kept.
        bars, _ = bars[bars['VOLUME'] > 0]

        # Adjust VOLUME with the 'SIZE' rule, using the end date (YYYYMMDD)
        # as the adjustment date.
        adjustment_date = int(self.end.strftime('%Y%m%d'))
        return otp.functions.corp_actions(
            bars,
            adjustment_date=adjustment_date,
            adjust_rule='SIZE',
            fields='VOLUME',
        )
Describe the experiment#
The code below defines a complete experiment. The data for features comes from Bars
we defined above. splitters
provide an easy way to define train/validate/test datasets by splitting the data by time: the most natural way to do it for time series data. pipeline
defines the operations on the data to create features out of it. evaluators
specify which metrics we’d like to track.
class VolumePrediction(ml.Experiment):
    """Baseline experiment: predict bar VOLUME for SPY from its own lags.

    Everything is declared as class attributes, so variants of the experiment
    can be created by subclassing and overriding individual attributes.
    """

    # Source data: the Bars loader over [start, end]; ``bucket`` is what Bars
    # passes on as the aggregation bucket interval.
    datafeeds = [
        Bars(
            db="NYSE_TAQ_BARS",
            tick_type="TRD_1M",
            symbols=["SPY"],
            start=start,
            end=end,
            bucket=600,
        ),
    ]

    # Time-based split into validation and test ranges.
    # NOTE(review): rows outside both ranges are presumably the training
    # set - confirm against ml.TimeSplitter.
    splitters = [
        ml.TimeSplitter(
            val_time_range=(datetime(2022, 7, 1), datetime(2022, 7, 15)),
            test_time_range=(datetime(2022, 7, 16), datetime(2022, 8, 1)),
        ),
    ]

    # Feature construction: lagged copies of VOLUME, then value filtering.
    # NOTE(review): FilterValues presumably drops the rows left incomplete by
    # the lagging step - confirm.
    pipeline = [
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        ml.FilterValues(),
    ]

    target_columns = ["VOLUME"]
    # Pattern selecting the lag columns produced by the pipeline.
    features_columns = ["VOLUME_LAG_.*"]

    models = [
        ml.LGBMRegressor(),
    ]
    train_params = {
        "verbose": 0,
        "n_jobs": 1,
    }

    # Metrics reported by run() / calc_metrics().
    evaluators = [
        ml.SMAPEEvaluator(),
        ml.MAEEvaluator(),
        ml.RMSEEvaluator(),
        ml.R2Evaluator(),
    ]
Run the experiment#
It’s two lines. The run
method returns the metrics and the predictions.
exp = VolumePrediction()
exp.run()
({'VOLUME_SMAPE': 0.2743134902590052,
'VOLUME_MAE': 326747.8456239629,
'VOLUME_RMSE': 470008.3582745656,
'VOLUME_R2': 0.6250890432751781},
VOLUME
1169 4.010690e+06
1170 2.057323e+06
1171 1.831997e+06
1172 2.054107e+06
1173 1.319127e+06
... ...
1554 1.466467e+06
1555 1.021391e+06
1556 1.483459e+06
1557 2.262253e+06
1558 4.710050e+06
[390 rows x 1 columns])
Benchmark#
We encourage specifying a baseline for all experiments. The default implementation returns the value of the target
column from the previous period as the prediction. calc_baseline()
returns the metrics achieved by the baseline.
exp.calc_baseline()
{'VOLUME_SMAPE': 0.2896273436579182,
'VOLUME_MAE': 369453.6503856041,
'VOLUME_RMSE': 622992.0611725156,
'VOLUME_R2': 0.3319314929190944}
Hyperparameter Tuning and Cross Validation#
Trying different variants of the experiment is easy with the class-based implementation. To see if LightGBM params need to be tuned, we inherit from VolumePrediction
, specify which params to try, and how to do cross-validation.
class TuneHP(VolumePrediction):
    """VolumePrediction variant that searches LightGBM hyperparameters.

    Overrides only ``models`` and ``train_params``; every other setting is
    inherited from VolumePrediction.
    """

    # Each init_params entry lists the candidate values to try for that
    # hyperparameter.
    models = [
        ml.LGBMRegressor(
            init_params={
                'n_estimators': [50, 100],
                'num_leaves': [5, 15],
                'max_depth': [4, 6],
                'boosting_type': ['dart'],
                'random_state': [0],
            },
        ),
    ]

    train_params = {
        "search_cv": {
            "val_type": "WalkForward",  # options: Simple, Cross, WalkForward
            "folds": 5,
            "eval_metric": "MAPE",
            "search_optimization": "grid",  # options: grid, random
        },
        "verbose": 0,
        "n_jobs": 1,
    }
When we run the experiment, the best model will be chosen and its metrics / predictions returned. Metrics for all configurations of hyperparameters are saved in the models_cv_results
attribute.
exp = TuneHP()
metrics, predictions = exp.run()
print(f'best params: {exp.current_model_params}')
metrics
best params: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0}
{'VOLUME_SMAPE': 0.2478779599136575,
'VOLUME_MAE': 297712.3940706993,
'VOLUME_RMSE': 485468.3771650355,
'VOLUME_R2': 0.6000194604104256}
exp.models_cv_results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
params | mean_test_score | std_test_score | |
---|---|---|---|
3 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} | -0.240006 | 0.031134 |
2 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} | -0.242201 | 0.030909 |
6 | {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} | -0.242201 | 0.030909 |
5 | {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} | -0.244420 | 0.038989 |
1 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} | -0.244707 | 0.037736 |
7 | {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} | -0.245251 | 0.031519 |
0 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} | -0.250377 | 0.035989 |
4 | {'boosting_type': 'dart', 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} | -0.250377 | 0.035989 |
Choosing the optimal model#
We can extend the experiment above to iterate over different models and different sets of hyperparameters.
class TuneHPMultipleModels(TuneHP):
    """TuneHP variant that compares several model families in one search.

    Only ``models`` is overridden; the cross-validation settings come from
    TuneHP's ``train_params``.
    """

    models = [
        # LightGBM: candidate values per hyperparameter.
        ml.LGBMRegressor(
            init_params={
                'n_estimators': [50, 100],
                'num_leaves': [5, 15],
                'max_depth': [2, 4],
                'boosting_type': ['dart'],
                'random_state': [0],
            },
        ),
        # XGBoost: candidate values per hyperparameter.
        ml.XGBRegressor(
            init_params={
                "n_estimators": [100],
                "learning_rate": [0.03, 0.01, 0.1],
                "max_depth": [3, 4],
                "min_child_weight": [2, 4],
                "max_delta_step": [0],
                "subsample": [0.9],
                "nthread": [2],
            },
        ),
        # Decision tree with no hyperparameter candidates given.
        ml.DecisionTreeRegressor(),
    ]
exp = TuneHPMultipleModels()
exp.run()
print(f'optimal model: {exp.native_model.__class__.__name__}')
print(f'model hyperparameters: {exp.current_model_params}.')
exp.calc_metrics()
optimal model: LGBMRegressor
model hyperparameters: {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0}.
{'VOLUME_SMAPE': 0.2478779599136575,
'VOLUME_MAE': 297712.3940706993,
'VOLUME_RMSE': 485468.3771650355,
'VOLUME_R2': 0.6000194604104256}
exp.models_cv_results[['model', 'params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False)
model | params | mean_test_score | std_test_score | |
---|---|---|---|---|
7 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} | -0.240006 | 0.031134 |
6 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} | -0.242201 | 0.030909 |
2 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 0} | -0.243641 | 0.027961 |
3 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 100, 'num_leaves': 15, 'random_state': 0} | -0.243641 | 0.027961 |
5 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} | -0.244707 | 0.037736 |
11 | XGBRegressor_id1 | {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.248076 | 0.019661 |
9 | XGBRegressor_id1 | {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.248615 | 0.019109 |
4 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 4, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} | -0.250377 | 0.035989 |
8 | XGBRegressor_id1 | {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.252785 | 0.021762 |
10 | XGBRegressor_id1 | {'learning_rate': 0.03, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.252928 | 0.021975 |
0 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 5, 'random_state': 0} | -0.255324 | 0.032334 |
1 | LGBMRegressor_id0 | {'boosting_type': 'dart', 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 15, 'random_state': 0} | -0.255324 | 0.032334 |
17 | XGBRegressor_id1 | {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.267213 | 0.016335 |
16 | XGBRegressor_id1 | {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.274488 | 0.020649 |
19 | XGBRegressor_id1 | {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.276416 | 0.011191 |
18 | XGBRegressor_id1 | {'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.280452 | 0.021100 |
13 | XGBRegressor_id1 | {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.345178 | 0.047891 |
12 | XGBRegressor_id1 | {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.345687 | 0.050165 |
15 | XGBRegressor_id1 | {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.350349 | 0.048117 |
14 | XGBRegressor_id1 | {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 2, 'subsample': 0.9} | -0.353408 | 0.050002 |
20 | DecisionTreeRegressor_id2 | {} | -0.366573 | 0.010257 |
exp.native_model
LGBMRegressor(boosting_type='dart', max_depth=4, num_leaves=15, objective='regression', random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMRegressor(boosting_type='dart', max_depth=4, num_leaves=15, objective='regression', random_state=0)
exp.current_model_params
{'boosting_type': 'dart',
'max_depth': 4,
'n_estimators': 100,
'num_leaves': 15,
'random_state': 0}
Feature Engineering#
Trying out different feature sets is also easy. First we define the set of candidate features; then we run experiments that use different subsets of them.
class Features(VolumePrediction):
    """VolumePrediction variant whose pipeline builds a wider set of candidate features.

    The pipeline only creates the columns; which of them are used is chosen
    separately through ``features_columns``.
    """

    pipeline = [
        # Lagged VOLUME values, same periods as in VolumePrediction.
        ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
        # Previous-bar PRICE_STDEV.
        ml.CalcLags(periods=[1], columns=["PRICE_STDEV"]),
        # Time of day as an integer built from the 'Time' column formatted
        # with '%H%M'.
        ml.ExpressionOperator(
            new_column_name='HHMM',
            expression=lambda tick: tick['Time'].dt.strftime('%H%M').apply(int),
        ),
    ]
# Feature set 1: the volume lags plus the PRICE_STDEV-derived columns.
# The assignment is made on the class, so it applies to every Features
# instance from here on.
# NOTE(review): the entries look like regular-expression patterns - confirm
# how ml.Experiment matches them.
Features.features_columns = ['VOLUME_LAG_.*', 'PRICE_STDEV.*']
exp_vwap = Features()
metrics, predictions = exp_vwap.run()
metrics
{'VOLUME_SMAPE': 0.23360807060180794,
'VOLUME_MAE': 281094.7707189861,
'VOLUME_RMSE': 416791.03960749425,
'VOLUME_R2': 0.7051821885895728}
# Feature set 2: the volume lags plus the HHMM time-of-day column.
# The assignment is made on the class, so it applies to every Features
# instance from here on.
Features.features_columns = ['VOLUME_LAG_.*', 'HHMM']
exp_hhmm = Features()
# Run the experiment instance created for this feature set.
metrics, predictions = exp_hhmm.run()
metrics
{'VOLUME_SMAPE': 0.2216830358322816,
'VOLUME_MAE': 254899.56849576902,
'VOLUME_RMSE': 363871.66144811024,
'VOLUME_R2': 0.7752946368668814}
Adding a feature in pandas#
We can add features to the DataFrame by first executing onetick-py
code.
import pandas as pd
# Build the experiment and replace its pipeline on this instance only.
exp = VolumePrediction()
exp.pipeline = [
    ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"]),
    # NOTE(review): presumably adds the pandas-built "feature_hhmm" column to
    # the model features - confirm against ml.SelectFeatures.
    ml.SelectFeatures(columns=["feature_hhmm"]),
    ml.FilterValues(),
]
# Run the onetick-py part first to obtain a DataFrame.
src = exp.get_data()
df = VolumePrediction().datafeeds[0].run(src)
# Add the extra feature in pandas: time of day of each row as an "HHMM" string.
df["feature_hhmm"] = pd.to_datetime(df["Time"]).dt.strftime("%H%M")
# Hand the augmented DataFrame back to the experiment, then inspect the
# resulting training features.
exp.prepare_data(df)
exp.x_train
feature_hhmm | VOLUME_LAG_1 | VOLUME_LAG_2 | VOLUME_LAG_3 | VOLUME_LAG_39 | VOLUME_LAG_40 | |
---|---|---|---|---|---|---|
40 | 0940 | 2508915.0 | 4144521.0 | 1652787.0 | 1907568.0 | 2342203.0 |
41 | 0950 | 1846909.0 | 2508915.0 | 4144521.0 | 1420362.0 | 1907568.0 |
42 | 1000 | 1242450.0 | 1846909.0 | 2508915.0 | 2028549.0 | 1420362.0 |
43 | 1010 | 1307462.0 | 1242450.0 | 1846909.0 | 2497041.0 | 2028549.0 |
44 | 1020 | 1830357.0 | 1307462.0 | 1242450.0 | 1661678.0 | 2497041.0 |
... | ... | ... | ... | ... | ... | ... |
1633 | 1510 | 1478393.0 | 1427820.0 | 921560.0 | 1308503.0 | 1680990.0 |
1634 | 1520 | 1046464.0 | 1478393.0 | 1427820.0 | 1272052.0 | 1308503.0 |
1635 | 1530 | 973020.0 | 1046464.0 | 1478393.0 | 2055839.0 | 1272052.0 |
1636 | 1540 | 1119713.0 | 973020.0 | 1046464.0 | 2834652.0 | 2055839.0 |
1637 | 1550 | 1747981.0 | 1119713.0 | 973020.0 | 7462887.0 | 2834652.0 |
857 rows × 6 columns
Choosing data preprocessing#
Preprocessors are automatically applied in reverse before returning predictions and computing metrics.
# Feature step shared by every pipeline variant.
feature = ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"])

# Candidate preprocessing steps to compare.
preprocessors = [
    ml.LimitOutliers(std_num=4, columns=['VOLUME']),
    ml.ToPandas(),  # transitions the pipeline from `onetick-py` to pandas.
    ml.IntradayAveraging(window_days=5, datetime_column='Time', columns=['VOLUME']),
]

# Variants: no preprocessing, outlier limiting only, intraday averaging only,
# and both together.
pipeline_combinations = [
    [feature],
    [feature, preprocessors[0]],
    [feature, preprocessors[1], preprocessors[2]],
    [feature, preprocessors[0], preprocessors[1], preprocessors[2]],
]

for pipeline_combination in pipeline_combinations:
    # Every variant ends with the FilterValues step (appended in place).
    pipeline_combination.append(ml.FilterValues())

    exp = VolumePrediction()
    exp.pipeline = pipeline_combination
    exp.run()

    # Label the result with the class names of every step after the shared
    # feature step.
    preprocessors_names = str(
        [step.__class__.__name__ for step in exp.pipeline[1:]]
    )
    print(f'{preprocessors_names}: {exp.calc_metrics()}')
['FilterValues']: {'VOLUME_SMAPE': 0.2743134902590052, 'VOLUME_MAE': 326747.8456239629, 'VOLUME_RMSE': 470008.3582745656, 'VOLUME_R2': 0.6250890432751781}
['LimitOutliers', 'FilterValues']: {'VOLUME_SMAPE': 0.27119459674343876, 'VOLUME_MAE': 324566.37284875725, 'VOLUME_RMSE': 468368.3097248595, 'VOLUME_R2': 0.627700909177805}
['ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.31568997744798893, 'VOLUME_MAE': 357457.52574017394, 'VOLUME_RMSE': 502357.34866575064, 'VOLUME_R2': 0.5717055074917959}
['LimitOutliers', 'ToPandas', 'IntradayAveraging', 'FilterValues']: {'VOLUME_SMAPE': 0.32005699418478273, 'VOLUME_MAE': 361175.59018996696, 'VOLUME_RMSE': 512452.57496893825, 'VOLUME_R2': 0.5543187845039784}
Custom preprocessing (and reverse processing)#
# Fetch the raw bars as a DataFrame.
exp = VolumePrediction()
src = exp.get_data()
df = VolumePrediction().datafeeds[0].run(src)
# Keep the untouched volume and the previous row's volume; both are needed
# later to undo the transformation.
df["VOLUME_orig"] = df["VOLUME"]
df["VOLUME_shifted1"] = df["VOLUME"].shift(1)
# Apply custom data preprocessing
# (the target becomes the change in volume relative to the previous row).
df["VOLUME"] -= df["VOLUME_shifted1"]
# Drop rows containing NaN (the first row has no previous value) and renumber
# the index, so the positions used below refer to this cleaned frame.
df = df.dropna().reset_index(drop=True)
exp.prepare_data(df)
exp.init_fit()
prediction = exp.predict(x=exp.x_test)
# Apply reverse data processing to get the prediction of the real value of the volume.
# (predicted change + previous row's volume, aligned on the x_test index).
prediction["VOLUME_orig"] = (
    prediction["VOLUME"] + df.loc[exp.x_test.index]["VOLUME_shifted1"]
)
# Score the reconstructed predictions against the original volume on the
# rows that were predicted.
exp.calc_metrics(df[df.index.isin(prediction.index)][['VOLUME_orig']], prediction[['VOLUME_orig']])
{'VOLUME_orig_SMAPE': 0.2920900770070016,
'VOLUME_orig_MAE': 345819.30728889885,
'VOLUME_orig_RMSE': 524730.4546777669,
'VOLUME_orig_R2': 0.5327067452924743}
Custom models#
Any model can be added by specifying the init_model
and fit
methods.
import numpy as np
from deepforest import CascadeForestRegressor
class DeepForestRegressor(ml.RegressorModel):
    """Custom model wrapping deepforest's ``CascadeForestRegressor``.

    Shows the two hooks a custom model overrides: ``init_model`` to build the
    native estimator and ``fit`` to adapt the training data to it.
    """

    model_class = CascadeForestRegressor

    def init_model(self, dsf_params=None, init_params=None):
        """Create the native estimator.

        Parameters
        ----------
        dsf_params : dict, optional
            Passed on to the base class ``init_model``. Defaults to an empty
            dict.
        init_params : dict, optional
            Keyword arguments for ``model_class``. Defaults to an empty dict.
        """
        # Use None sentinels rather than ``{}`` defaults: a dict literal in the
        # signature is created once and shared by every call.
        dsf_params = {} if dsf_params is None else dsf_params
        init_params = {} if init_params is None else init_params
        super().init_model(dsf_params=dsf_params, init_params=init_params)
        self.model = self.model_class(**init_params, verbose=0)

    def fit(self, x_train, y_train, eval_set=None):
        """Fit on plain arrays: feature values and a flattened 1-D target.

        ``eval_set`` is accepted for interface compatibility but is not used.
        """
        return super().fit(x_train.values, np.ravel(y_train.values))
# NOTE: this rebinds the class attribute, so every VolumePrediction created
# afterwards (and any subclass that does not override `models`) uses the
# custom model too.
VolumePrediction.models = [DeepForestRegressor()]
exp = VolumePrediction()
metrics, predictions = exp.run()
print(metrics)
# Show the underlying native estimator.
exp.native_model
{'VOLUME_SMAPE': 0.2665700566114956, 'VOLUME_MAE': 322841.09200641024, 'VOLUME_RMSE': 471288.5314573883, 'VOLUME_R2': 0.6230439537727517}
CascadeForestRegressor(verbose=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CascadeForestRegressor(verbose=0)