Experiment Management
Contents
Experiment Management#
Introduction#
This guide illustrates how to track experiments using MLFlow and how to serve models.
# %%capture
# !pip install -U onetick-ml
from onetick import ml
import onetick.py as otp
'otml:', ml.__version__, ' otp:', otp.__version__
('otml:', '1.0.8', ' otp:', '1.135.0')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
# Time range of the data requested by the experiment's datafeed.
start = otp.dt(2022, 5, 10, 9, 30)
end = otp.dt(2022, 11, 10, 16, 0)
class VolumePrediction(ml.Experiment):
    """Experiment that predicts the ``VOLUME`` column from its own lagged values.

    Data comes from a single OneTick bars datafeed (``SPY`` from
    ``NYSE_TAQ_BARS``), lag features are produced by ``ml.CalcLags`` and the
    model is an ``ml.LGBMRegressor`` scored with MAPE, MAE, RMSE and R2.
    """

    # Column to predict, and the pattern matching the VOLUME_LAG_* feature columns.
    target_columns = ["VOLUME"]
    features_columns = ["VOLUME_LAG_.*"]

    # DATA
    datafeeds = [
        ml.OneTickBarsDatafeed(
            db="NYSE_TAQ_BARS",
            tick_type="TRD_1M",
            symbols=["SPY"],
            start=start,
            end=end,
            bucket=600,  # presumably seconds, i.e. 10-minute bars — TODO confirm
        ),
    ]

    # 15% of the data goes to the test set and 15% to the validation set.
    splitters = [ml.PercentageSplitter(test_size=0.15, val_size=0.15)]

    # Builds the lagged VOLUME columns that are used as features.
    pipeline = [ml.CalcLags(periods=[1, 2, 3, 39, 40], columns=["VOLUME"])]

    # MODEL
    models = [ml.LGBMRegressor()]
    train_params = {"verbose": 0}

    # EVALUATION
    evaluators = [
        ml.MAPEEvaluator(),
        ml.MAEEvaluator(),
        ml.RMSEEvaluator(),
        ml.R2Evaluator(),
    ]
Run the full experiment cycle and get the resulting metrics and predictions#
exp = VolumePrediction()
# keep the serialized config so the experiment can be rebuilt later with ml.build_experiment()
config = exp.serialize_config()
# run the whole experiment cycle; returns the metrics and the predictions
metrics, predictions = exp.run()
# test-set feature columns (displayed below)
exp.x_test
VOLUME_LAG_1 | VOLUME_LAG_2 | VOLUME_LAG_3 | VOLUME_LAG_39 | VOLUME_LAG_40 | |
---|---|---|---|---|---|
4276 | 1332339.0 | 1118539.0 | 1042534.0 | 1903358.0 | 2712928.0 |
4277 | 1346291.0 | 1332339.0 | 1118539.0 | 1262840.0 | 1903358.0 |
4278 | 730047.0 | 1346291.0 | 1332339.0 | 1339100.0 | 1262840.0 |
4279 | 1117967.0 | 730047.0 | 1346291.0 | 1386015.0 | 1339100.0 |
4280 | 1162810.0 | 1117967.0 | 730047.0 | 1382818.0 | 1386015.0 |
... | ... | ... | ... | ... | ... |
5026 | 1680948.0 | 1553053.0 | 784539.0 | 1282829.0 | 1377165.0 |
5027 | 2092322.0 | 1680948.0 | 1553053.0 | 1193638.0 | 1282829.0 |
5028 | 1929999.0 | 2092322.0 | 1680948.0 | 1460215.0 | 1193638.0 |
5029 | 2503589.0 | 1929999.0 | 2092322.0 | 2799459.0 | 1460215.0 |
5030 | 4004600.0 | 2503589.0 | 1929999.0 | 4517255.0 | 2799459.0 |
755 rows × 5 columns
pd.DataFrame([metrics])
VOLUME_MAPE | VOLUME_MAE | VOLUME_RMSE | VOLUME_R2 | |
---|---|---|---|---|
0 | 0.230435 | 359508.613699 | 615230.295819 | 0.609123 |
Local saving and loading models#
You can save the trained model by calling the save_model()
method of the experiment:
# save the trained model to a local file
# NOTE(review): '.cbm' is the extension CatBoost uses for its models, while this
# experiment trains an LGBMRegressor — confirm the file name is intended
exp.save_model('./model.cbm')
# metrics of the trained model, for comparison with the restored model
metrics = exp.calc_metrics()
metrics
{'VOLUME_MAPE': 0.23043529978382501,
'VOLUME_MAE': 359508.6136989899,
'VOLUME_RMSE': 615230.2958193648,
'VOLUME_R2': 0.6091232005321459}
To restore a model, we first initialize the experiment and prepare the data, and then call load_model()
instead of going through the model training stage.
# rebuild the experiment from the config serialized earlier
exp = ml.build_experiment(config)
# data stage
exp.get_data()
exp.prepare_data()
# model load stage: load the saved model instead of training a new one
model = exp.load_model(ml.LGBMRegressor(), './model.cbm')
# evaluate stage
predictions = exp.predict(model=model)
metrics = exp.calc_metrics()
metrics
{'VOLUME_MAPE': 0.23043529978382501,
'VOLUME_MAE': 359508.6136989899,
'VOLUME_RMSE': 615230.2958193648,
'VOLUME_R2': 0.6091232005321459}
MLFlow usage#
Save experiment to MLFlow#
Special attributes in experiment define MLFlow tracking capabilities.
- log_models (bool) – enable logging of the trained model.
- experiment_name (str) – the name of the experiment.
- mlflow_url (str) – MLFlow tracking URL used to log parameters, metrics and artifacts.
After running the whole cycle of an experiment, you can save it to MLFlow by calling the .save_mlflow_run()
method.
class MLFlowLoggedExperiment(VolumePrediction):
    """``VolumePrediction`` experiment configured for MLFlow tracking."""

    # MLFlow settings: log the trained model, experiment name, tracking URL.
    general = {
        'log_models': True,
        'experiment_name': 'example-experiment',
        'mlflow_url': 'http://172.16.1.89:5000/',
    }
experiment = MLFlowLoggedExperiment()
# run the whole experiment cycle first
metrics, predictions = experiment.run()
# save the run to MLFlow; keep run_id to restore the experiment later
run_id = experiment.save_mlflow_run()
metrics
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/01/08 20:17:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 408
Created version '408' of model 'LGBMRegressor'.
2025/01/08 20:17:43 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2022-12-21; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'onetick-ml', 'onetick-py-test'}
Registered model 'wrapped_LGBMRegressor' already exists. Creating a new version of this model...
2025/01/08 20:17:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: wrapped_LGBMRegressor, version 408
Created version '408' of model 'wrapped_LGBMRegressor'.
{'VOLUME_MAPE': 0.23043529978382501,
'VOLUME_MAE': 359508.6136989899,
'VOLUME_RMSE': 615230.2958193648,
'VOLUME_R2': 0.6091232005321459}
Restore experiment from MLFlow#
We use the run_id
produced in the previous step to call the restore_experiment_from_mlflow()
utility function. This function reconstructs the experiment and restores the trained model.
# Load experiment from MLflow
experiment = ml.restore_experiment_from_mlflow(mlflow_url='http://172.16.1.89:5000/',
run_id=run_id)
# data stage
experiment.get_data()
experiment.prepare_data()
# the trained model is restored together with the experiment, so we can skip
# the model stage and go directly to prediction and metrics calculation
predictions = experiment.predict()
metrics = experiment.calc_metrics()
# metrics are the same as in the previous step
metrics
/builds/solutions/ml-ops/ds-framework/onetick/ml/utils/func.py:109: FutureWarning: ``mlflow.tracking.client.MlflowClient.download_artifacts`` is deprecated since 2.0. This method will be removed in a future release. Use ``mlflow.artifacts.download_artifacts`` instead.
config_path = client.download_artifacts(run_id, "config.yaml", tmpdirname)
/builds/solutions/ml-ops/ds-framework/onetick/ml/utils/func.py:112: FutureWarning: ``mlflow.tracking.client.MlflowClient.download_artifacts`` is deprecated since 2.0. This method will be removed in a future release. Use ``mlflow.artifacts.download_artifacts`` instead.
hashes_file = client.download_artifacts(run_id, "datahashes.yaml", tmpdirname)
{'VOLUME_MAPE': 0.23043529978382501,
'VOLUME_MAE': 359508.6136989899,
'VOLUME_RMSE': 615230.2958193648,
'VOLUME_R2': 0.6091232005321459}