# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Routines to build a standardized interface to make `sklearn` hyper-parameter tuning problems look like an objective
function.

This file mostly contains a dictionary collection of all sklearn test funcs.

The format of each element in `MODELS` is:
model_name: (model_class, fixed_param_dict, search_param_api_dict)
`model_name` is an arbitrary name to refer to a certain strategy.
At usage time, the optimizer instance is created using:
``model_class(**kwarg_dict)``
The kwarg dict is `fixed_param_dict` + `search_param_dict`. The
`search_param_dict` comes from a optimizer which is configured using the
`search_param_api_dict`. See the API description for information on setting up
the `search_param_api_dict`.
"""
import os.path
import pickle as pkl
import warnings
from abc import ABC, abstractmethod

import numpy as np
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from bayesmark.constants import ARG_DELIM, METRICS, MODEL_NAMES, VISIBLE_TO_OPT
from bayesmark.data import METRICS_LOOKUP, ProblemType, get_problem_type, load_data
from bayesmark.path_util import absopen
from bayesmark.space import JointSpace
from bayesmark.util import str_join_safe

# Using 3 would be faster, but 5 is the most realistic CV split (5-fold)
CV_SPLITS = 5

# We should add categorical variables to some of these configurations, but a lot of
# the wrappers for the BO methods have trouble with categorical types.

# kNN
knn_cfg = {
    "n_neighbors": {"type": "int", "space": "linear", "range": (1, 25)},
    "p": {"type": "int", "space": "linear", "range": (1, 4)},
}

# SVM
svm_cfg = {
    "C": {"type": "real", "space": "log", "range": (1.0, 1e3)},
    "gamma": {"type": "real", "space": "log", "range": (1e-4, 1e-3)},
    "tol": {"type": "real", "space": "log", "range": (1e-5, 1e-1)},
}

# DT
dt_cfg = {
    "max_depth": {"type": "int", "space": "linear", "range": (1, 15)},
    "min_samples_split": {"type": "real", "space": "logit", "range": (0.01, 0.99)},
    "min_samples_leaf": {"type": "real", "space": "logit", "range": (0.01, 0.49)},
    "min_weight_fraction_leaf": {"type": "real", "space": "logit", "range": (0.01, 0.49)},
    "max_features": {"type": "real", "space": "logit", "range": (0.01, 0.99)},
    "min_impurity_decrease": {"type": "real", "space": "linear", "range": (0.0, 0.5)},
}

# RF
rf_cfg = {
    "max_depth": {"type": "int", "space": "linear", "range": (1, 15)},
    "max_features": {"type": "real", "space": "logit", "range": (0.01, 0.99)},
    "min_samples_split": {"type": "real", "space": "logit", "range": (0.01, 0.99)},
    "min_samples_leaf": {"type": "real", "space": "logit", "range": (0.01, 0.49)},
    "min_weight_fraction_leaf": {"type": "real", "space": "logit", "range": (0.01, 0.49)},
    "min_impurity_decrease": {"type": "real", "space": "linear", "range": (0.0, 0.5)},
}

# MLP with ADAM
mlp_adam_cfg = {
    "hidden_layer_sizes": {"type": "int", "space": "linear", "range": (50, 200)},
    "alpha": {"type": "real", "space": "log", "range": (1e-5, 1e1)},
    "batch_size": {"type": "int", "space": "linear", "range": (10, 250)},
    "learning_rate_init": {"type": "real", "space": "log", "range": (1e-5, 1e-1)},
    "tol": {"type": "real", "space": "log", "range": (1e-5, 1e-1)},
    "validation_fraction": {"type": "real", "space": "logit", "range": (0.1, 0.9)},
    "beta_1": {"type": "real", "space": "logit", "range": (0.5, 0.99)},
    "beta_2": {"type": "real", "space": "logit", "range": (0.9, 1.0 - 1e-6)},
    "epsilon": {"type": "real", "space": "log", "range": (1e-9, 1e-6)},
}

# MLP with SGD
mlp_sgd_cfg = {
    "hidden_layer_sizes": {"type": "int", "space": "linear", "range": (50, 200)},
    "alpha": {"type": "real", "space": "log", "range": (1e-5, 1e1)},
    "batch_size": {"type": "int", "space": "linear", "range": (10, 250)},
    "learning_rate_init": {"type": "real", "space": "log", "range": (1e-5, 1e-1)},
    "power_t": {"type": "real", "space": "logit", "range": (0.1, 0.9)},
    "tol": {"type": "real", "space": "log", "range": (1e-5, 1e-1)},
    "momentum": {"type": "real", "space": "logit", "range": (0.001, 0.999)},
    "validation_fraction": {"type": "real", "space": "logit", "range": (0.1, 0.9)},
}

# AdaBoostClassifier
ada_cfg = {
    "n_estimators": {"type": "int", "space": "linear", "range": (10, 100)},
    "learning_rate": {"type": "real", "space": "log", "range": (1e-4, 1e1)},
}

# lasso
lasso_cfg = {
    "C": {"type": "real", "space": "log", "range": (1e-2, 1e2)},
    "intercept_scaling": {"type": "real", "space": "log", "range": (1e-2, 1e2)},
}

# linear
linear_cfg = {
    "C": {"type": "real", "space": "log", "range": (1e-2, 1e2)},
    "intercept_scaling": {"type": "real", "space": "log", "range": (1e-2, 1e2)},
}

MODELS_CLF = {
    "kNN": (KNeighborsClassifier, {}, knn_cfg),
    "SVM": (SVC, {"kernel": "rbf", "probability": True}, svm_cfg),
    "DT": (DecisionTreeClassifier, {"max_leaf_nodes": None}, dt_cfg),
    "RF": (RandomForestClassifier, {"n_estimators": 10, "max_leaf_nodes": None}, rf_cfg),
    "MLP-adam": (MLPClassifier, {"solver": "adam", "early_stopping": True}, mlp_adam_cfg),
    "MLP-sgd": (
        MLPClassifier,
        {"solver": "sgd", "early_stopping": True, "learning_rate": "invscaling", "nesterovs_momentum": True},
        mlp_sgd_cfg,
    ),
    "ada": (AdaBoostClassifier, {}, ada_cfg),
    "lasso": (
        LogisticRegression,
        {"penalty": "l1", "fit_intercept": True, "solver": "liblinear", "multi_class": "ovr"},
        lasso_cfg,
    ),
    "linear": (
        LogisticRegression,
        {"penalty": "l2", "fit_intercept": True, "solver": "liblinear", "multi_class": "ovr"},
        linear_cfg,
    ),
}

# For now, we will assume the default is to go thru all classifiers
assert sorted(MODELS_CLF.keys()) == sorted(MODEL_NAMES)

ada_cfg_reg = {
    "n_estimators": {"type": "int", "space": "linear", "range": (10, 100)},
    "learning_rate": {"type": "real", "space": "log", "range": (1e-4, 1e1)},
}

lasso_cfg_reg = {
    "alpha": {"type": "real", "space": "log", "range": (1e-2, 1e2)},
    "fit_intercept": {"type": "bool"},
    "normalize": {"type": "bool"},
    "max_iter": {"type": "int", "space": "log", "range": (10, 5000)},
    "tol": {"type": "real", "space": "log", "range": (1e-5, 1e-1)},
    "positive": {"type": "bool"},
}

linear_cfg_reg = {
    "alpha": {"type": "real", "space": "log", "range": (1e-2, 1e2)},
    "fit_intercept": {"type": "bool"},
    "normalize": {"type": "bool"},
    "max_iter": {"type": "int", "space": "log", "range": (10, 5000)},
    "tol": {"type": "real", "space": "log", "range": (1e-4, 1e-1)},
}

MODELS_REG = {
    "kNN": (KNeighborsRegressor, {}, knn_cfg),
    "SVM": (SVR, {"kernel": "rbf"}, svm_cfg),
    "DT": (DecisionTreeRegressor, {"max_leaf_nodes": None}, dt_cfg),
    "RF": (RandomForestRegressor, {"n_estimators": 10, "max_leaf_nodes": None}, rf_cfg),
    "MLP-adam": (MLPRegressor, {"solver": "adam", "early_stopping": True}, mlp_adam_cfg),
    "MLP-sgd": (
        MLPRegressor,  # regression crashes often with relu
        {
            "activation": "tanh",
            "solver": "sgd",
            "early_stopping": True,
            "learning_rate": "invscaling",
            "nesterovs_momentum": True,
        },
        mlp_sgd_cfg,
    ),
    "ada": (AdaBoostRegressor, {}, ada_cfg_reg),
    "lasso": (Lasso, {}, lasso_cfg_reg),
    "linear": (Ridge, {"solver": "auto"}, linear_cfg_reg),
}

# If both classifiers and regressors match MODEL_NAMES then the experiment
# launcher can simply go thru the cartesian product and do all combos.
assert sorted(MODELS_REG.keys()) == sorted(MODEL_NAMES)
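
# Illustrative sketch only (not part of the original module): how an entry of `MODELS_CLF`/`MODELS_REG` is expanded
# into an estimator, i.e., the optimizer-suggested params are merged with the fixed params and passed as kwargs to
# the model class (mirroring `SklearnModel.evaluate` below). The helper name `_example_build_estimator` and the
# sample call are assumptions for illustration.
def _example_build_estimator(model_name, suggested_params, classifier=True):
    """Illustrative helper: build an sklearn estimator from a `MODELS` entry and an optimizer suggestion dict."""
    model_class, fixed_params, _api_config = (MODELS_CLF if classifier else MODELS_REG)[model_name]
    kwargs = dict(suggested_params)  # copy so the caller's dict is not modified
    kwargs.update(fixed_params)  # fixed params take precedence, as in `SklearnModel.evaluate`
    return model_class(**kwargs)


# e.g., _example_build_estimator("kNN", {"n_neighbors": 5, "p": 2}) -> KNeighborsClassifier(n_neighbors=5, p=2)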


class TestFunction(ABC):
    """Abstract base class for test functions in the benchmark. These do not need to be ML hyper-parameter tuning.
    """

    def __init__(self):
        """Setup general test function for benchmark. We assume the test function knows the meta-data about the search
        space, but is also stateless to fit modeling assumptions. To keep stateless, it does not do things like count
        the number of function evaluations.
        """
        # This will need to be set before using other routines
        self.api_config = None

    @abstractmethod
    def evaluate(self, params):
        """Abstract method to evaluate the function at a parameter setting.
        """

    def get_api_config(self):
        """Get the API config for this test problem.

        Returns
        -------
        api_config : dict(str, dict(str, object))
            The API config for the used model. See README for API description.
        """
        assert self.api_config is not None, "API config is not set."
        return self.api_config
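
# Illustrative only (not part of the original module): a minimal `TestFunction` subclass showing that the interface
# is not tied to ML hyper-parameter tuning; any objective over a declared search space fits. The class name
# `_ExampleQuadratic`, its search space, and the single-objective setup are assumptions for illustration.
class _ExampleQuadratic(TestFunction):
    """Toy objective: a shifted quadratic over two real variables."""

    # Mirror the concrete classes below, which declare the objectives their `evaluate` returns
    objective_names = (VISIBLE_TO_OPT,)

    def __init__(self):
        TestFunction.__init__(self)
        self.api_config = {
            "x": {"type": "real", "space": "linear", "range": (-5.0, 5.0)},
            "y": {"type": "real", "space": "linear", "range": (-5.0, 5.0)},
        }

    def evaluate(self, params):
        # Return a tuple of losses, one per entry of `objective_names`
        return ((params["x"] - 1.0) ** 2 + (params["y"] + 2.0) ** 2,)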


class SklearnModel(TestFunction):
    """Test class for sklearn classifier/regressor CV score objective functions.
    """

    # Map our short names for metrics to the full length sklearn name
    _METRIC_MAP = {
        "nll": "neg_log_loss",
        "acc": "accuracy",
        "mae": "neg_mean_absolute_error",
        "mse": "neg_mean_squared_error",
    }

    # This can be static and constant for now
    objective_names = (VISIBLE_TO_OPT, "generalization")

    def __init__(self, model, dataset, metric, shuffle_seed=0, data_root=None):
        """Build class that wraps sklearn classifier/regressor CV score for use as an objective function.

        Parameters
        ----------
        model : str
            Which classifier to use, must be key in `MODELS_CLF` or `MODELS_REG` dict depending on if dataset is
            classification or regression.
        dataset : str
            Which data set to use, must be key in `DATA_LOADERS` dict, or name of custom csv file.
        metric : str
            Which sklearn scoring metric to use, in `SCORERS_CLF` list or `SCORERS_REG` dict depending on if dataset
            is classification or regression.
        shuffle_seed : int
            Random seed to use when splitting the data into train and validation in the cross-validation splits. This
            is needed in order to keep the split constant across calls. Otherwise there would be extra noise in the
            objective function for varying splits.
        data_root : str
            Root directory to look for all custom csv files.
        """
        TestFunction.__init__(self)

        data, target, problem_type = load_data(dataset, data_root=data_root)
        assert problem_type in (ProblemType.clf, ProblemType.reg)
        self.is_classifier = problem_type == ProblemType.clf

        # Do some validation on loaded data
        assert isinstance(data, np.ndarray)
        assert isinstance(target, np.ndarray)
        assert data.ndim == 2 and target.ndim == 1
        assert data.shape[0] == target.shape[0]
        assert data.size > 0
        assert data.dtype == np.float_
        assert np.all(np.isfinite(data))  # also catch nan
        assert target.dtype == (np.int_ if self.is_classifier else np.float_)
        assert np.all(np.isfinite(target))  # also catch nan

        model_lookup = MODELS_CLF if self.is_classifier else MODELS_REG
        base_model, fixed_params, api_config = model_lookup[model]

        # New members for model
        self.base_model = base_model
        self.fixed_params = fixed_params
        self.api_config = api_config

        # Always shuffle your data to be safe. Use fixed seed for reprod.
        self.data_X, self.data_Xt, self.data_y, self.data_yt = train_test_split(
            data, target, test_size=0.2, random_state=shuffle_seed, shuffle=True
        )

        assert metric in METRICS, "Unknown metric %s" % metric
        assert metric in METRICS_LOOKUP[problem_type], "Incompatible metric %s with problem type %s" % (
            metric,
            problem_type,
        )
        self.scorer = get_scorer(SklearnModel._METRIC_MAP[metric])

    def evaluate(self, params):
        """Evaluate the sklearn CV objective at a particular parameter setting.

        Parameters
        ----------
        params : dict(str, object)
            The varying (non-fixed) parameter dict to the sklearn model.

        Returns
        -------
        cv_loss : float
            Average loss over CV splits for sklearn model when tested using the settings in params.
        """
        params = dict(params)  # copy to avoid modification of original
        params.update(self.fixed_params)  # add in fixed params

        # now build the skl object
        clf = self.base_model(**params)

        assert np.all(np.isfinite(self.data_X)), "all features must be finite"
        assert np.all(np.isfinite(self.data_y)), "all targets must be finite"

        # Do the x-val, ignore user warn since we expect BO to try weird stuff
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            S = cross_val_score(clf, self.data_X, self.data_y, scoring=self.scorer, cv=CV_SPLITS)
        # Take the mean score across all x-val splits
        cv_score = np.mean(S)

        # Now let's get the generalization error for same hypers
        clf = self.base_model(**params)
        clf.fit(self.data_X, self.data_y)
        generalization_score = self.scorer(clf, self.data_Xt, self.data_yt)

        # get_scorer makes everything a score not a loss, so we need to negate to get the loss back
        cv_loss = -cv_score
        assert np.isfinite(cv_loss), "loss not even finite"
        generalization_loss = -generalization_score
        assert np.isfinite(generalization_loss), "loss not even finite"

        # Unbox to basic float to keep it simple
        cv_loss = cv_loss.item()
        assert isinstance(cv_loss, float)
        generalization_loss = generalization_loss.item()
        assert isinstance(generalization_loss, float)

        # For now, score with same objective. We can later add generalization error
        return cv_loss, generalization_loss

    @staticmethod
    def test_case_str(model, dataset, scorer):
        """Generate the combined test case string from model, dataset, and scorer combination."""
        test_case = str_join_safe(ARG_DELIM, (model, dataset, scorer))
        return test_case

    @staticmethod
    def inverse_test_case_str(test_case):
        """Inverse of `test_case_str`."""
        model, dataset, scorer = test_case.split(ARG_DELIM)
        assert test_case == SklearnModel.test_case_str(model, dataset, scorer)
        return model, dataset, scorer


class SklearnSurrogate(TestFunction):
    """Test class for sklearn classifier/regressor CV score objective function surrogates.
    """

    # This can be static and constant for now
    objective_names = (VISIBLE_TO_OPT, "generalization")

    def __init__(self, model, dataset, scorer, path):
        """Build class that wraps sklearn classifier/regressor CV score for use as an objective function surrogate.

        Parameters
        ----------
        model : str
            Which classifier to use, must be key in `MODELS_CLF` or `MODELS_REG` dict depending on if dataset is
            classification or regression.
        dataset : str
            Which data set to use, must be key in `DATA_LOADERS` dict, or name of custom csv file.
        scorer : str
            Which sklearn scoring metric to use, in `SCORERS_CLF` list or `SCORERS_REG` dict depending on if dataset
            is classification or regression.
        path : str
            Root directory to look for all pickle files.
        """
        TestFunction.__init__(self)

        # Find the space class, we could consider putting this in pkl too
        problem_type = get_problem_type(dataset)
        assert problem_type in (ProblemType.clf, ProblemType.reg)
        _, _, self.api_config = MODELS_CLF[model] if problem_type == ProblemType.clf else MODELS_REG[model]
        self.space = JointSpace(self.api_config)

        # Load the pre-trained model
        fname = SklearnModel.test_case_str(model, dataset, scorer) + ".pkl"

        if isinstance(path, bytes):
            # This is for test-ability, we could use mock instead.
            self.model = pkl.loads(path)
        else:
            path = os.path.join(path, fname)  # pragma: io
            assert os.path.isfile(path), "Model file not found: %s" % path

            with absopen(path, "rb") as f:  # pragma: io
                self.model = pkl.load(f)  # pragma: io
        assert callable(getattr(self.model, "predict", None))

    def evaluate(self, params):
        """Evaluate the sklearn CV objective at a particular parameter setting.

        Parameters
        ----------
        params : dict(str, object)
            The varying (non-fixed) parameter dict to the sklearn model.

        Returns
        -------
        overall_loss : float
            Average loss over CV splits for sklearn model when tested using the settings in params.
        """
        x = self.space.warp([params])
        y, = self.model.predict(x)

        assert y.shape == (len(self.objective_names),)
        assert y.dtype.kind == "f"
        assert np.all(-np.inf < y)  # Will catch nan too

        y = tuple(y.tolist())  # Make consistent with SklearnModel typing
        return y
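

# Minimal usage sketch (not part of the original module): wrap one model/dataset/metric combination as an objective
# function and evaluate it at a single hyper-parameter setting. The dataset name "iris" and the parameter values are
# assumptions for illustration; any key of `DATA_LOADERS` (or a custom csv file) and any point in `knn_cfg` would do.
if __name__ == "__main__":
    test_problem = SklearnModel(model="kNN", dataset="iris", metric="acc")
    print(test_problem.get_api_config())  # search space the optimizer should be configured with
    cv_loss, generalization_loss = test_problem.evaluate({"n_neighbors": 5, "p": 2})
    print(cv_loss, generalization_loss)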