# Source code for bayesmark.stats

# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""General statistic tools useful in the benchmark.
"""
import numpy as np
import scipy.stats as sst


def robust_standardize(X, q_level=0.5):
    """Robustly standardize an array along axis 0 (median/IQR based).

    Like :func:`sklearn:sklearn.preprocessing.robust_scale`, but with an extra
    Gaussian correction factor: for large ``n``, Gaussian input is mapped to
    (approximately) ``N(0, 1)`` output. See sklearn feature request #10139 on
    github.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n, ...)
        Array of data to standardize. Must be 1D or 2D, finite, and have
        ``n >= 2`` rows.
    q_level : scalar
        Quantile level in (0, 1] defining the inter-quantile range used as the
        robust scale estimate.

    Returns
    -------
    X : :class:`numpy:numpy.ndarray` of shape (n, ...)
        Standardized copy of the input, same shape as `X`.
    """
    X = np.asarray(X)
    assert X.ndim in (1, 2)
    assert np.all(np.isfinite(X))
    assert 0.0 < q_level <= 1.0
    assert X.shape[0] >= 2

    # Robust location estimate (per column for 2D input).
    center = np.median(X, axis=0)

    # Symmetric quantile pair around the median, e.g. (0.25, 0.75) by default.
    lower_q = 0.5 * (1.0 - q_level)
    upper_q = 0.5 * (1.0 + q_level)

    # Robust scale estimate: width of the inter-quantile range.
    spread = np.percentile(X, 100 * upper_q, axis=0) - np.percentile(X, 100 * lower_q, axis=0)
    spread = np.asarray(spread)
    # Guard against zero spread (e.g. constant columns) to avoid divide-by-zero.
    spread[spread == 0.0] = 1.0

    # Width of the same inter-quantile range under N(0,1); multiplying by this
    # makes large-n Gaussian input come out approximately standard normal.
    gauss_width = sst.norm.ppf(upper_q) - sst.norm.ppf(lower_q)

    X_out = ((X - center) / spread) * gauss_width
    assert X_out.shape == X.shape
    return X_out
def t_EB(x, alpha=0.05, axis=-1):
    """Compute the t-statistic based error bar on the mean of `x`.

    Parameters
    ----------
    x : :class:`numpy:numpy.ndarray` of shape (n_samples,)
        Data points used to estimate the mean. Must be non-empty and free of
        ``NaN`` values.
    alpha : float
        Alpha level (``1 - confidence``) in (0, 1) for the t-statistic based
        confidence interval.
    axis : int
        Axis of `x` along which the statistic is computed; the function is
        vectorized over all remaining axes.

    Returns
    -------
    EB : float
        Half-width of the error bar on the mean (``>= 0``): the confidence
        interval is ``[mean(x) - EB, mean(x) + EB]``. Equals ``inf`` when the
        axis has ``<= 1`` elements, and ``NaN`` if `x` contains infinities.
    """
    assert np.ndim(x) >= 1 and not np.any(np.isnan(x))
    assert np.ndim(alpha) == 0
    assert 0.0 < alpha < 1.0

    n = np.shape(x)[axis]
    if n <= 1:
        # Cannot estimate variability from a single sample: infinite error bar,
        # broadcast to the shape of the reduced array.
        return np.full(np.sum(x, axis=axis).shape, fill_value=np.inf)

    # Unit-scale two-sided t interval; loc cancels since only width is needed.
    lo, hi = sst.t.interval(1 - alpha, n - 1, loc=0.0, scale=1.0)
    assert not (lo > hi)

    # Scale the half-width by the standard error of the mean. Multiplying SEM
    # afterwards (rather than passing scale=sem) behaves better when SEM is 0.
    return 0.5 * sst.sem(x, axis=axis) * (hi - lo)