# Source code for bayesmark.stats

# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""General statistic tools useful in the benchmark.
"""
import numpy as np
import scipy.stats as sst


def robust_standardize(X, q_level=0.5):
    """Perform robust standardization of data matrix `X` over axis 0.

    Each column is centered on its median and divided by the width of a
    central quantile range. Similar to
    :func:`sklearn:sklearn.preprocessing.robust_scale` except it also applies
    a Gaussian adjustment so that, if Gaussian data is passed in, the
    transformed data will, in large `n`, be distributed as N(0,1). See sklearn
    feature request #10139 on github.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,) or (n, d)
        Array containing the elements to standardize. All entries must be
        finite and ``n >= 2`` is required. The input is not modified.
    q_level : scalar
        Must be in (0, 1]. Probability mass of the central quantile range used
        for scale estimation; the default ``0.5`` is the inter-quartile range.
        Note that ``q_level=1`` passes validation but makes the Gaussian
        rescaling factor infinite, so the output is not finite in that case.

    Returns
    -------
    X_ss : :class:`numpy:numpy.ndarray` of same shape as `X`
        Standardized version of the input `X`.
    """
    X = np.asarray(X)
    assert X.ndim in (1, 2)
    assert np.all(np.isfinite(X))
    assert 0.0 < q_level and q_level <= 1.0
    assert X.shape[0] >= 2

    mu = np.median(X, axis=0)

    # Lower/upper probabilities of the central range holding q_level mass
    q0, q1 = 0.5 * (1.0 - q_level), 0.5 * (1.0 + q_level)
    # Single call so the data only needs to be partitioned once
    lower, upper = np.percentile(X, [100 * q0, 100 * q1], axis=0)
    spread = upper - lower
    # Constant columns have zero spread; use unit scale to avoid dividing by 0
    spread = np.where(spread == 0.0, 1.0, spread)

    X_ss = (X - mu) / spread
    # Rescale to match scale of N(0,1)
    X_ss = X_ss * (sst.norm.ppf(q1) - sst.norm.ppf(q0))
    assert X.shape == X_ss.shape
    return X_ss


def t_EB(x, alpha=0.05, axis=-1):
    """Get t-statistic based error bars on mean of `x`.

    Parameters
    ----------
    x : :class:`numpy:numpy.ndarray` of shape (..., n_samples, ...)
        Data points to estimate mean, with the samples laid out along `axis`.
        Must have at least one dimension and must not contain ``NaN``.
    alpha : float
        The alpha level (``1-confidence``) probability (in (0, 1)) to construct confidence interval from t-statistic.
    axis : int
        The axis on `x` where we compute the t-statistics. The function is vectorized over all other dimensions.

    Returns
    -------
    EB : float or :class:`numpy:numpy.ndarray`
        Size of error bar on mean (``>= 0``), with the shape of `x` minus the reduced `axis`. The confidence interval
        is ``[mean(x) - EB, mean(x) + EB]``. `EB` is ``inf`` when there is at most one sample along `axis`. Will be
        ``NaN`` if there are any infinite values in `x`.
    """
    assert np.ndim(x) >= 1 and (not np.any(np.isnan(x)))
    assert np.ndim(alpha) == 0
    assert 0.0 < alpha and alpha < 1.0

    # Split the shape into the sample count and the remaining (output) shape
    out_shape = list(np.shape(x))
    N = out_shape.pop(axis)
    if N <= 1:
        # No variance estimate is possible, so the error bar is unbounded
        return np.full(tuple(out_shape), fill_value=np.inf)

    confidence = 1 - alpha
    # loc cancels out when we just want EB anyway
    LB, UB = sst.t.interval(confidence, N - 1, loc=0.0, scale=1.0)
    assert not (LB > UB)
    # Just multiplying by sst.sem(x) is better than passing it as scale, which breaks when scale=0
    EB = 0.5 * sst.sem(x, axis=axis) * (UB - LB)
    return EB