Source code for bayesmark.quantiles

# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute quantiles and confidence intervals.
"""
import numpy as np
import scipy.stats as ss

from bayesmark.np_util import isclose_lte


def ensure_shape(x, y):
    """Util to broadcast on var to another but only when shape is different.

    This way we don't convert scalar into array type unnecessarily.
    """
    shape_y = np.shape(y)
    if np.shape(x) == shape_y:
        return x
    return np.broadcast_to(x, shape_y)


[docs]def order_stats(X): """Compute order statistics on sample `X`. Follows convention that order statistic 1 is minimum and statistic n is maximum. Therefore, array elements ``0`` and ``n+1`` are ``-inf`` and ``+inf``. Parameters ---------- X : :class:`numpy:numpy.ndarray` of shape (n,) Data for order statistics. Can be vectorized. Must be sortable data type (which is almost everything). Returns ------- o_stats : :class:`numpy:numpy.ndarray` of shape (n+2,) Order statistics on `X`. """ assert np.ndim(X) >= 1 # NaN is not allowed since it does not have well defined order. assert not np.any(np.isnan(X)) X_shape = np.shape(X) inf_pad = np.full(X_shape[:-1] + (1,), np.inf) o_stats = np.concatenate((-inf_pad, np.sort(X, axis=-1), inf_pad), axis=-1) return o_stats
def _quantile(n, q): idx = np.ceil(n * q).astype(int) return idx
[docs]def quantile(X, q): """Computes `q` th quantile of `X`. Similar to :func:`numpy:numpy.percentile` except that it matches the mathematical definition of a quantile *and* `q` is scaled in (0,1) rather than (0,100). Parameters ---------- X : :class:`numpy:numpy.ndarray` of shape (n,) Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything). q : float Quantile to compute, must be in (0, 1). Can be vectorized. Returns ------- estimate : dtype of `X`, scalar Empirical `q` quantile from sample `X`. """ assert np.ndim(X) >= 1 # We could robustify things to allow the edge cases, but maybe later assert np.all(0 < q) and np.all(q < 1) # Currently don't support broadcasting both at same time assert np.ndim(X) == 1 or np.ndim(q) == 0 n = X.shape[-1] idx = _quantile(n, q) o_stats = order_stats(X) estimate = o_stats[..., idx] return estimate
def _quantile_CI(n, q, alpha): # Use in case there is -inf case from being at extreme of distn idx_lower = np.fmax(0, ss.binom.ppf(alpha / 2.0, n, q)).astype(int) assert np.all(isclose_lte(ss.binom.cdf(idx_lower - 1, n, q), alpha / 2.0)) assert np.all(isclose_lte(alpha / 2.0, ss.binom.cdf(idx_lower, n, q))) assert np.all(0 <= idx_lower) and np.all(idx_lower <= n + 1) idx_upper = np.fmax(0, ss.binom.isf(alpha / 2.0, n, q)).astype(int) + 1 assert np.all(isclose_lte(ss.binom.sf(idx_upper - 1, n, q), alpha / 2.0)) assert np.all(isclose_lte(alpha / 2.0, ss.binom.sf(idx_upper - 2, n, q))) assert np.all(isclose_lte(1 - (alpha / 2.0), ss.binom.cdf(idx_upper - 1, n, q))) assert np.all(isclose_lte(ss.binom.cdf(idx_upper - 2, n, q), 1 - (alpha / 2.0))) assert np.all(0 <= idx_upper) and np.all(idx_upper <= n + 1) C = ss.binom.cdf(idx_upper - 1, n, q) - ss.binom.cdf(idx_lower - 1, n, q) assert np.all(isclose_lte(1.0 - alpha, C)) return idx_lower, idx_upper
[docs]def quantile_CI(X, q, alpha=0.05): """Calculate CI on `q` quantile from same `X` using nonparametric estimation from order statistics. This will have alpha level of at most `alpha` due to the discrete nature of order statistics. Parameters ---------- X : :class:`numpy:numpy.ndarray` of shape (n,) Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything). q : float Quantile to compute, must be in (0, 1). Can be vectorized. alpha : float False positive rate we allow for CI, must be in (0, 1). Can be vectorized. Returns ------- LB : dtype of `X`, scalar Lower end on CI UB : dtype of `X`, scalar Upper end on CI """ assert np.ndim(X) >= 1 # We could robustify things to allow the edge cases, but maybe later assert np.all(0 < q) and np.all(q < 1) assert np.all(0 < alpha) and np.all(alpha < 1) # Currently don't support broadcasting both at same time assert np.ndim(X) == 1 or (np.ndim(q) == 0 and np.ndim(alpha) == 0) n = X.shape[-1] idx_lower, idx_upper = _quantile_CI(n, q, alpha) o_stats = order_stats(X) LB, UB = o_stats[..., idx_lower], o_stats[..., idx_upper] return LB, UB
[docs]def max_quantile_CI(X, q, m, alpha=0.05): """Calculate CI on `q` quantile of distribution on max of `m` iid samples using a data set `X`. This uses nonparametric estimation from order statistics and will have alpha level of at most `alpha` due to the discrete nature of order statistics. Parameters ---------- X : :class:`numpy:numpy.ndarray` of shape (n,) Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything). q : float Quantile to compute, must be in (0, 1). Can be vectorized. m : int Compute statistics for distribution on max over `m` samples. Must be ``>= 1``. Can be vectorized. alpha : float False positive rate we allow for CI, must be in (0, 1). Can be vectorized. Returns ------- estimate : dtype of `X`, scalar Best estimate on `q` quantile on max over `m` iid samples. LB : dtype of `X`, scalar Lower end on CI UB : dtype of `X`, scalar Upper end on CI """ # X and alpha used/checked below in quantile_CI routine. # We could robustify things to allow the edge cases, but maybe later assert np.all(0 < q) and np.all(q < 1) # Could check int but if someone wants to interpolate, we will let them. assert np.all(m >= 1) # Currently don't support broadcasting both at same time assert np.ndim(X) == 1 or (np.ndim(q) == 0 and np.ndim(q) == 0 and np.ndim(alpha) == 0) q = q ** (1.0 / m) o_stats = order_stats(X) n = X.shape[-1] idx = _quantile(n, q) idx_lower, idx_upper = _quantile_CI(n, q, alpha=alpha) LB, UB = o_stats[..., idx_lower], o_stats[..., idx_upper] # Might need to broadcast estimate out if vectorization is in alpha estimate = ensure_shape(o_stats[..., idx], LB) return estimate, LB, UB
[docs]def min_quantile_CI(X, q, m, alpha=0.05): """Calculate confidence interval on `q` quantile of distribution on min of `m` iid samples using a data set `X`. This uses nonparametric estimation from order statistics and will have alpha level of at most `alpha` due to the discrete nature of order statistics. Parameters ---------- X : :class:`numpy:numpy.ndarray` of shape (n,) Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything). q : float Quantile to compute, must be in (0, 1). Can be vectorized. m : int Compute statistics for distribution on min over `m` samples. Must be ``>= 1``. Can be vectorized. alpha : float False positive rate we allow for CI, must be in (0, 1). Can be vectorized. Returns ------- estimate : dtype of `X`, scalar Best estimate on `q` quantile on min over `m` iid samples. LB : dtype of `X`, scalar Lower end on CI UB : dtype of `X`, scalar Upper end on CI """ # X and alpha used/checked below in quantile_CI routine. # We could robustify things to allow the edge cases, but maybe later assert np.all(0 < q) and np.all(q < 1) # Could check int but if someone wants to interp, we will let them. assert np.all(m >= 1) # Currently don't support broadcasting both at same time assert np.ndim(X) == 1 or (np.ndim(q) == 0 and np.ndim(q) == 0 and np.ndim(alpha) == 0) # This might have numerics issues for small q q = 1.0 - (1.0 - q) ** (1.0 / m) o_stats = order_stats(X) n = X.shape[-1] idx = _quantile(n, q) idx_lower, idx_upper = _quantile_CI(n, q, alpha=alpha) LB, UB = o_stats[..., idx_lower], o_stats[..., idx_upper] # Might need to broadcast estimate out if vectorization is in alpha estimate = ensure_shape(o_stats[..., idx], LB) return estimate, LB, UB
[docs]def quantile_and_CI(X, q, alpha=0.05): """Calculate CI on `q` quantile from same `X` using nonparametric estimation from order statistics. This will have alpha level of at most `alpha` due to the discrete nature of order statistics. Parameters ---------- X : :class:`numpy:numpy.ndarray` of shape (n,) Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything). q : float Quantile to compute, must be in (0, 1). Can be vectorized. alpha : float False positive rate we allow for CI, must be in (0, 1). Can be vectorized. Returns ------- estimate : dtype of `X`, scalar Empirical `q` quantile from sample `X`. LB : dtype of `X`, scalar Lower end on CI UB : dtype of `X`, scalar Upper end on CI """ # This routine is mostly just a wrapper routine estimate, LB, UB = max_quantile_CI(X, q=q, m=1, alpha=alpha) return estimate, LB, UB