Conformal Predictive Systems¶

`online_cp.CPS.RidgePredictionMachine` ¶

Bases: ConformalPredictiveSystem

Conformal predictive system based on ridge regression ([ALRW2 §7.1]).

The Ridge Prediction Machine (RPM) is the predictive-distribution analogue of `online_cp.regressors.ConformalRidgeRegressor`: it uses the studentised ridge residual as its conformity measure and, because that score is monotone in the hypothesised label, inverts it in closed form to produce a full conformal predictive distribution rather than a single interval.

Parameters:

Name	Type	Description	Default
`a`	`float`	Ridge regularisation parameter (default 0).	`0`
`warnings`	`bool`	Whether to raise warnings on rank-deficient matrices (default True).	`True`
`autotune`	`bool`	Whether to automatically tune the ridge parameter via GCV (default False).	`False`
`verbose`	`int`	Verbosity level (default 0).	`0`
`epsilon`	`float`	Default significance level (default 0.1).	`default_epsilon`

Source code in src/online_cp/CPS.py

class RidgePredictionMachine(ConformalPredictiveSystem):
    """Conformal predictive system based on ridge regression ([ALRW2 §7.1]).

    The Ridge Prediction Machine (RPM) is the predictive-distribution analogue
    of :class:`~online_cp.regressors.ConformalRidgeRegressor`: it uses the
    studentised ridge residual as its conformity measure and, because that score
    is monotone in the hypothesised label, inverts it in closed form to produce
    a full conformal predictive distribution rather than a single interval.

    Parameters
    ----------
    a : float, optional
        Ridge regularisation parameter (default 0).
    warnings : bool, optional
        Whether to raise warnings on rank-deficient matrices (default True).
    autotune : bool, optional
        Whether to automatically tune the ridge parameter via GCV (default False).
    verbose : int, optional
        Verbosity level (default 0).
    epsilon : float, optional
        Default significance level (default 0.1).
    """

    _SAVE_PARAMS: tuple = ("a", "warnings", "autotune", "verbose", "epsilon")
    _SAVE_STATE: tuple = ("X", "y", "p", "Id", "XTXinv")

    def __init__(self, a=0, warnings=True, autotune=False, verbose=0, epsilon=default_epsilon):
        super().__init__(epsilon=epsilon)
        self.a = a

        # Learned state; stays None until examples have been learned.
        self.X = None  # matrix of objects, one row per example
        self.y = None  # vector of labels
        self.p = None  # number of features (columns of X)
        self.Id = None  # p x p identity, cached for the ridge term
        self.XTXinv = None  # (X^T X + a I)^{-1}

        # Should we raise warnings
        self.warnings = warnings
        # Tune the ridge parameter by GCV when an initial training set is learned
        self.autotune = autotune

        self.verbose = verbose

    @staticmethod
    def _sherman_morrison(XTXinv, x):
        """Return ``(M + x x^T)^{-1}`` given ``XTXinv = M^{-1}``.

        A new array is returned and ``XTXinv`` is left untouched, so an
        inverse that is shared with a caller (for example through a
        ``precomputed`` dictionary) is never modified behind its back.
        """
        return XTXinv - (XTXinv @ np.outer(x, x) @ XTXinv) / (1 + x.T @ XTXinv @ x)

    def learn_initial_training_set(self, X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None:
        """Learn a whole training set at once.

        Stores ``X`` and ``y`` and computes ``(X^T X + a I)^{-1}``, either
        directly or, when ``autotune`` is set, after tuning ``a`` by GCV.
        """
        self.X = X
        self.y = y
        self.p = X.shape[1]
        self.Id = np.identity(self.p)
        if self.autotune:
            self._tune_ridge_parameter()
        else:
            self.XTXinv = np.linalg.inv(self.X.T @ self.X + self.a * self.Id)

    def learn_one(self, x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None:
        """
        Learn a single example. If we have already computed X and XTXinv, use them for update. Then the last row of X is the object with label y.

        ``precomputed`` is a dictionary with the keys ``"X"`` and ``"XTXinv"``
        (as returned by ``predict_cpd(..., return_update=True)``); either value
        may be None, in which case the corresponding state is kept or updated
        from ``x``.

        The inverse is not computed when the very first example is learned
        without ``precomputed``; it is first computed on the second example.

        >>> cps = RidgePredictionMachine()
        >>> cps.learn_one(np.array([1, 0]), 1)
        >>> cps.X
        array([[1, 0]])
        >>> cps.y
        array([1])
        """
        # Learn label y
        if self.y is None:
            self.y = np.array([y])
        else:
            self.y = np.append(self.y, y)

        if precomputed is not None:
            X = precomputed["X"]
            XTXinv = precomputed["XTXinv"]

            if X is not None:
                self.X = X
                self.p = self.X.shape[1]
                self.Id = np.identity(self.p)

            if XTXinv is not None:
                self.XTXinv = XTXinv
            elif self.X.shape[0] == 1:
                self.XTXinv = np.linalg.inv(self.X.T @ self.X + self.a * self.Id)
            else:
                # Update (X^T X + aI)^{-1} via Sherman-Morrison (out of place)
                self.XTXinv = self._sherman_morrison(self.XTXinv, x)

                # Check the rank
                if self.warnings:
                    self.check_matrix_rank(self.XTXinv)
            return

        # Learn object x
        if self.X is None:
            self.X = x.reshape(1, -1)
            self.p = self.X.shape[1]
            self.Id = np.identity(self.p)
        elif self.X.shape[0] == 1:
            self.X = np.append(self.X, x.reshape(1, -1), axis=0)
            self.XTXinv = np.linalg.inv(self.X.T @ self.X + self.a * self.Id)
        else:
            self.X = np.append(self.X, x.reshape(1, -1), axis=0)
            # Update (X^T X + aI)^{-1} via Sherman-Morrison (out of place)
            self.XTXinv = self._sherman_morrison(self.XTXinv, x)

            # Check the rank
            if self.warnings:
                self.check_matrix_rank(self.XTXinv)

    def change_ridge_parameter(self, a):
        """
        Change the ridge parameter and, if objects have been learned,
        recompute ``(X^T X + a I)^{-1}`` from scratch.

        >>> cps = RidgePredictionMachine()
        >>> cps.learn_one(np.array([1, 0]), 1)
        >>> cps.change_ridge_parameter(1)
        >>> cps.a
        1
        """
        self.a = a
        if self.X is not None:
            self.XTXinv = np.linalg.inv(self.X.T @ self.X + self.a * self.Id)

    def check_matrix_rank(self, M):
        """
        Check if a matrix has full rank <==> is invertible
        Returns False if matrix is rank deficient
        NOTE In numerical linear algebra it is a bit more subtle. The condition number can tell us more.

        A warning is emitted for a rank-deficient matrix only when
        ``self.warnings`` is true.

        >>> cps = RidgePredictionMachine(warnings=False)
        >>> cps.check_matrix_rank(np.array([[1, 0], [1, 0]]))
        False
        >>> cps.check_matrix_rank(np.array([[1, 0], [0, 1]]))
        True
        """
        if np.linalg.matrix_rank(M) < M.shape[0]:
            if self.warnings:
                warnings.warn(
                    f"The matrix X is rank deficient. Condition number: {np.linalg.cond(M)}. Consider changing the ridge parameter",
                    stacklevel=2,
                )
            return False
        return True

    def _tune_ridge_parameter(self, a0=None):
        """
        Tune ridge parameter with Generalized cross validation https://pages.stat.wisc.edu/~wahba/stat860public/pdf1/golub.heath.wahba.pdf

        ``a0`` is the starting point of the minimisation (default 1e-6). The
        minimiser found is installed through ``change_ridge_parameter``.
        """
        XTX = self.X.T @ self.X
        n = self.X.shape[0]
        In = np.identity(n)

        def GCV(a):
            # GCV score of ridge parameter a; inf marks an unusable candidate.
            try:
                A = self.X @ np.linalg.inv(XTX + a * self.Id) @ self.X.T
                max_diag_H = np.max(np.diag(A))  # Maximum diagonal element of the hat matrix
                if max_diag_H > 1:
                    return np.inf
                return (1 / n) * np.linalg.norm((In - A) @ self.y) ** 2 / ((1 / n) * np.trace(In - A)) ** 2
            except (np.linalg.LinAlgError, ZeroDivisionError):
                return np.inf

        # Initial guess
        if a0 is None:
            a0 = 1e-6  # Small perturbation to avoid numerical issues

        # Lower bound keeps the ridge parameter strictly positive (a >= 1e-6)
        res = minimize(
            GCV, x0=a0, bounds=Bounds(lb=1e-6, keep_feasible=True)
        )  # May be relevant to pass some arguments here, or even use another minimizer.
        a = res.x[0]

        if self.verbose > 0:
            print(f"New ridge parameter: {a}")
        self.change_ridge_parameter(a)

    def predict_cpd(self, x, return_update=False):
        """Compute the conformal predictive distribution for the object ``x``.

        Parameters
        ----------
        x : array
            The test object (one feature vector).
        return_update : bool, optional
            If True, also return a dictionary with the augmented object matrix
            (``"X"``) and the updated inverse (``"XTXinv"``) that can be passed
            to ``learn_one`` as ``precomputed``.

        Returns
        -------
        RidgePredictiveDistributionFunction, or a tuple of it and the
        ``precomputed`` dictionary when ``return_update`` is True.

        Raises
        ------
        ValueError
            If no inverse ``(X^T X + a I)^{-1}`` is available yet.
        """
        if self.X is None or self.XTXinv is None:
            # Without this guard the arithmetic below fails on None with an
            # opaque NumPy error.
            raise ValueError(
                "Training set is too small for ridge prediction: "
                "(X^T X + aI)^{-1} has not been computed yet"
            )

        # Add row to X matrix
        X = np.append(self.X, x.reshape(1, -1), axis=0)
        n = X.shape[0]
        y = self.y

        # Update XTX_inv via Sherman-Morrison formula
        XTXinv = self._sherman_morrison(self.XTXinv, x)

        # Efficient computation avoiding full O(n²d) hat matrix.
        # Only compute the diagonal, last row, and H[:-1,:-1]@y in O(nd²).
        XTXinv_x = XTXinv @ x  # (d,)   — O(d²)
        h = np.sum((X @ XTXinv) * X, axis=1)  # diag(H) — O(nd²)
        h_last_row = X[:-1] @ XTXinv_x  # H[-1, :-1] — O(nd)
        Hy = X[:-1] @ (XTXinv @ (X[:-1].T @ y))  # H[:-1,:-1] @ y — O(nd + d²)

        sqrt_one_minus_h = np.sqrt(1 - h[:-1])
        A = np.dot(h_last_row, y) / np.sqrt(1 - h[-1]) + (y - Hy) / sqrt_one_minus_h
        B = np.sqrt(1 - h[-1]) * np.ones(n - 1) + h_last_row / sqrt_one_minus_h

        # Sorted critical values A/B, padded with -inf and +inf.
        C = np.zeros(n + 1)
        C[1:-1] = A / B
        C[0] = -np.inf
        C[-1] = np.inf
        C.sort()

        cpd = RidgePredictiveDistributionFunction(C=C, epsilon=self.epsilon)

        if return_update:
            precomputed = {
                "X": X,  # The updated matrix of objects
                "XTXinv": XTXinv,  # The updated inverse
            }
            return cpd, precomputed
        return cpd

`learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None` ¶

Learn a single example. If we have already computed X and XTXinv, use them for update. Then the last row of X is the object with label y.

>>> cps = RidgePredictionMachine()
>>> cps.learn_one(np.array([1, 0]), 1)
>>> cps.X
array([[1, 0]])
>>> cps.y
array([1])

Source code in src/online_cp/CPS.py

def learn_one(self, x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None:
    """
    Learn a single example. If we have already computed X and XTXinv, use them for update. Then the last row of X is the object with label y.
    >>> cps = RidgePredictionMachine()
    >>> cps.learn_one(np.array([1, 0]), 1)
    >>> cps.X
    array([[1, 0]])
    >>> cps.y
    array([1])
    """
    # Learn label y
    if self.y is None:
        self.y = np.array([y])
    else:
        self.y = np.append(self.y, y)

    if precomputed is not None:
        X = precomputed["X"]
        XTXinv = precomputed["XTXinv"]

        if X is not None:
            self.X = X
            self.p = self.X.shape[1]
            self.Id = np.identity(self.p)

        if XTXinv is not None:
            self.XTXinv = XTXinv

        else:
            if self.X.shape[0] == 1:
                self.XTXinv = np.linalg.inv(self.X.T @ self.X + self.a * self.Id)
            else:
                # Update (X^T X + aI)^{-1} via Sherman-Morrison
                self.XTXinv -= (self.XTXinv @ np.outer(x, x) @ self.XTXinv) / (1 + x.T @ self.XTXinv @ x)

                # Check the rank
                if self.warnings:
                    self.check_matrix_rank(self.XTXinv)

    else:
        # Learn object x
        if self.X is None:
            self.X = x.reshape(1, -1)
            self.p = self.X.shape[1]
            self.Id = np.identity(self.p)
        elif self.X.shape[0] == 1:
            self.X = np.append(self.X, x.reshape(1, -1), axis=0)
            self.XTXinv = np.linalg.inv(self.X.T @ self.X + self.a * self.Id)
        else:
            self.X = np.append(self.X, x.reshape(1, -1), axis=0)
            # Update (X^T X + aI)^{-1} via Sherman-Morrison
            self.XTXinv -= (self.XTXinv @ np.outer(x, x) @ self.XTXinv) / (1 + x.T @ self.XTXinv @ x)

            # Check the rank
            if self.warnings:
                self.check_matrix_rank(self.XTXinv)

`change_ridge_parameter(a)` ¶

Change the ridge parameter

>>> cps = RidgePredictionMachine()
>>> cps.learn_one(np.array([1, 0]), 1)
>>> cps.change_ridge_parameter(1)
>>> cps.a
1

Source code in src/online_cp/CPS.py

def change_ridge_parameter(self, a):
    """
    Set a new ridge parameter.

    When objects have already been learned, the inverse
    ``(X^T X + a I)^{-1}`` is recomputed from scratch with the new value;
    otherwise only ``self.a`` changes.

    >>> cps = RidgePredictionMachine()
    >>> cps.learn_one(np.array([1, 0]), 1)
    >>> cps.change_ridge_parameter(1)
    >>> cps.a
    1
    """
    self.a = a
    if self.X is None:
        # Nothing learned yet, so there is no inverse to refresh.
        return
    gram = self.X.T @ self.X
    regularised = gram + self.a * self.Id
    self.XTXinv = np.linalg.inv(regularised)

`check_matrix_rank(M)` ¶

Check if a matrix has full rank <==> is invertible Returns False if matrix is rank deficient NOTE In numerical linear algebra it is a bit more subtle. The condition number can tell us more.

>>> cps = RidgePredictionMachine(warnings=False)
>>> cps.check_matrix_rank(np.array([[1, 0], [1, 0]]))
False
>>> cps.check_matrix_rank(np.array([[1, 0], [0, 1]]))
True

Source code in src/online_cp/CPS.py

def check_matrix_rank(self, M):
    """
    Report whether ``M`` has full rank (i.e. is invertible).

    Returns True for a full-rank matrix and False for a rank-deficient one.
    In the rank-deficient case a warning quoting the condition number is
    emitted when ``self.warnings`` is true.
    NOTE Numerically the picture is more subtle than a rank test; the
    condition number is the more informative quantity.

    >>> cps = RidgePredictionMachine(warnings=False)
    >>> cps.check_matrix_rank(np.array([[1, 0], [1, 0]]))
    False
    >>> cps.check_matrix_rank(np.array([[1, 0], [0, 1]]))
    True
    """
    rank_deficient = np.linalg.matrix_rank(M) < M.shape[0]
    if not rank_deficient:
        return True

    if self.warnings:
        # The condition number is only evaluated when it will be reported.
        warnings.warn(
            f"The matrix X is rank deficient. Condition number: {np.linalg.cond(M)}. Consider changing the ridge parameter",
            stacklevel=2,
        )
    return False

`online_cp.CPS.KernelRidgePredictionMachine` ¶

Bases: ConformalPredictiveSystem

This conformal predictive system uses the "studentised residuals" as conformity measure. Algorithm 7.3 in Algorithmic Learning in a Random World (2nd edition).

Source code in src/online_cp/CPS.py

class KernelRidgePredictionMachine(ConformalPredictiveSystem):
    """
    This conformal predictive system uses the "studentised residuals" as conformity measure.
    Algorithm 7.3 in Algorithmic Learning in a Random World (2nd edition).

    Parameters
    ----------
    kernel : callable
        Called as ``kernel(X)`` for the kernel matrix of the rows of ``X`` and
        as ``kernel(X, x)`` for the kernel values between rows of ``X`` and ``x``.
    a : float, optional
        Ridge regularisation parameter (default 0).
    autotune : bool, optional
        Whether to tune the ridge parameter via GCV when an initial training
        set is learned (default False).
    verbose : int, optional
        Verbosity level (default 0).
    epsilon : float, optional
        Default significance level.
    """

    _SAVE_PARAMS: tuple = ("kernel", "a", "autotune", "verbose", "epsilon")
    _SAVE_STATE: tuple = ("X", "y", "K", "Kinv", "h_diag", "Hy")
    _SAVE_CALLABLES: tuple = ("kernel",)

    def __init__(self, kernel, a=0, autotune=False, verbose=0, epsilon=default_epsilon):
        super().__init__(epsilon=epsilon)

        self.kernel = kernel

        self.a = a

        # Learned state. Every name in _SAVE_STATE is initialised here so that
        # the attributes exist (as None) before any example has been learned.
        self.X = None  # objects, one row per example
        self.y = None  # labels
        self.K = None  # kernel matrix of X
        self.Kinv = None  # (K + a I)^{-1}
        self.h_diag = None  # diagonal of the hat matrix H = K (K + a I)^{-1}
        self.Hy = None  # H @ y

        # Tune the ridge parameter by GCV when an initial training set is learned
        self.autotune = autotune

        self.verbose = verbose

    def _refresh_hat_cache(self):
        """Recompute ``Kinv``, ``h_diag`` and ``Hy`` from ``K``, ``a`` and ``y``."""
        Id = np.identity(self.X.shape[0])
        self.Kinv = np.linalg.inv(self.K + self.a * Id)
        H = self.K @ self.Kinv
        self.h_diag = H.diagonal().copy()
        self.Hy = H @ self.y

    def learn_initial_training_set(self, X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None:
        """Learn a whole training set at once and build all cached quantities."""
        self.X = X
        self.y = y
        self.K = self.kernel(self.X)
        if self.autotune:
            self._tune_ridge_parameter()
        else:
            self._refresh_hat_cache()

    def _tune_ridge_parameter(self, a0=None):
        """
        Tune ridge parameter with Generalized Cross Validation (GCV) in the kernel space.

        ``a0`` is the starting point of the minimisation (default 1e-6). The
        minimiser found is installed through ``change_ridge_parameter``.
        """
        n = self.K.shape[0]
        In = np.identity(n)

        def GCV(a):
            # GCV score of ridge parameter a; inf marks an unusable candidate.
            try:
                A = self.K @ np.linalg.inv(self.K + a * In)
                max_diag_H = np.max(np.diag(A))  # Maximum diagonal element of the hat matrix
                if max_diag_H > 1:
                    return np.inf
                return (1 / n) * np.linalg.norm((In - A) @ self.y) ** 2 / ((1 / n) * np.trace(In - A)) ** 2
            except (np.linalg.LinAlgError, ZeroDivisionError):
                return np.inf

        # Initial guess
        if a0 is None:
            a0 = 1e-6  # Small perturbation to avoid numerical issues

        # Lower bound keeps the ridge parameter strictly positive (a >= 1e-6)
        res = minimize(GCV, x0=a0, bounds=Bounds(lb=1e-6, keep_feasible=True))
        a = res.x[0]

        if self.verbose > 0:
            print(f"New ridge parameter: {a}")
        self.change_ridge_parameter(a)

    def change_ridge_parameter(self, a):
        """
        Change the ridge parameter and recompute cached intermediates.
        """
        self.a = a
        if self.X is not None:
            self.K = self.kernel(self.X)
            self._refresh_hat_cache()

    @staticmethod
    def _update_Kinv(Kinv, k, d):
        """Block-inverse update of ``Kinv`` for one appended row/column ``k``."""
        return np.block([[Kinv + d * Kinv @ k @ k.T @ Kinv, -d * Kinv @ k], [-d * k.T @ Kinv, d]])

    @staticmethod
    def _update_K(K, k, kappa):
        """Append the column ``k`` and diagonal entry ``kappa`` to ``K``."""
        return np.block([[K, k], [k.T, kappa]])

    def _rank_one_intermediates(self, x):
        """Compute the quantities needed to extend the cached state by ``x``.

        ``x`` must already be two-dimensional. Returns a dictionary with the
        keys ``"v"`` (``Kinv @ k``), ``"d"``, ``"k"`` and ``"kappa"`` — the
        same layout ``learn_one`` accepts as ``precomputed``.
        """
        k = self.kernel(self.X, x).reshape(-1, 1)
        kappa = self.kernel(x, x)
        d_val = (1 / (kappa + self.a - k.T @ self.Kinv @ k)).item()
        v = self.Kinv @ k  # (n, 1)
        return {"v": v, "d": d_val, "k": k, "kappa": kappa}

    def learn_one(self, x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None:
        """
        Learn a single example

        ``precomputed`` may be the dictionary returned by
        ``predict_cpd(x, return_update=True)`` for the same ``x``; the
        intermediates are then reused instead of being recomputed.
        """
        x = np.atleast_2d(x)
        # Learn label y
        if self.y is None:
            self.y = np.array([y])
        else:
            self.y = np.append(self.y, y)

        if precomputed is None:
            if self.X is None:
                # First example: build every cached quantity from scratch.
                self.X = x.reshape(1, -1)
                self.K = self.kernel(self.X)
                self._refresh_hat_cache()
                return
            precomputed = self._rank_one_intermediates(x)

        v = precomputed["v"]
        d_val = precomputed["d"]
        k = precomputed["k"]
        kappa = precomputed["kappa"]
        a_d = self.a * d_val

        # h_diag_new[:-1] = h_diag_old - a_d * v^2
        # h_diag_new[-1] = 1 - a_d  (= d*kappa - d*k^T v = d*(kappa - k^T Kinv k))
        v_flat = v.ravel()
        new_h_last = (d_val * kappa - d_val * (k.T @ v)).item()
        self.h_diag = np.append(self.h_diag - a_d * v_flat**2, new_h_last)

        # Hy_new[:-1] = Hy_old - a_d * v * (v^T @ y_old) + a_d * v * y_new
        #             = Hy_old + a_d * v * (y_new - v^T @ y_old)
        # Hy_new[-1]  = a_d * (v^T @ y_old) + new_h_last * y_new
        # But y_new is the label we just appended: self.y[-1] = y
        y_old = self.y[:-1]
        vTy_old = float(v_flat @ y_old)
        self.Hy = np.append(self.Hy + a_d * v_flat * (y - vTy_old), a_d * vTy_old + new_h_last * y)

        self.K = self._update_K(self.K, k, kappa)
        self.Kinv = self._update_Kinv(self.Kinv, k, d_val)
        self.X = np.append(self.X, x.reshape(1, -1), axis=0)

    def predict_cpd(self, x, return_update=False):
        """Compute the conformal predictive distribution for the object ``x``.

        Parameters
        ----------
        x : array
            The test object.
        return_update : bool, optional
            If True, also return the dictionary of intermediates (keys
            ``"v"``, ``"d"``, ``"k"``, ``"kappa"``) that ``learn_one`` accepts
            as ``precomputed``.

        Returns
        -------
        RidgePredictiveDistributionFunction, or a tuple of it and the
        ``precomputed`` dictionary when ``return_update`` is True.

        Raises
        ------
        ValueError
            If nothing has been learned yet.
        """
        if self.X is None or self.Kinv is None:
            raise ValueError("Training set is too small for kernel ridge prediction: no examples learned yet")

        x = np.atleast_2d(x)
        # Quantities for the kernel matrix augmented by x (state is not modified)
        update = self._rank_one_intermediates(x)
        v = update["v"]
        d_val = update["d"]
        k = update["k"]
        kappa = update["kappa"]
        a_d = self.a * d_val
        y = self.y

        v_flat = v.ravel()

        # Given v, the rest is O(n) — avoid forming full (n+1)×(n+1) hat matrix.
        # H_new diagonal: h_diag_new[:-1] = h_diag_old - a_d * v^2, h_new[-1] = d*kappa - d*k^T*v
        h_train = self.h_diag - a_d * v_flat**2
        h_last = (d_val * kappa - d_val * (k.T @ v)).item()

        # H_new last row (= last col by symmetry): H[-1, :-1] = a_d * v^T
        h_last_row = a_d * v_flat

        # H_new[:-1,:-1] @ y = Hy_old - a_d * v * (v^T @ y)
        Hy_train = self.Hy - a_d * v_flat * float(v_flat @ y)

        n = len(y) + 1  # augmented size

        sqrt_one_minus_h = np.sqrt(1 - h_train)
        A = np.dot(h_last_row, y) / np.sqrt(1 - h_last) + (y - Hy_train) / sqrt_one_minus_h
        B = np.sqrt(1 - h_last) * np.ones(n - 1) + h_last_row / sqrt_one_minus_h

        # Sorted critical values A/B, padded with -inf and +inf.
        C = np.zeros(n + 1)
        C[1:-1] = A / B
        C[0] = -np.inf
        C[-1] = np.inf
        assert not np.isnan(C).any(), "C contains NaN values"
        C.sort()

        cpd = RidgePredictiveDistributionFunction(C=C, epsilon=self.epsilon)

        if return_update:
            return cpd, update
        return cpd

`change_ridge_parameter(a)` ¶

Change the ridge parameter and recompute cached intermediates.

Source code in src/online_cp/CPS.py

def change_ridge_parameter(self, a):
    """
    Set a new ridge parameter and rebuild every cached quantity.

    When objects have been learned, the kernel matrix ``K``, the inverse
    ``Kinv = (K + a I)^{-1}``, the diagonal of the hat matrix ``h_diag`` and
    the product ``Hy`` are all recomputed; otherwise only ``self.a`` changes.
    """
    self.a = a
    if self.X is None:
        # Nothing learned yet, so there is nothing to recompute.
        return

    n_examples = self.X.shape[0]
    self.K = self.kernel(self.X)
    self.Kinv = np.linalg.inv(self.K + self.a * np.identity(n_examples))
    hat = self.K @ self.Kinv
    self.h_diag = hat.diagonal().copy()
    self.Hy = hat @ self.y

`learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None` ¶

Learn a single example

Source code in src/online_cp/CPS.py

def learn_one(self, x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None:
    """
    Learn a single example
    """
    x = np.atleast_2d(x)
    # Learn label y
    if self.y is None:
        self.y = np.array([y])
    else:
        self.y = np.append(self.y, y)

    if precomputed is not None:
        # Incremental update of h_diag and Hy from precomputed intermediates
        v = precomputed["v"]
        d_val = precomputed["d"]
        k = precomputed["k"]
        kappa = precomputed["kappa"]
        a_d = self.a * d_val

        # h_diag_new[:-1] = h_diag_old - a_d * v^2
        # h_diag_new[-1] = 1 - a_d  (= d*kappa - d*k^T v = d*(kappa - k^T Kinv k))
        v_flat = v.ravel()
        new_h_last = (d_val * kappa - d_val * (k.T @ v)).item()
        self.h_diag = np.append(self.h_diag - a_d * v_flat**2, new_h_last)

        # Hy_new[:-1] = Hy_old - a_d * v * (v^T @ y_old) + a_d * v * y_new
        #             = Hy_old + a_d * v * (y_new - v^T @ y_old)
        # Hy_new[-1]  = a_d * (v^T @ y_old) + new_h_last * y_new
        # But y_new is the label we just appended: self.y[-1] = y
        y_old = self.y[:-1]
        vTy_old = float(v_flat @ y_old)
        self.Hy = np.append(self.Hy + a_d * v_flat * (y - vTy_old), a_d * vTy_old + new_h_last * y)

        self.K = self._update_K(self.K, k, kappa)
        self.Kinv = self._update_Kinv(self.Kinv, k, d_val)
        self.X = np.append(self.X, x.reshape(1, -1), axis=0)
    else:
        if self.X is None:
            self.X = x.reshape(1, -1)
            Id = np.identity(self.X.shape[0])
            self.K = self.kernel(self.X)
            self.Kinv = np.linalg.inv(self.K + self.a * Id)
            H = self.K @ self.Kinv
            self.h_diag = H.diagonal().copy()
            self.Hy = H @ self.y
        else:
            k = self.kernel(self.X, x).reshape(-1, 1)
            kappa = self.kernel(x, x)
            d_val = (1 / (kappa + self.a - k.T @ self.Kinv @ k)).item()
            a_d = self.a * d_val

            # Compute v = Kinv @ k
            v = self.Kinv @ k  # (n, 1)
            v_flat = v.ravel()

            # Incremental update of h_diag and Hy
            new_h_last = (d_val * kappa - d_val * (k.T @ v)).item()
            self.h_diag = np.append(self.h_diag - a_d * v_flat**2, new_h_last)

            y_old = self.y[:-1]
            vTy_old = float(v_flat @ y_old)
            self.Hy = np.append(self.Hy + a_d * v_flat * (y - vTy_old), a_d * vTy_old + new_h_last * y)

            self.K = self._update_K(self.K, k, kappa)
            self.Kinv = self._update_Kinv(self.Kinv, k, d_val)
            self.X = np.append(self.X, x.reshape(1, -1), axis=0)

`online_cp.CPS.NearestNeighboursPredictionMachine` ¶

Bases: ConformalPredictiveSystem

Source code in src/online_cp/CPS.py

class NearestNeighboursPredictionMachine(ConformalPredictiveSystem):
    """Conformal predictive system based on k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of nearest neighbours.
    distance : str, optional
        Metric name passed to scipy's ``pdist``/``cdist`` by the default
        distance function (default "euclidean"). Set to "custom" when
        ``distance_func`` is given.
    distance_func : callable, optional
        Custom distance function, called as ``distance_func(X)`` for the
        pairwise distance matrix of the rows of ``X`` and as
        ``distance_func(X, x)`` for the distances from the rows of ``X`` to ``x``.
    epsilon : float, optional
        Default significance level.
    """

    _SAVE_PARAMS: tuple = ("k", "distance", "distance_func", "epsilon")
    _SAVE_STATE: tuple = ("X", "y", "D")
    _SAVE_CALLABLES: tuple = ("distance_func",)
    _PARAM_MAP: dict = {"distance_func": "_distance_func_arg"}

    def __init__(
        self,
        k,
        distance="euclidean",
        distance_func=None,
        epsilon=default_epsilon,
    ):
        super().__init__(epsilon=epsilon)

        self.k = k

        self.distance = distance
        if distance_func is None:
            self.distance_func = self._standard_distance_func
        else:
            self.distance_func = distance_func
            self.distance = "custom"
        # Keep the raw constructor argument (None when the default is used).
        self._distance_func_arg = distance_func

        # Learned state: objects, labels and pairwise distance matrix.
        self.X = None
        self.y = None
        self.D = None

    def _standard_distance_func(self, X, y=None):
        """
        By default we use scipy to compute distances

        With one argument the square pairwise distance matrix of the rows of
        ``X`` is returned; with two, the distances between rows of ``X`` and
        rows of ``y``.
        """
        X = np.atleast_2d(X)
        if y is None:
            dists = squareform(pdist(X, metric=self.distance))
        else:
            y = np.atleast_2d(y)
            dists = cdist(X, y, metric=self.distance)
        return dists

    def learn_initial_training_set(self, X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None:
        """
        Learn a whole training set at once and compute its distance matrix.

        >>> cps = NearestNeighboursPredictionMachine(k=3)
        >>> X = np.array([[1], [2]])
        >>> y = np.array([1, 2])
        >>> cps.learn_initial_training_set(X, y)
        >>> cps.X
        array([[1],
               [2]])
        >>> cps.y
        array([1, 2])
        >>> cps.D
        array([[0., 1.],
               [1., 0.]])
        """
        self.X = X
        self.D = self.distance_func(X)
        self.y = y

    @staticmethod
    def update_distance_matrix(D, d):
        """Return ``D`` extended by the distance column ``d`` and a zero diagonal entry."""
        return np.block([[D, d], [d.T, np.array([0])]])

    def learn_one(self, x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None:
        """
        Learn a single example.

        precomputed is a dictionary
        {
            'X': X,
            'D': D,
        }
        holding the already augmented object matrix and distance matrix; when
        given, both are adopted as they are.

        >>> cps = NearestNeighboursPredictionMachine(k=3)
        >>> X = np.array([[1], [2]])
        >>> y = np.array([1, 2])
        >>> cps.learn_initial_training_set(X, y)
        >>> cps.learn_one(np.array([3]), 3)
        >>> cps.y
        array([1, 2, 3])
        >>> cps.X
        array([[1],
               [2],
               [3]])
        >>> cps.D
        array([[0., 1., 2.],
               [1., 0., 1.],
               [2., 1., 0.]])
        """
        # Learn label y
        if self.y is None:
            self.y = np.array([y])
        else:
            self.y = np.append(self.y, y)

        if precomputed is None:
            # Learn object x
            if self.X is None:
                self.X = x.reshape(1, -1)
                self.D = self.distance_func(self.X)
            else:
                d = self.distance_func(self.X, x)
                self.D = self.update_distance_matrix(self.D, d)
                self.X = np.append(self.X, x.reshape(1, -1), axis=0)
        else:
            self.X = precomputed["X"]
            self.D = precomputed["D"]

    def predict_cpd(self, x, return_update=False):
        """
        Compute the conformal predictive distribution for the object ``x``.

        When ``return_update`` is True, a dictionary with the augmented
        object matrix (``'X'``) and distance matrix (``'D'``) is returned as
        well; it can be passed to ``learn_one`` as ``precomputed``.

        Raises ``ValueError`` when nothing has been learned or when the
        training set does not contain more than ``k`` examples.

        >>> import numpy as np
        >>> rnd_gen = np.random.default_rng(2024)
        >>> X = rnd_gen.normal(loc=0, scale=1, size=(100, 4))
        >>> beta = np.array([2, 1, 0, 0])
        >>> Y = X @ beta + rnd_gen.normal(loc=0, scale=1, size=100)
        >>> cps = NearestNeighboursPredictionMachine(k=3)
        >>> cps.learn_initial_training_set(X, Y)
        >>> x = rnd_gen.normal(loc=0, scale=1, size=(1, 4))
        >>> cpd = cps.predict_cpd(x)
        >>> cpd.L
        array([0.        , 0.        , 0.18811881, 0.53465347, 0.76237624])
        >>> cpd.U
        array([0.17821782, 0.18811881, 0.54455446, 0.76237624, 1.        ])
        """
        # An untrained model (X is None) is the extreme case of a training
        # set that is too small; report it the same way.
        if self.X is None or self.X.shape[0] <= self.k:
            raise ValueError("Training set is too small for k-NN prediction")

        # Temporarily update the distance matrix
        d = self.distance_func(self.X, x)
        D = self.update_distance_matrix(self.D, d)
        y = np.append(self.y, -np.inf)  # Initialise label as -inf

        # Find all neighbours and semi-neighbours
        # Use argpartition for O(n) selection of k+1 smallest, then sort them
        # for deterministic tie-breaking consistent with full argsort.
        top_k1 = np.argpartition(D, self.k + 1, axis=0)[: self.k + 1]
        for col in range(D.shape[1]):
            candidates = top_k1[:, col]
            order = np.argsort(D[candidates, col])
            top_k1[:, col] = candidates[order]
        k_nearest = top_k1[1:]  # skip self (distance=0, always first after sort)

        n = self.X.shape[0]  # index of the test object in the augmented data

        full_neighbours = set()
        single_neighbours = set()
        semi_neighbours = set()

        k_nearest_of_n = set(k_nearest.T[-1])

        for i, col in enumerate(k_nearest.T):
            i_is_neighbour_of_n = i in k_nearest_of_n
            n_is_neighbour_of_i = n in col
            if i_is_neighbour_of_n and n_is_neighbour_of_i:
                full_neighbours.add(i)
            elif i_is_neighbour_of_n:
                single_neighbours.add(i)
            elif n_is_neighbour_of_i:
                semi_neighbours.add(i)

        neighbours = full_neighbours | single_neighbours
        full_or_semi = full_neighbours | semi_neighbours
        idx_all_neighbours_and_semi_neighbours = np.array(
            sorted(full_neighbours | single_neighbours | semi_neighbours)
        )

        # Line 1
        Kprime = len(idx_all_neighbours_and_semi_neighbours)
        # Line 2 and 3
        Y = np.zeros(shape=Kprime + 2)
        Y[0] = -np.inf
        Y[-1] = np.inf
        Y[1:-1] = y[idx_all_neighbours_and_semi_neighbours]
        # Map a position in Y (1..Kprime) back to the example's index
        idx_mem = {i: idx_all_neighbours_and_semi_neighbours[i - 1] for i in range(1, Kprime + 1)}
        sorted_indices = np.argsort(Y)[1:-1]
        Y.sort()

        # Line 4: conformity scores and histogram
        Alpha = np.array([(y[k_nearest.T[i]] <= y_i).sum() for i, y_i in enumerate(y)])
        N = np.array([(Alpha == score).sum() for score in range(self.k + 1)])

        # Line 5
        L = -np.inf * np.ones(Kprime + 1)
        U = -np.inf * np.ones(Kprime + 1)
        L[0] = 0
        U[0] = N[0] / (n + 1)

        # Line 6
        for step in range(1, Kprime + 1):
            idx = idx_mem[sorted_indices[step - 1]]
            if idx in neighbours:
                # The test object's score goes up by one
                N[Alpha[-1]] -= 1
                Alpha[-1] += 1
                N[Alpha[-1]] += 1
            if idx in full_or_semi:
                # The score of example idx goes down by one
                N[Alpha[idx]] -= 1
                Alpha[idx] -= 1
                N[Alpha[idx]] += 1
            L[step] = N[: Alpha[-1]].sum() / (n + 1) if Alpha[-1] != 0 else 0
            U[step] = N[: Alpha[-1] + 1].sum() / (n + 1) if Alpha[-1] != 0 else N[0] / (n + 1)

        # Line 12
        cpd = NearestNeighboursPredictiveDistributionFunction(L, U, Y, epsilon=self.epsilon)

        if return_update:
            X = np.append(self.X, x.reshape(1, -1), axis=0)
            return cpd, {"X": X, "D": D}
        return cpd

`learn_initial_training_set(X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None` ¶

>>> cps = NearestNeighboursPredictionMachine(k=3)
>>> X = np.array([[1], [2]])
>>> y = np.array([1, 2])
>>> cps.learn_initial_training_set(X, y)
>>> cps.X
array([[1],
       [2]])
>>> cps.y
array([1, 2])
>>> cps.D
array([[0., 1.],
       [1., 0.]])

Source code in src/online_cp/CPS.py

def learn_initial_training_set(self, X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None:
    """Store an initial batch of examples together with their distance matrix.

    The objects and labels are stored as given (no copy is made here), and
    the pairwise distance matrix is obtained by a single call to
    ``self.distance_func`` on the objects.

    Parameters
    ----------
    X : array
        Training objects; stored as ``self.X``.
    y : array
        Training labels; stored as ``self.y``.

    Examples
    --------
    >>> cps = NearestNeighboursPredictionMachine(k=3)
    >>> X = np.array([[1], [2]])
    >>> y = np.array([1, 2])
    >>> cps.learn_initial_training_set(X, y)
    >>> cps.X
    array([[1],
           [2]])
    >>> cps.y
    array([1, 2])
    >>> cps.D
    array([[0., 1.],
           [1., 0.]])
    """
    objects, labels = X, y

    # Objects first, then the distances derived from them, then the labels.
    self.X = objects
    self.D = self.distance_func(objects)
    self.y = labels

`learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None` ¶

Learn a single example.

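`precomputed` is an optional dictionary of the form `{'X': X, 'D': D}`, such as the one returned by `predict_cpd(x, return_update=True)`; when it is given, its `X` and `D` are stored directly instead of being recomputed.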

>>> cps = NearestNeighboursPredictionMachine(k=3)
>>> X = np.array([[1], [2]])
>>> y = np.array([1, 2])
>>> cps.learn_initial_training_set(X, y)
>>> cps.learn_one(np.array([3]), 3)
>>> cps.y
array([1, 2, 3])
>>> cps.X
array([[1],
       [2],
       [3]])
>>> cps.D
array([[0., 1., 2.],
       [1., 0., 1.],
       [2., 1., 0.]])

Source code in src/online_cp/CPS.py

def learn_one(self, x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None:
    """Add one labelled example to the training set.

    The label is always appended to ``self.y``. The object matrix and the
    distance matrix are either taken from ``precomputed`` or updated from
    ``x`` using ``self.distance_func`` and ``self.update_distance_matrix``.

    Parameters
    ----------
    x : array
        The new object; reshaped to a single row before being stored.
    y : float
        The label of the new object.
    precomputed : dict, optional
        A dictionary ``{'X': X, 'D': D}`` holding the already-extended object
        matrix and distance matrix. When given, both are stored as they are
        and ``x`` is not used.

    Examples
    --------
    >>> cps = NearestNeighboursPredictionMachine(k=3)
    >>> X = np.array([[1], [2]])
    >>> y = np.array([1, 2])
    >>> cps.learn_initial_training_set(X, y)
    >>> cps.learn_one(np.array([3]), 3)
    >>> cps.y
    array([1, 2, 3])
    >>> cps.X
    array([[1],
           [2],
           [3]])
    >>> cps.D
    array([[0., 1., 2.],
           [1., 0., 1.],
           [2., 1., 0.]])
    """
    # The label is recorded first, whichever way the object is handled.
    self.y = np.array([y]) if self.y is None else np.append(self.y, y)

    # Caller supplied the extended matrices: adopt them verbatim.
    if precomputed is not None:
        self.X = precomputed["X"]
        self.D = precomputed["D"]
        return

    # Very first object: it forms the whole training set on its own.
    if self.X is None:
        self.X = x.reshape(1, -1)
        self.D = self.distance_func(self.X)
        return

    # General case: distances to the stored objects are computed before the
    # new row is appended to the object matrix.
    distances_to_new = self.distance_func(self.X, x)
    self.D = self.update_distance_matrix(self.D, distances_to_new)
    self.X = np.append(self.X, x.reshape(1, -1), axis=0)

`predict_cpd(x, return_update=False)` ¶

>>> import numpy as np
>>> rnd_gen = np.random.default_rng(2024)
>>> X = rnd_gen.normal(loc=0, scale=1, size=(100, 4))
>>> beta = np.array([2, 1, 0, 0])
>>> Y = X @ beta + rnd_gen.normal(loc=0, scale=1, size=100)
>>> cps = NearestNeighboursPredictionMachine(k=3)
>>> cps.learn_initial_training_set(X, Y)
>>> x = rnd_gen.normal(loc=0, scale=1, size=(1, 4))
>>> cpd = cps.predict_cpd(x)
>>> cpd.L
array([0.        , 0.        , 0.18811881, 0.53465347, 0.76237624])
>>> cpd.U
array([0.17821782, 0.18811881, 0.54455446, 0.76237624, 1.        ])

Source code in src/online_cp/CPS.py

def predict_cpd(self, x, return_update=False):
    """Compute the k-nearest-neighbours predictive distribution for ``x``.

    The test object is appended (locally) to the distance matrix with a
    placeholder label of ``-inf``. The method then finds which training
    examples are among the test object's ``k`` nearest neighbours and which
    have the test object among theirs, and sweeps over the sorted labels of
    those examples to fill the lower and upper bounds ``L`` and ``U``.

    Parameters
    ----------
    x : array
        The test object; passed to ``self.distance_func`` together with
        ``self.X``.
    return_update : bool, optional
        If true, also return the extended object and distance matrices so
        that they can be handed back to ``learn_one`` as ``precomputed``.

    Returns
    -------
    cpd : NearestNeighboursPredictiveDistributionFunction
        Built from ``L``, ``U``, the sorted label grid ``Y`` and
        ``self.epsilon``.
    (cpd, update) : tuple
        Returned instead of ``cpd`` when ``return_update`` is true;
        ``update`` is ``{"X": X, "D": D}``.

    Raises
    ------
    ValueError
        If the training set holds at most ``k`` examples.

    Examples
    --------
    >>> import numpy as np
    >>> rnd_gen = np.random.default_rng(2024)
    >>> X = rnd_gen.normal(loc=0, scale=1, size=(100, 4))
    >>> beta = np.array([2, 1, 0, 0])
    >>> Y = X @ beta + rnd_gen.normal(loc=0, scale=1, size=100)
    >>> cps = NearestNeighboursPredictionMachine(k=3)
    >>> cps.learn_initial_training_set(X, Y)
    >>> x = rnd_gen.normal(loc=0, scale=1, size=(1, 4))
    >>> cpd = cps.predict_cpd(x)
    >>> cpd.L
    array([0.        , 0.        , 0.18811881, 0.53465347, 0.76237624])
    >>> cpd.U
    array([0.17821782, 0.18811881, 0.54455446, 0.76237624, 1.        ])
    """
    # NOTE(review): the "Line N" markers below presumably refer to numbered
    # steps of the published algorithm's pseudo-code -- confirm the reference.

    # Temporarily update the distance matrix
    # (D is a local; self.D is not reassigned in this method.)
    # More than k training examples are required: every example, including
    # the test object, needs k neighbours other than itself.
    if self.X.shape[0] <= self.k:
        raise ValueError("Training set is too small for k-NN prediction")
    d = self.distance_func(self.X, x)
    D = self.update_distance_matrix(self.D, d)
    # The test object gets index n (the last position) in y and D.
    y = np.append(self.y, -np.inf)  # Initialise label as -inf

    # Find all neighbours and semi-neighbours
    # Use argpartition for O(n) selection of k+1 smallest, then sort them
    # for deterministic tie-breaking consistent with full argsort.
    top_k1 = np.argpartition(D, self.k + 1, axis=0)[: self.k + 1]
    for col in range(D.shape[1]):
        # Order the k+1 candidates of this column by increasing distance.
        idx = top_k1[:, col]
        order = np.argsort(D[idx, col])
        top_k1[:, col] = idx[order]
    # Column j of k_nearest lists the k nearest neighbours of example j.
    k_nearest = top_k1[1:]  # skip self (distance=0, always first after sort)

    n = self.X.shape[0]

    # Partition the training examples by their relation to the test object n:
    #   full   -- i is among n's k nearest AND n is among i's k nearest
    #   single -- i is among n's k nearest only
    #   semi   -- n is among i's k nearest only
    full_neighbours = set()
    single_neighbours = set()
    semi_neighbours = set()

    # Last column of k_nearest belongs to the test object.
    k_nearest_of_n = set(k_nearest.T[-1])

    for i, col in enumerate(k_nearest.T):
        i_is_neighbour_of_n = i in k_nearest_of_n
        n_is_neighbour_of_i = n in col
        if i_is_neighbour_of_n and n_is_neighbour_of_i:
            full_neighbours.add(i)
        elif i_is_neighbour_of_n:
            single_neighbours.add(i)
        elif n_is_neighbour_of_i:
            semi_neighbours.add(i)

    # neighbours: examples that affect the test object's own score.
    # full_or_semi: examples whose score is affected by the test label.
    neighbours = full_neighbours | single_neighbours
    full_or_semi = full_neighbours | semi_neighbours
    idx_all_neighbours_and_semi_neighbours = np.array(
        sorted(full_neighbours | single_neighbours | semi_neighbours)
    )

    # Line 1
    Kprime = len(idx_all_neighbours_and_semi_neighbours)
    # Line 2 and 3
    # Y holds the labels of the relevant examples, padded with -inf / +inf.
    Y = np.zeros(shape=Kprime + 2)
    Y[0] = -np.inf
    Y[-1] = np.inf
    Y[1:-1] = y[idx_all_neighbours_and_semi_neighbours]
    # Maps a 1-based position in the (still unsorted) Y to the example index.
    idx_mem = {i: idx_all_neighbours_and_semi_neighbours[i - 1] for i in range(1, Kprime + 1)}
    # Positions of the interior entries in increasing label order; the first
    # and last argsort entries are dropped as the two infinite sentinels
    # (assumes the stored labels themselves are finite).
    sorted_indices = np.argsort(Y)[1:-1]
    Y.sort()

    # Line 4: conformity scores and histogram
    # Alpha[i]: number of i's k nearest neighbours whose label is <= y[i];
    # Alpha[-1] is the score of the test object (label currently -inf).
    Alpha = np.array([(y[k_nearest.T[i]] <= y_i).sum() for i, y_i in enumerate(y)])
    # N[j]: number of examples (test object included) with Alpha == j.
    N = np.array([(Alpha == k).sum() for k in range(self.k + 1)])

    # Line 5
    # L and U get one entry per interval between consecutive entries of Y.
    L = -np.inf * np.ones(Kprime + 1)
    U = -np.inf * np.ones(Kprime + 1)
    L[0] = 0
    U[0] = N[0] / (n + 1)

    # Line 6
    # Sweep over the relevant examples in increasing label order, updating
    # the scores and their histogram in place at each step.
    for k in range(1, Kprime + 1):
        idx = idx_mem[sorted_indices[k - 1]]
        if idx in neighbours:
            # One more of the test object's neighbours now counts towards
            # its score: move the test object up one histogram bin.
            N[Alpha[-1]] -= 1
            Alpha[-1] += 1
            N[Alpha[-1]] += 1
        if idx in full_or_semi:
            # The test object no longer counts towards idx's score: move
            # idx down one histogram bin.
            N[Alpha[idx]] -= 1
            Alpha[idx] -= 1
            N[Alpha[idx]] += 1
        # L: fraction of the n + 1 scores strictly below the test score;
        # U: fraction at or below it. The explicit Alpha[-1] == 0 branches
        # give the same values the slice expressions would give.
        L[k] = N[: Alpha[-1]].sum() / (n + 1) if Alpha[-1] != 0 else 0
        U[k] = N[: Alpha[-1] + 1].sum() / (n + 1) if Alpha[-1] != 0 else N[0] / (n + 1)

    # Line 12
    cpd = NearestNeighboursPredictiveDistributionFunction(L, U, Y, epsilon=self.epsilon)

    if return_update:
        # Same {"X", "D"} keys that learn_one reads from `precomputed`.
        X = np.append(self.X, x.reshape(1, -1), axis=0)
        return cpd, {"X": X, "D": D}
    else:
        return cpd

`online_cp.CPS.DempsterHillConformalPredictiveSystem` ¶

Bases: ConformalPredictiveSystem

Label-only conformal predictive system (Dempster–Hill).

The simplest CPS: it ignores the objects \(x\) entirely and builds the predictive distribution from the labels alone, placing the test label among the sorted training labels. This is the conformalised version of Hill's \(A_{(n)}\) assumption / Dempster's direct probabilities, and serves as a distribution-free baseline (the predictive distribution of an exchangeable sequence with no covariate information). See [ALRW2 §7.5].

Source code in src/online_cp/CPS.py

class DempsterHillConformalPredictiveSystem(ConformalPredictiveSystem):
    r"""Label-only conformal predictive system (Dempster–Hill).

    The simplest CPS: it ignores the objects $x$ entirely and builds the
    predictive distribution from the labels alone, placing the test label among
    the sorted training labels. This is the conformalised version of Hill's
    $A_{(n)}$ assumption / Dempster's direct probabilities, and serves as a
    distribution-free baseline (the predictive distribution of an exchangeable
    sequence with no covariate information). See [ALRW2 §7.5].

    Attributes
    ----------
    y : array or None
        The labels learned so far; ``None`` until the first ``learn_*`` call.
    """

    _SAVE_PARAMS: tuple = ("epsilon",)
    _SAVE_STATE: tuple = ("y",)

    def __init__(self, epsilon=default_epsilon):
        """Create a Dempster–Hill CPS.

        Only the labels are used: the ``learn_*`` methods of this class take
        labels alone and have no object argument.

        Parameters
        ----------
        epsilon : float, default 0.1
            Default significance level.
        """
        super().__init__(epsilon=epsilon)
        self.y = None

    def learn_initial_training_set(self, y):
        """Replace the stored labels with ``y`` (stored as given, no copy)."""
        self.y = y

    def learn_one(self, y):
        """Append a single label to the stored labels.

        Works on a fresh instance too: the first label starts a new array
        instead of being appended to ``None``.
        """
        if self.y is None:
            # np.append(None, y) would create an object array holding None.
            self.y = np.array([y])
        else:
            self.y = np.append(self.y, y)

    def learn_many(self, y):
        """Append several labels to the stored labels.

        Works on a fresh instance too: the first batch becomes the stored
        labels, flattened the same way ``np.append`` flattens its inputs.
        """
        if self.y is None:
            # np.append(None, y) would create an object array holding None.
            self.y = np.asarray(y).ravel()
        else:
            self.y = np.append(self.y, y)

    def predict(self):
        """Return the predictive distribution; alias of :meth:`predict_cpd`."""
        return self.predict_cpd()

    def predict_cpd(self):
        """Build the predictive distribution from the stored labels.

        The labels are copied into a sorted grid padded with ``-inf`` and
        ``+inf``, which is handed to
        ``DempsterHillConformalPredictiveDistribution`` together with
        ``self.epsilon``.

        Raises
        ------
        AttributeError
            If no labels have been learned yet (``self.y`` is ``None``).
        """
        # Two extra slots for the infinite end points of the grid.
        Y = np.zeros(shape=self.y.shape[0] + 2)
        Y[0] = -np.inf
        Y[-1] = np.inf
        Y[1:-1] = self.y
        Y.sort()

        return DempsterHillConformalPredictiveDistribution(Y, epsilon=self.epsilon)

`__init__(epsilon=default_epsilon)` ¶

Create a Dempster–Hill CPS.

Only the labels are used: the `learn_*` methods of this class take labels alone and have no object argument.

Parameters:

Name	Type	Description	Default
`epsilon`	`float`	Default significance level.	`0.1`

Source code in src/online_cp/CPS.py

def __init__(self, epsilon=default_epsilon):
    """Create a Dempster–Hill CPS.

    Only the labels are used: the ``learn_*`` methods of this class take
    labels alone and have no object argument.

    Parameters
    ----------
    epsilon : float, default 0.1
        Default significance level; forwarded to the base-class constructor.
    """
    super().__init__(epsilon=epsilon)
    # No labels are stored until one of the learn_* methods is called.
    self.y = None

Conformal Predictive Systems¶

online_cp.CPS.RidgePredictionMachine ¶

learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None ¶

change_ridge_parameter(a) ¶

check_matrix_rank(M) ¶

online_cp.CPS.KernelRidgePredictionMachine ¶

change_ridge_parameter(a) ¶

learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None ¶

online_cp.CPS.NearestNeighboursPredictionMachine ¶

learn_initial_training_set(X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None ¶

learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None ¶

predict_cpd(x, return_update=False) ¶

online_cp.CPS.DempsterHillConformalPredictiveSystem ¶

__init__(epsilon=default_epsilon) ¶

`online_cp.CPS.RidgePredictionMachine` ¶

`learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None` ¶

`change_ridge_parameter(a)` ¶

`check_matrix_rank(M)` ¶

`online_cp.CPS.KernelRidgePredictionMachine` ¶

`change_ridge_parameter(a)` ¶

`learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None` ¶

`online_cp.CPS.NearestNeighboursPredictionMachine` ¶

`learn_initial_training_set(X: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]]) -> None` ¶

`learn_one(x: NDArray[np.floating[Any]], y: float, precomputed: dict[str, Any] | None = None) -> None` ¶

`predict_cpd(x, return_update=False)` ¶

`online_cp.CPS.DempsterHillConformalPredictiveSystem` ¶

`__init__(epsilon=default_epsilon)` ¶