Feat cloglog (#651)
* Add cloglog link function

Useful for discrete-time survival models (see the usage sketch below).
Somewhat related to #464.
More info: https://grodri.github.io/glms/notes/c7s6

* Improve numerical accuracy

* Fix tests (restrict test values)

* Add 'cloglog' to `GeneralizedLinearRegressor` opts

* Add changelog entry

* Apply suggestions from code review

Fix docstrings

Co-authored-by: Luca Bittarello <[email protected]>

* Add cloglog test against other packages

---------

Co-authored-by: Luca Bittarello <[email protected]>
MartinStancsicsQC and lbittarello authored Jun 12, 2023
1 parent b712a49 commit 5e94d17
Showing 7 changed files with 230 additions and 12 deletions.
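For orientation, a minimal usage sketch of the feature this commit adds. The data below is synthetic and purely illustrative; only ``GeneralizedLinearRegressor``, ``family="binomial"`` and ``link="cloglog"`` come from the changes themselves.

    import numpy as np
    from glum import GeneralizedLinearRegressor

    # Synthetic binary outcomes generated from a cloglog model (illustrative only).
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    eta = -1.0 + X @ np.array([0.5, -0.25, 0.1])
    y = rng.binomial(1, 1.0 - np.exp(-np.exp(eta)))  # P(y = 1) = cloglog^{-1}(eta)

    model = GeneralizedLinearRegressor(family="binomial", link="cloglog", alpha=0)
    model.fit(X, y)
    print(model.intercept_, model.coef_)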
7 changes: 7 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,13 @@
Changelog
=========

2.6.0 - UNRELEASED
------------------

**New feature**

- Added the complementary log-log (`cloglog`) link function.

2.5.2 - 2023-06-02
------------------

3 changes: 2 additions & 1 deletion src/glum/__init__.py
@@ -13,7 +13,7 @@
)
from ._glm import GeneralizedLinearRegressor, get_family, get_link
from ._glm_cv import GeneralizedLinearRegressorCV
from ._link import IdentityLink, Link, LogitLink, LogLink, TweedieLink
from ._link import CloglogLink, IdentityLink, Link, LogitLink, LogLink, TweedieLink

try:
__version__ = pkg_resources.get_distribution(__name__).version
@@ -35,6 +35,7 @@
"LogitLink",
"LogLink",
"TweedieLink",
"CloglogLink",
"GeneralizedLinearRegressor",
"GeneralizedLinearRegressorCV",
"get_family",
15 changes: 9 additions & 6 deletions src/glum/_glm.py
@@ -50,7 +50,7 @@
TweedieDistribution,
guess_intercept,
)
from ._link import IdentityLink, Link, LogitLink, LogLink, TweedieLink
from ._link import CloglogLink, IdentityLink, Link, LogitLink, LogLink, TweedieLink
from ._solvers import (
IRLSData,
_cd_solver,
@@ -482,11 +482,14 @@ def get_link(link: Union[str, Link], family: ExponentialDispersionModel) -> Link
return LogLink()
if link == "logit":
return LogitLink()
if link == "cloglog":
return CloglogLink()
if link[:7] == "tweedie":
return TweedieLink(float(link[7:]))
raise ValueError(
"The link must be an instance of class Link or an element of "
f"['auto', 'identity', 'log', 'logit', 'tweedie']; got (link={link})."
"['auto', 'identity', 'log', 'logit', 'cloglog', 'tweedie']; "
f"got (link={link})."
)


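A quick check of the new dispatch branch shown above (assuming ``get_family("binomial")`` is a valid family spec, as the package's exports suggest):

    from glum import CloglogLink, get_family, get_link

    # The "cloglog" string now resolves to the new link class.
    link = get_link("cloglog", get_family("binomial"))
    assert isinstance(link, CloglogLink)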
@@ -1964,10 +1967,10 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
specify it in parentheses (e.g., ``'tweedie (1.5)'``). The same applies
for ``'negative.binomial'`` and theta parameter.
link : {'auto', 'identity', 'log', 'logit'} or Link, optional (default='auto')
The link function of the GLM, i.e. mapping from linear predictor
(``X * coef``) to expectation (``mu``). Option ``'auto'`` sets the link
depending on the chosen family as follows:
link : {'auto', 'identity', 'log', 'logit', 'cloglog'} or Link, optional (default='auto')
The link function of the GLM, i.e. mapping from linear
predictor (``X * coef``) to expectation (``mu``). Option ``'auto'`` sets
the link depending on the chosen family as follows:
- ``'identity'`` for family ``'normal'``
- ``'log'`` for families ``'poisson'``, ``'gamma'``,
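Per the ``Union[str, Link]`` annotation on ``get_link`` and the "or Link" wording in the docstring above, the link can be passed either as the new string or as a link instance. A small sketch (the estimator arguments are illustrative, not from the diff):

    from glum import CloglogLink, GeneralizedLinearRegressor

    # Equivalent ways to request the complementary log-log link.
    m_str = GeneralizedLinearRegressor(family="binomial", link="cloglog")
    m_obj = GeneralizedLinearRegressor(family="binomial", link=CloglogLink())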
8 changes: 4 additions & 4 deletions src/glum/_glm_cv.py
@@ -74,10 +74,10 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase):
specify it in parentheses (e.g., ``'tweedie (1.5)'``). The same applies
for ``'negative.binomial'`` and theta parameter.
link : {'auto', 'identity', 'log', 'logit'} or Link, optional (default='auto')
The link function of the GLM, i.e. mapping from linear predictor
(``X * coef``) to expectation (``mu``). Option ``'auto'`` sets the link
depending on the chosen family as follows:
link : {'auto', 'identity', 'log', 'logit', 'cloglog'} or Link, optional (default='auto')
The link function of the GLM, i.e. mapping from linear
predictor (``X * coef``) to expectation (``mu``). Option ``'auto'`` sets
the link depending on the chosen family as follows:
- ``'identity'`` for family ``'normal'``
- ``'log'`` for families ``'poisson'``, ``'gamma'``,
89 changes: 89 additions & 0 deletions src/glum/_link.py
@@ -405,3 +405,92 @@ def inverse_derivative2(self, lin_pred):
        result = _asanyarray(lin_pred) ** ((2 * self.p - 1) / (1 - self.p))
        result *= self.p / (1 - self.p) ** 2
        return result


class CloglogLink(Link):
    """The complementary log-log link function ``log(-log(1 - p))``."""

    def __eq__(self, other):  # noqa D
        return isinstance(other, self.__class__)

    def link(self, mu):
        """Get the cloglog function of ``mu``.

        See superclass documentation.

        Parameters
        ----------
        mu: array-like

        Returns
        -------
        numpy.ndarray
        """
        mu = _asanyarray(mu)
        return np.log(-np.log1p(-mu))

    def derivative(self, mu):
        """Get the derivative of the cloglog link.

        See superclass documentation.

        Parameters
        ----------
        mu: array-like

        Returns
        -------
        array-like
        """
        mu = _asanyarray(mu)
        return 1.0 / ((mu - 1) * (np.log1p(-mu)))

    def inverse(self, lin_pred):
        """Get the inverse of the cloglog link.

        See superclass documentation.

        Note: since passing a very large value might result in an output of one,
        this function bounds the output to be between ``[50*eps, 1 - 50*eps]``,
        where ``eps`` is floating point epsilon.

        Parameters
        ----------
        lin_pred: array-like

        Returns
        -------
        array-like
        """
        lin_pred = _asanyarray(lin_pred)
        inv_cloglog = -np.expm1(-np.exp(lin_pred))
        eps50 = 50 * np.finfo(inv_cloglog.dtype).eps
        if np.any(inv_cloglog > 1 - eps50) or np.any(inv_cloglog < eps50):
            warnings.warn(
                "Computing the inverse cloglog gave results too close to 0 or 1. "
                "Clipping."
            )
            return np.clip(inv_cloglog, eps50, 1 - eps50)
        return inv_cloglog

    def inverse_derivative(self, lin_pred):
        """Compute the derivative of the inverse link function ``h'(lin_pred)``.

        Parameters
        ----------
        lin_pred : array, shape (n_samples,)
            Usually the (fitted) linear predictor.
        """
        lin_pred = _asanyarray(lin_pred)
        return np.exp(lin_pred - np.exp(lin_pred))

    def inverse_derivative2(self, lin_pred):
        """Compute the second derivative of the inverse link function ``h''(lin_pred)``.

        Parameters
        ----------
        lin_pred : array, shape (n_samples,)
            Usually the (fitted) linear predictor.
        """
        lin_pred = _asanyarray(lin_pred)
        # h(x) = 1 - exp(-exp(x)), so h''(x) = exp(x - exp(x)) * (1 - exp(x)).
        # TODO: check if numerical stability can be improved
        return -np.exp(lin_pred - np.exp(lin_pred)) * np.expm1(lin_pred)
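For reference, the identities these methods implement (standard complementary log-log algebra, written out as a cross-check rather than taken from the diff), with g the link and h = g^{-1} its inverse:

    g(\mu) = \log\bigl(-\log(1 - \mu)\bigr), \qquad
    g'(\mu) = \frac{1}{(\mu - 1)\,\log(1 - \mu)}

    h(\eta) = 1 - e^{-e^{\eta}}, \qquad
    h'(\eta) = e^{\eta - e^{\eta}}, \qquad
    h''(\eta) = e^{\eta - e^{\eta}}\,\bigl(1 - e^{\eta}\bigr)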
114 changes: 114 additions & 0 deletions tests/glm/test_glm.py
@@ -1105,6 +1105,120 @@ def obj(coef):
    assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4)


def test_binomial_cloglog_enet():
    """Test elastic net regression with binomial family and cloglog link.

    Compare to R's glmnet.
    """
    # library(glmnet)
    # options(digits=10)
    # df <- data.frame(a=c(1,2,3,4,5,6), b=c(0,0,0,0,1, 1), y=c(0,0,1,0,1,1))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(
    #     x=x, y=as.factor(y), lambda=1, alpha=0.5, intercept=TRUE,
    #     family=binomial(link=cloglog), standardize=FALSE, thresh=1e-10
    # )
    # coef(fit)
    # s1
    # (Intercept) -0.9210348370
    # a            0.1743465641
    # b            .
    glmnet_intercept = -0.9210348370
    glmnet_coef = [0.1743465641, 0.0]
    X = np.array([[1, 2, 3, 4, 5, 6], [0, 0, 0, 0, 1, 1]], dtype="float").T
    y = np.array([0, 0, 1, 0, 1, 1])
    rng = np.random.RandomState(42)
    glm = GeneralizedLinearRegressor(
        alpha=1,
        l1_ratio=0.5,
        family="binomial",
        link="cloglog",
        solver="irls-cd",
        gradient_tol=1e-8,
        selection="random",
        random_state=rng,
    )
    glm.fit(X, y)
    # Note: we use a fairly generous tolerance here, but we may well be closer to
    # the truth than glmnet. In the unregularized case, we are certainly closer
    # to both statsmodels and stats::glm than glmnet is.
    assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-3)
    assert_allclose(glm.coef_, glmnet_coef, rtol=1e-3)

    # same for start_params='zero' and selection='cyclic'
    # with reduced precision
    glm = GeneralizedLinearRegressor(
        alpha=1,
        l1_ratio=0.5,
        family="binomial",
        link="cloglog",
        solver="irls-cd",
        gradient_tol=1e-8,
        selection="cyclic",
    )
    glm.fit(X, y)
    assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-3)
    assert_allclose(glm.coef_, glmnet_coef, rtol=1e-3)

    # check warm_start, therefore start with different alpha
    glm = GeneralizedLinearRegressor(
        alpha=0.005,
        l1_ratio=0.5,
        family="binomial",
        max_iter=300,
        link="cloglog",
        solver="irls-cd",
        gradient_tol=1e-8,
        selection="cyclic",
    )
    glm.fit(X, y)
    # warm start with original alpha and use of sparse matrices
    glm.warm_start = True
    glm.alpha = 1
    X = sparse.csr_matrix(X)
    glm.fit(X, y)
    assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-3)
    assert_allclose(glm.coef_, glmnet_coef, rtol=1e-3)

@pytest.mark.parametrize("solver", ["irls-cd", "irls-ls"])
def test_binomial_cloglog_unregularized(solver):
    """Test unregularized regression with binomial family and cloglog link.

    Compare to statsmodels GLM.
    """
    n_samples = 500
    rng = np.random.RandomState(42)
    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=6,
        n_informative=5,
        n_redundant=0,
        n_repeated=0,
        random_state=rng,
    )
    X1 = sm.add_constant(X)
    sm_glm = sm.GLM(y, X1, family=sm.families.Binomial(sm.families.links.CLogLog()))
    sm_fit = sm_glm.fit()

    glum_glm = GeneralizedLinearRegressor(
        alpha=0,
        family="binomial",
        link="cloglog",
        solver=solver,
        gradient_tol=1e-8,
        selection="random",
        random_state=rng,
    )
    glum_glm.fit(X, y)

    assert_allclose(glum_glm.intercept_, sm_fit.params[0], rtol=2e-5)
    assert_allclose(glum_glm.coef_, sm_fit.params[1:], rtol=2e-5)


@pytest.mark.parametrize("alpha", [0.01, 0.1, 1, 10])
def test_binomial_enet(alpha):
"""Test elastic net regression with binomial family and LogitLink.
6 changes: 5 additions & 1 deletion tests/glm/test_link.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest

from glum._link import Link, LogitLink, LogLink, TweedieLink
from glum._link import CloglogLink, Link, LogitLink, LogLink, TweedieLink


@pytest.mark.parametrize("link", Link.__subclasses__())
@@ -19,6 +19,10 @@ def test_link_properties(link):
# careful for large x, note expit(36) = 1
# limit max eta to 15
x = x / 100 * 15
if isinstance(link, CloglogLink):
# limit max eta to 3
# also check negative values as link is not symmetric
x = x / 100 * 6 - 3

np.testing.assert_allclose(link.link(link.inverse(x)), x)
# if f(g(x)) = x, then f'(g(x)) = 1/g'(x)
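A standalone version of the round-trip property exercised above, restricted to the narrower range used for the cloglog link (the values and tolerances here are illustrative, not part of the test suite):

    import numpy as np
    from glum._link import CloglogLink

    link = CloglogLink()
    eta = np.linspace(-3.0, 3.0, 13)
    # inverse() maps eta to mu = 1 - exp(-exp(eta)); link() maps it back.
    np.testing.assert_allclose(link.link(link.inverse(eta)), eta, rtol=1e-7, atol=1e-10)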
