Commit fcf2e61

Merge pull request #23 from gibsramen/fix-log-lik

Update log-likelihood & posterior predictive import to inference

gibsramen authored Mar 31, 2021
2 parents 4906aa7 + 1eb7938
Showing 15 changed files with 131 additions and 244 deletions.
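
The user-facing effect of this PR: the default models now build their own ``coords``/``dims`` and name their posterior predictive (``y_predict``) and log-likelihood (``log_lhood``) variables internally, so converting a fit to ``arviz`` collapses to a single call. A before/after sketch, assuming a fitted ``NegativeBinomial`` model ``nb`` as in the docs example:

.. code-block:: python

    # Before: callers had to spell out parameters, coords, and dims themselves
    inference = nb.to_inference_object(
        params=["beta", "phi"],
        coords={"feature": nb.feature_names, "covariate": nb.colnames},
        dims={"beta": ["covariate", "feature"], "phi": ["feature"]},
        alr_params=["beta"],
        posterior_predictive="y_predict",
        log_likelihood="log_lik",
        include_observed_data=True,
    )

    # After: the default models supply all of that internally
    inference = nb.to_inference_object()
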
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@
 *.hpp
 *.DS_Store
 birdman/templates/negative_binomial
+birdman/templates/multinomial
 birdman/templates/negative_binomial_single
 birdman/templates/negative_binomial_lme
 tests/custom_model
71 changes: 70 additions & 1 deletion birdman/default_models.py
@@ -1,6 +1,7 @@
 import os
 from pkg_resources import resource_filename
 
+import arviz as az
 import biom
 import numpy as np
 import pandas as pd
@@ -87,11 +88,45 @@ def __init__(
                          seed, parallelize_across)
 
         param_dict = {
+            "depth": np.log(table.sum(axis="sample")),  # sampling depths
             "B_p": beta_prior,
             "phi_s": cauchy_scale
         }
         self.add_parameters(param_dict)
 
+    def to_inference_object(self) -> az.InferenceData:
+        """Convert fitted Stan model into ``arviz`` InferenceData object.
+
+        :returns: ``arviz`` InferenceData object with selected values
+        :rtype: az.InferenceData
+        """
+        dims = {
+            "beta": ["covariate", "feature"],
+            "phi": ["feature"],
+            "log_lhood": ["tbl_sample", "feature"],
+            "y_predict": ["tbl_sample", "feature"]
+        }
+        coords = {
+            "covariate": self.colnames,
+            "feature": self.feature_names,
+            "tbl_sample": self.sample_names
+        }
+
+        # TODO: May want to allow not passing PP/LL/OD in the future
+        args = dict()
+        if self.parallelize_across == "chains":
+            args["alr_params"] = ["beta"]
+        inf = super().to_inference_object(
+            params=["beta", "phi"],
+            dims=dims,
+            coords=coords,
+            posterior_predictive="y_predict",
+            log_likelihood="log_lhood",
+            include_observed_data=True,
+            **args
+        )
+        return inf
 
 
 class NegativeBinomialLME(Model):
     """Fit count data using negative binomial model considering subject as
@@ -148,7 +183,7 @@ class NegativeBinomialLME(Model):
     :type cauchy_scale: float
     :param group_var_prior: Standard deviation for normally distributed prior
-        prior values of group_var, defaults to 1.0
+        values of group_var, defaults to 1.0
     :type group_var_prior: float
     """
     def __init__(
@@ -175,6 +210,7 @@ def __init__(
         self.groups = np.sort(group_var_series.unique())
 
         param_dict = {
+            "depth": np.log(table.sum(axis="sample")),  # sampling depths
             "B_p": beta_prior,
             "phi_s": cauchy_scale,
             "S": len(group_var_series.unique()),
@@ -183,6 +219,38 @@ def __init__(
         }
         self.add_parameters(param_dict)
 
+    def to_inference_object(self) -> az.InferenceData:
+        """Convert fitted Stan model into ``arviz`` InferenceData object.
+
+        :returns: ``arviz`` InferenceData object with selected values
+        :rtype: az.InferenceData
+        """
+        dims = {
+            "beta": ["covariate", "feature"],
+            "phi": ["feature"],
+            "subj_int": ["group", "feature"],
+            "log_lhood": ["tbl_sample", "feature"],
+            "y_predict": ["tbl_sample", "feature"]
+        }
+        coords = {
+            "covariate": self.colnames,
+            "feature": self.feature_names,
+            "tbl_sample": self.sample_names,
+            "group": self.groups
+        }
+
+        # TODO: May want to allow not passing PP/LL/OD in the future
+        inf = super().to_inference_object(
+            params=["beta", "phi"],
+            dims=dims,
+            coords=coords,
+            posterior_predictive="y_predict",
+            log_likelihood="log_lhood",
+            alr_params=["beta", "subj_int"],
+            include_observed_data=True,
+        )
+        return inf
 
 
 class Multinomial(Model):
     """Fit count data using serial multinomial model.
Expand Down Expand Up @@ -237,6 +305,7 @@ def __init__(
                          seed, parallelize_across="chains")
 
         param_dict = {
+            "depth": table.sum(axis="sample").astype(int),  # sampling depths
             "B_p": beta_prior,
         }
         self.add_parameters(param_dict)
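
As a usage sketch (not part of the diff): with these defaults wired in, fitting and converting the negative binomial model reduces to the following. ``table``, ``metadata``, and the ``"diet"`` formula are stand-ins from the docs example, and the constructor and fit method names are taken from the documented BIRDMAn API:

.. code-block:: python

    from birdman import NegativeBinomial

    # table: biom.Table, metadata: pd.DataFrame (assumed already loaded)
    nb = NegativeBinomial(table=table, formula="diet", metadata=metadata)
    nb.compile_model()
    nb.fit_model()

    # coords, dims, y_predict, and log_lhood are now handled internally
    inference = nb.to_inference_object()
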
4 changes: 0 additions & 4 deletions birdman/model_base.py
@@ -5,7 +5,6 @@
 import biom
 from cmdstanpy import CmdStanModel, CmdStanMCMC
 import dask
-import numpy as np
 import pandas as pd
 from patsy import dmatrix
 
@@ -73,7 +72,6 @@ def __init__(
"D": table.shape[0],
"N": table.shape[1], # number of samples
"p": self.dmat.shape[1], # number of covariates
"depth": np.log(table.sum(axis="sample")), # sampling depths
"x": self.dmat.values, # design matrix
}

@@ -206,15 +204,13 @@ def to_inference_object(
"dims": dims,
"posterior_predictive": posterior_predictive,
"log_likelihood": log_likelihood,
"sample_names": self.sample_names,
}
if isinstance(self.fit, CmdStanMCMC):
fit_to_inference = single_fit_to_inference
args["alr_params"] = alr_params
elif isinstance(self.fit, Sequence):
fit_to_inference = multiple_fits_to_inference
args["concatenation_name"] = concatenation_name
args["feature_names"] = self.feature_names
# TODO: Check that dims and concatenation_match

if alr_params is not None:
52 changes: 7 additions & 45 deletions birdman/model_util.py
@@ -17,7 +17,6 @@ def single_fit_to_inference(
     alr_params: Sequence[str] = None,
     posterior_predictive: str = None,
     log_likelihood: str = None,
-    sample_names: Sequence[str] = None,
 ) -> az.InferenceData:
     """Convert fitted Stan model into inference object.
@@ -44,30 +43,18 @@
     :param log_likelihood: Name of log likelihood values from Stan model
         to include in ``arviz`` InferenceData object
     :type log_likelihood: str, optional
-    :param sample_names: Sample names to label PP and/or LL
-    :type sample_names: Sequence[str]
     :returns: ``arviz`` InferenceData object with selected values
     :rtype: az.InferenceData
     """
     # remove alr params so initial dim fitting works
     new_dims = {k: v for k, v in dims.items() if k not in alr_params}
 
-    extra_dims = dict()
-    extra_coords = dict()
-    if log_likelihood is not None:
-        extra_dims.update({log_likelihood: ["tbl_sample", "feature"]})
-        extra_coords.update({"tbl_sample": sample_names})
-    if posterior_predictive is not None:
-        extra_dims.update({posterior_predictive: ["tbl_sample", "feature"]})
-        extra_coords.update({"tbl_sample": sample_names})
-
-    new_dims.update(extra_dims)
+    if log_likelihood is not None and log_likelihood not in dims:
+        raise KeyError("Must include dimensions for log-likelihood!")
+    if posterior_predictive is not None and posterior_predictive not in dims:
+        raise KeyError("Must include dimensions for posterior predictive!")
 
     inference = az.from_cmdstanpy(
         posterior=fit,
-        coords={**coords, **extra_coords},
+        coords=coords,
         log_likelihood=log_likelihood,
         posterior_predictive=posterior_predictive,
         dims=new_dims
@@ -120,10 +107,8 @@ def multiple_fits_to_inference(
     coords: dict,
     dims: dict,
     concatenation_name: str,
-    feature_names: Sequence[str],
     posterior_predictive: str = None,
     log_likelihood: str = None,
-    sample_names: Sequence[str] = None,
 ) -> az.InferenceData:
     """Save fitted parameters to xarray DataSet for multiple fits.
@@ -151,12 +136,6 @@
         to include in ``arviz`` InferenceData object
     :type log_likelihood: str, optional
-    :param sample_names: Sample names to label PP and/or LL
-    :type sample_names: Sequence[str]
-    :param feature_names: Feature names to label concatenation
-    :type feature_name: Sequence[str]
     :returns: ``arviz`` InferenceData object with selected values
     :rtype: az.InferenceData
     """
@@ -197,12 +176,10 @@
     group_dict = {"posterior": po_ds, "sample_stats": ss_ds}
 
     if log_likelihood is not None:
-        ll_ds = _concat_table_draws(log_likelihood, ll_list, sample_names,
-                                    "feature")
+        ll_ds = xr.concat(ll_list, concatenation_name)
         group_dict["log_likelihood"] = ll_ds
     if posterior_predictive is not None:
-        pp_ds = _concat_table_draws(posterior_predictive, pp_list,
-                                    sample_names, "feature")
+        pp_ds = xr.concat(pp_list, concatenation_name)
         group_dict["posterior_predictive"] = pp_ds
 
     all_group_inferences = []
@@ -232,18 +209,3 @@ def _drop_data(
             dims_to_drop.append(dim)
     new_dataset = new_dataset.drop_dims(dims_to_drop)
     return new_dataset
-
-
-def _concat_table_draws(
-    group: str,
-    da_list: Sequence[xr.DataArray],
-    sample_names: Sequence[str],
-    concatenation_name: str
-) -> xr.Dataset:
-    """For posterior predictive & log likelihood."""
-    ds = xr.concat(da_list, concatenation_name)
-    dim_name = f"{group}_dim_0"
-    ds = ds.rename_dims({dim_name: "tbl_sample"})
-    ds = ds.assign_coords({"tbl_sample": sample_names})
-    ds = ds.reset_coords([dim_name], drop=True)
-    return ds
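
With ``tbl_sample`` labeling handled upstream by the callers' ``dims``/``coords``, the per-feature draws can be stitched together with a plain ``xarray`` concat along the new ``"feature"`` dimension, which is all the removed helper was doing beyond relabeling. A toy sketch with made-up sizes:

.. code-block:: python

    import numpy as np
    import xarray as xr

    # one DataArray of log-likelihood draws per feature: (chain, draw, tbl_sample)
    ll_list = [
        xr.DataArray(np.random.randn(4, 100, 10),
                     dims=["chain", "draw", "tbl_sample"])
        for _ in range(3)
    ]

    # concat prepends the new dimension: (feature, chain, draw, tbl_sample)
    ll_ds = xr.concat(ll_list, "feature")
    print(ll_ds.sizes)  # {'feature': 3, 'chain': 4, 'draw': 100, 'tbl_sample': 10}
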
5 changes: 2 additions & 3 deletions birdman/templates/multinomial.stan
@@ -2,7 +2,7 @@ data {
   int<lower=0> N;     // number of samples
   int<lower=0> D;     // number of dimensions
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbes
+  int depth[N];       // sequencing depths of microbes
   matrix[N, p] x;     // covariate matrix
   int y[N, D];        // observed microbe abundances
   real<lower=0> B_p;  // stdev for Beta Normal prior
@@ -19,9 +19,8 @@ transformed parameters {
   vector[N] z;
   simplex[D] theta[N];
 
-  z = to_vector(rep_array(0, N));
   lam = x * beta;
-  lam_clr = append_col(z, lam);
+  lam_clr = append_col(to_vector(rep_array(0, N)), lam);
   for (n in 1:N){
     theta[n] = softmax(to_vector(lam_clr[n,]));
   }
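
The type change reflects how each default model consumes depths: the negative binomial templates take log depths as real-valued offsets on the log scale, while the multinomial template needs raw integer totals. A toy illustration of the corresponding ``param_dict`` entries from ``default_models.py``, with a made-up count matrix standing in for a ``biom.Table``:

.. code-block:: python

    import numpy as np

    counts = np.array([[10, 5, 0],
                       [3, 8, 2]])  # 2 samples x 3 features

    # negative binomial models: log sequencing depth, added to lam_clr as an offset
    depth_nb = np.log(counts.sum(axis=1))        # [log(15), log(13)]

    # multinomial model: integer totals, consumed directly by the likelihood
    depth_mult = counts.sum(axis=1).astype(int)  # [15, 13]
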
10 changes: 4 additions & 6 deletions birdman/templates/negative_binomial.stan
@@ -2,7 +2,7 @@ data {
   int<lower=0> N;     // number of samples
   int<lower=0> D;     // number of dimensions
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbes
+  real depth[N];      // log sequencing depths of microbes
   matrix[N, p] x;     // covariate matrix
   int y[N, D];        // observed microbe abundances
   real<lower=0> B_p;  // stdev for Beta Normal prior
@@ -18,16 +18,14 @@
 transformed parameters {
   matrix[N, D-1] lam;
   matrix[N, D] lam_clr;
-  vector[N] z;
   vector<lower=0>[D] phi;
 
   for (i in 1:D){
     phi[i] = 1. / reciprocal_phi[i];
   }
 
-  z = to_vector(rep_array(0, N));
   lam = x * beta;
-  lam_clr = append_col(z, lam);
+  lam_clr = append_col(to_vector(rep_array(0, N)), lam);
 }
 
 model {
@@ -50,12 +48,12 @@

 generated quantities {
   matrix[N, D] y_predict;
-  matrix[N, D] log_lik;
+  matrix[N, D] log_lhood;
 
   for (n in 1:N){
     for (i in 1:D){
       y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]);
-      log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
+      log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
     }
   }
 }
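
One payoff of exporting the log-likelihood into the ``InferenceData`` object is model assessment with standard ``arviz`` tooling. A minimal sketch, assuming ``inference`` is the object returned by ``to_inference_object()``:

.. code-block:: python

    import arviz as az

    # PSIS-LOO cross-validation computed from the saved log_lhood draws;
    # az.loo reads the log_likelihood group populated during conversion
    loo = az.loo(inference)
    print(loo)
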
6 changes: 3 additions & 3 deletions birdman/templates/negative_binomial_lme.stan
@@ -3,7 +3,7 @@ data {
   int<lower=0> S;     // number of groups (subjects)
   int<lower=0> D;     // number of dimensions
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbes
+  real depth[N];      // log sequencing depths of microbes
   matrix[N, p] x;     // covariate matrix
   int<lower=1, upper=S> subj_ids[N];  // mapping of samples to subject IDs
   int y[N, D];        // observed microbe abundances
@@ -58,12 +58,12 @@ model {

 generated quantities {
   matrix[N, D] y_predict;
-  matrix[N, D] log_lik;
+  matrix[N, D] log_lhood;
 
   for (n in 1:N){
     for (i in 1:D){
       y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]);
-      log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
+      log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
     }
   }
 }
6 changes: 3 additions & 3 deletions birdman/templates/negative_binomial_single.stan
@@ -1,7 +1,7 @@
 data {
   int<lower=0> N;     // number of samples
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbe
+  real depth[N];      // log sequencing depths of microbe
   matrix[N, p] x;     // covariate matrix
   int y[N];           // observed microbe abundances
   real<lower=0> B_p;  // stdev for Beta Normal prior
@@ -35,11 +35,11 @@ model {
 }
 
 generated quantities {
-  vector[N] log_lik;
+  vector[N] log_lhood;
   vector[N] y_predict;
 
   for (n in 1:N){
     y_predict[n] = neg_binomial_2_log_rng(depth[n] + lam[n], phi);
-    log_lik[n] = neg_binomial_2_log_lpmf(y[n] | depth[n] + lam[n], phi);
+    log_lhood[n] = neg_binomial_2_log_lpmf(y[n] | depth[n] + lam[n], phi);
   }
 }
18 changes: 1 addition & 17 deletions docs/default_model_example.rst
@@ -65,25 +65,9 @@
 Now we have our parameter estimates which we can use in downstream analyses. Many of BIRDMAn's included analysis & visualization functions take an ``arviz.InferenceData`` object. We provide a simple method to convert your BIRDMAn fit into this data structure.
 
 The ``coords`` & ``dims`` arguments follow the ``xarray`` data structure (as that is what ``arviz`` wraps). In this example we want to keep the fitted ``beta`` and ``phi`` parameters and we annotate the dimensions as ``covariate × feature`` and ``feature`` respectively. We also provide the covariate names and feature names. Next, we specify that we want to include the ``log likelihood`` and ``posterior predictive`` values we calculated while model fitting. These will be useful for diagnostics. We include the observed data (the original counts in the feature table) so we can see how well our model did.
 
 .. code-block:: python
 
-    inference = nb.to_inference_object(
-        params=["beta", "phi"],
-        coords={
-            "feature": nb.feature_names,
-            "covariate": nb.colnames,
-        },
-        dims={
-            "beta": ["covariate", "feature"],
-            "phi": ["feature"],
-        },
-        alr_params=["beta"],
-        posterior_predictive="y_predict",
-        log_likelihood="log_lik",
-        include_observed_data=True
-    )
+    inference = nb.to_inference_object()
 
 Finally, we'll plot the feature differentials and their standard deviations. We specify that we are interested in the ``diet[T.DIO]`` differentials but you can easily plot whichever parameter you like through the combination of the ``parameter`` and ``coord`` arguments.
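
For reference, the plotting step alluded to here might look like the sketch below; ``plot_parameter_estimates`` and its exact signature are assumptions based on BIRDMAn's visualization module, so check the current API before copying:

.. code-block:: python

    import birdman.visualization as viz

    # posterior mean and standard deviation of each feature's differential
    # for the diet[T.DIO] covariate
    ax = viz.plot_parameter_estimates(
        inference,
        parameter="beta",
        coord={"covariate": "diet[T.DIO]"},
    )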
