Commit fcf2e61

Merge pull request #23 from gibsramen/fix-log-lik

Update log-likelihood & posterior predictive import to inference

gibsramen authored Mar 31, 2021
2 parents 4906aa7 + 1eb7938
Showing 15 changed files with 131 additions and 244 deletions.
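
The user-facing effect of this PR: the default models now build their own ``coords``/``dims`` and name their posterior predictive (``y_predict``) and log-likelihood (``log_lhood``) variables internally, so converting a fit to ``arviz`` collapses to a single call. A before/after sketch, assuming a fitted ``NegativeBinomial`` model ``nb`` as in the docs example:

.. code-block:: python

    # Before: callers had to spell out parameters, coords, and dims themselves
    inference = nb.to_inference_object(
        params=["beta", "phi"],
        coords={"feature": nb.feature_names, "covariate": nb.colnames},
        dims={"beta": ["covariate", "feature"], "phi": ["feature"]},
        alr_params=["beta"],
        posterior_predictive="y_predict",
        log_likelihood="log_lik",
        include_observed_data=True,
    )

    # After: the default models supply all of that internally
    inference = nb.to_inference_object()
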
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@
 *.hpp
 *.DS_Store
 birdman/templates/negative_binomial
+birdman/templates/multinomial
 birdman/templates/negative_binomial_single
 birdman/templates/negative_binomial_lme
 tests/custom_model
71 changes: 70 additions & 1 deletion birdman/default_models.py
@@ -1,6 +1,7 @@
 import os
 from pkg_resources import resource_filename
 
+import arviz as az
 import biom
 import numpy as np
 import pandas as pd
@@ -87,11 +88,45 @@ def __init__(
                          seed, parallelize_across)
 
         param_dict = {
+            "depth": np.log(table.sum(axis="sample")),  # sampling depths
             "B_p": beta_prior,
             "phi_s": cauchy_scale
         }
         self.add_parameters(param_dict)
 
+    def to_inference_object(self) -> az.InferenceData:
+        """Convert fitted Stan model into ``arviz`` InferenceData object.
+
+        :returns: ``arviz`` InferenceData object with selected values
+        :rtype: az.InferenceData
+        """
+        dims = {
+            "beta": ["covariate", "feature"],
+            "phi": ["feature"],
+            "log_lhood": ["tbl_sample", "feature"],
+            "y_predict": ["tbl_sample", "feature"]
+        }
+        coords = {
+            "covariate": self.colnames,
+            "feature": self.feature_names,
+            "tbl_sample": self.sample_names
+        }
+
+        # TODO: May want to allow not passing PP/LL/OD in the future
+        args = dict()
+        if self.parallelize_across == "chains":
+            args["alr_params"] = ["beta"]
+        inf = super().to_inference_object(
+            params=["beta", "phi"],
+            dims=dims,
+            coords=coords,
+            posterior_predictive="y_predict",
+            log_likelihood="log_lhood",
+            include_observed_data=True,
+            **args
+        )
+        return inf
 
 
 class NegativeBinomialLME(Model):
     """Fit count data using negative binomial model considering subject as
@@ -148,7 +183,7 @@ class NegativeBinomialLME(Model):
     :type cauchy_scale: float
     :param group_var_prior: Standard deviation for normally distributed prior
-        prior values of group_var, defaults to 1.0
+        values of group_var, defaults to 1.0
     :type group_var_prior: float
     """
     def __init__(
@@ -175,6 +210,7 @@ def __init__(
         self.groups = np.sort(group_var_series.unique())
 
         param_dict = {
+            "depth": np.log(table.sum(axis="sample")),  # sampling depths
             "B_p": beta_prior,
             "phi_s": cauchy_scale,
             "S": len(group_var_series.unique()),
@@ -183,6 +219,38 @@ def __init__(
         }
         self.add_parameters(param_dict)
 
+    def to_inference_object(self) -> az.InferenceData:
+        """Convert fitted Stan model into ``arviz`` InferenceData object.
+
+        :returns: ``arviz`` InferenceData object with selected values
+        :rtype: az.InferenceData
+        """
+        dims = {
+            "beta": ["covariate", "feature"],
+            "phi": ["feature"],
+            "subj_int": ["group", "feature"],
+            "log_lhood": ["tbl_sample", "feature"],
+            "y_predict": ["tbl_sample", "feature"]
+        }
+        coords = {
+            "covariate": self.colnames,
+            "feature": self.feature_names,
+            "tbl_sample": self.sample_names,
+            "group": self.groups
+        }
+
+        # TODO: May want to allow not passing PP/LL/OD in the future
+        inf = super().to_inference_object(
+            params=["beta", "phi"],
+            dims=dims,
+            coords=coords,
+            posterior_predictive="y_predict",
+            log_likelihood="log_lhood",
+            alr_params=["beta", "subj_int"],
+            include_observed_data=True,
+        )
+        return inf
 
 
 class Multinomial(Model):
     """Fit count data using serial multinomial model.
Expand Down Expand Up @@ -237,6 +305,7 @@ def __init__(
                          seed, parallelize_across="chains")
 
         param_dict = {
+            "depth": table.sum(axis="sample").astype(int),  # sampling depths
             "B_p": beta_prior,
         }
         self.add_parameters(param_dict)
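
As a usage sketch (not part of the diff): with these defaults wired in, fitting and converting the negative binomial model reduces to the following. ``table``, ``metadata``, and the ``"diet"`` formula are stand-ins from the docs example, and the constructor and fit method names are taken from the documented BIRDMAn API:

.. code-block:: python

    from birdman import NegativeBinomial

    # table: biom.Table, metadata: pd.DataFrame (assumed already loaded)
    nb = NegativeBinomial(table=table, formula="diet", metadata=metadata)
    nb.compile_model()
    nb.fit_model()

    # coords, dims, y_predict, and log_lhood are now handled internally
    inference = nb.to_inference_object()
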
4 changes: 0 additions & 4 deletions birdman/model_base.py
@@ -5,7 +5,6 @@
 import biom
 from cmdstanpy import CmdStanModel, CmdStanMCMC
 import dask
-import numpy as np
 import pandas as pd
 from patsy import dmatrix
 
@@ -73,7 +72,6 @@ def __init__(
"D": table.shape[0],
"N": table.shape[1], # number of samples
"p": self.dmat.shape[1], # number of covariates
"depth": np.log(table.sum(axis="sample")), # sampling depths
"x": self.dmat.values, # design matrix
}

@@ -206,15 +204,13 @@ def to_inference_object(
"dims": dims,
"posterior_predictive": posterior_predictive,
"log_likelihood": log_likelihood,
"sample_names": self.sample_names,
}
if isinstance(self.fit, CmdStanMCMC):
fit_to_inference = single_fit_to_inference
args["alr_params"] = alr_params
elif isinstance(self.fit, Sequence):
fit_to_inference = multiple_fits_to_inference
args["concatenation_name"] = concatenation_name
args["feature_names"] = self.feature_names
# TODO: Check that dims and concatenation_match

if alr_params is not None:
52 changes: 7 additions & 45 deletions birdman/model_util.py
@@ -17,7 +17,6 @@ def single_fit_to_inference(
     alr_params: Sequence[str] = None,
     posterior_predictive: str = None,
     log_likelihood: str = None,
-    sample_names: Sequence[str] = None,
 ) -> az.InferenceData:
     """Convert fitted Stan model into inference object.
@@ -44,30 +43,18 @@
     :param log_likelihood: Name of log likelihood values from Stan model
         to include in ``arviz`` InferenceData object
     :type log_likelihood: str, optional
-    :param sample_names: Sample names to label PP and/or LL
-    :type sample_names: Sequence[str]
     :returns: ``arviz`` InferenceData object with selected values
     :rtype: az.InferenceData
     """
     # remove alr params so initial dim fitting works
     new_dims = {k: v for k, v in dims.items() if k not in alr_params}
 
-    extra_dims = dict()
-    extra_coords = dict()
-    if log_likelihood is not None:
-        extra_dims.update({log_likelihood: ["tbl_sample", "feature"]})
-        extra_coords.update({"tbl_sample": sample_names})
-    if posterior_predictive is not None:
-        extra_dims.update({posterior_predictive: ["tbl_sample", "feature"]})
-        extra_coords.update({"tbl_sample": sample_names})
-
-    new_dims.update(extra_dims)
+    if log_likelihood is not None and log_likelihood not in dims:
+        raise KeyError("Must include dimensions for log-likelihood!")
+    if posterior_predictive is not None and posterior_predictive not in dims:
+        raise KeyError("Must include dimensions for posterior predictive!")
 
     inference = az.from_cmdstanpy(
         posterior=fit,
-        coords={**coords, **extra_coords},
+        coords=coords,
         log_likelihood=log_likelihood,
         posterior_predictive=posterior_predictive,
         dims=new_dims
@@ -120,10 +107,8 @@ def multiple_fits_to_inference(
     coords: dict,
     dims: dict,
     concatenation_name: str,
-    feature_names: Sequence[str],
     posterior_predictive: str = None,
     log_likelihood: str = None,
-    sample_names: Sequence[str] = None,
 ) -> az.InferenceData:
     """Save fitted parameters to xarray DataSet for multiple fits.
@@ -151,12 +136,6 @@
         to include in ``arviz`` InferenceData object
     :type log_likelihood: str, optional
-    :param sample_names: Sample names to label PP and/or LL
-    :type sample_names: Sequence[str]
-    :param feature_names: Feature names to label concatenation
-    :type feature_name: Sequence[str]
     :returns: ``arviz`` InferenceData object with selected values
     :rtype: az.InferenceData
     """
@@ -197,12 +176,10 @@
     group_dict = {"posterior": po_ds, "sample_stats": ss_ds}
 
     if log_likelihood is not None:
-        ll_ds = _concat_table_draws(log_likelihood, ll_list, sample_names,
-                                    "feature")
+        ll_ds = xr.concat(ll_list, concatenation_name)
         group_dict["log_likelihood"] = ll_ds
     if posterior_predictive is not None:
-        pp_ds = _concat_table_draws(posterior_predictive, pp_list,
-                                    sample_names, "feature")
+        pp_ds = xr.concat(pp_list, concatenation_name)
         group_dict["posterior_predictive"] = pp_ds
 
     all_group_inferences = []
@@ -232,18 +209,3 @@ def _drop_data(
             dims_to_drop.append(dim)
     new_dataset = new_dataset.drop_dims(dims_to_drop)
     return new_dataset
-
-
-def _concat_table_draws(
-    group: str,
-    da_list: Sequence[xr.DataArray],
-    sample_names: Sequence[str],
-    concatenation_name: str
-) -> xr.Dataset:
-    """For posterior predictive & log likelihood."""
-    ds = xr.concat(da_list, concatenation_name)
-    dim_name = f"{group}_dim_0"
-    ds = ds.rename_dims({dim_name: "tbl_sample"})
-    ds = ds.assign_coords({"tbl_sample": sample_names})
-    ds = ds.reset_coords([dim_name], drop=True)
-    return ds
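
With ``tbl_sample`` labeling handled upstream by the callers' ``dims``/``coords``, the per-feature draws can be stitched together with a plain ``xarray`` concat along the new ``"feature"`` dimension, which is all the removed helper was doing beyond relabeling. A toy sketch with made-up sizes:

.. code-block:: python

    import numpy as np
    import xarray as xr

    # one DataArray of log-likelihood draws per feature: (chain, draw, tbl_sample)
    ll_list = [
        xr.DataArray(np.random.randn(4, 100, 10),
                     dims=["chain", "draw", "tbl_sample"])
        for _ in range(3)
    ]

    # concat prepends the new dimension: (feature, chain, draw, tbl_sample)
    ll_ds = xr.concat(ll_list, "feature")
    print(ll_ds.sizes)  # {'feature': 3, 'chain': 4, 'draw': 100, 'tbl_sample': 10}
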
5 changes: 2 additions & 3 deletions birdman/templates/multinomial.stan
@@ -2,7 +2,7 @@ data {
   int<lower=0> N;     // number of samples
   int<lower=0> D;     // number of dimensions
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbes
+  int depth[N];       // sequencing depths of microbes
   matrix[N, p] x;     // covariate matrix
   int y[N, D];        // observed microbe abundances
   real<lower=0> B_p;  // stdev for Beta Normal prior
@@ -19,9 +19,8 @@ transformed parameters {
   vector[N] z;
   simplex[D] theta[N];
 
-  z = to_vector(rep_array(0, N));
   lam = x * beta;
-  lam_clr = append_col(z, lam);
+  lam_clr = append_col(to_vector(rep_array(0, N)), lam);
   for (n in 1:N){
     theta[n] = softmax(to_vector(lam_clr[n,]));
   }
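
The type change reflects how each default model consumes depths: the negative binomial templates take log depths as real-valued offsets on the log scale, while the multinomial template needs raw integer totals. A toy illustration of the corresponding ``param_dict`` entries from ``default_models.py``, with a made-up count matrix standing in for a ``biom.Table``:

.. code-block:: python

    import numpy as np

    counts = np.array([[10, 5, 0],
                       [3, 8, 2]])  # 2 samples x 3 features

    # negative binomial models: log sequencing depth, added to lam_clr as an offset
    depth_nb = np.log(counts.sum(axis=1))        # [log(15), log(13)]

    # multinomial model: integer totals, consumed directly by the likelihood
    depth_mult = counts.sum(axis=1).astype(int)  # [15, 13]
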
10 changes: 4 additions & 6 deletions birdman/templates/negative_binomial.stan
@@ -2,7 +2,7 @@ data {
   int<lower=0> N;     // number of samples
   int<lower=0> D;     // number of dimensions
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbes
+  real depth[N];      // log sequencing depths of microbes
   matrix[N, p] x;     // covariate matrix
   int y[N, D];        // observed microbe abundances
   real<lower=0> B_p;  // stdev for Beta Normal prior
@@ -18,16 +18,14 @@
 transformed parameters {
   matrix[N, D-1] lam;
   matrix[N, D] lam_clr;
-  vector[N] z;
   vector<lower=0>[D] phi;
 
   for (i in 1:D){
     phi[i] = 1. / reciprocal_phi[i];
   }
 
-  z = to_vector(rep_array(0, N));
   lam = x * beta;
-  lam_clr = append_col(z, lam);
+  lam_clr = append_col(to_vector(rep_array(0, N)), lam);
 }
 
 model {
@@ -50,12 +48,12 @@

 generated quantities {
   matrix[N, D] y_predict;
-  matrix[N, D] log_lik;
+  matrix[N, D] log_lhood;
 
   for (n in 1:N){
     for (i in 1:D){
       y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]);
-      log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
+      log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
     }
   }
 }
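
One payoff of exporting the log-likelihood into the ``InferenceData`` object is model assessment with standard ``arviz`` tooling. A minimal sketch, assuming ``inference`` is the object returned by ``to_inference_object()``:

.. code-block:: python

    import arviz as az

    # PSIS-LOO cross-validation computed from the saved log_lhood draws;
    # az.loo reads the log_likelihood group populated during conversion
    loo = az.loo(inference)
    print(loo)
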
6 changes: 3 additions & 3 deletions birdman/templates/negative_binomial_lme.stan
@@ -3,7 +3,7 @@ data {
   int<lower=0> S;     // number of groups (subjects)
   int<lower=0> D;     // number of dimensions
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbes
+  real depth[N];      // log sequencing depths of microbes
   matrix[N, p] x;     // covariate matrix
   int<lower=1, upper=S> subj_ids[N];  // mapping of samples to subject IDs
   int y[N, D];        // observed microbe abundances
@@ -58,12 +58,12 @@ model {

 generated quantities {
   matrix[N, D] y_predict;
-  matrix[N, D] log_lik;
+  matrix[N, D] log_lhood;
 
   for (n in 1:N){
     for (i in 1:D){
       y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]);
-      log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
+      log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
     }
   }
 }
6 changes: 3 additions & 3 deletions birdman/templates/negative_binomial_single.stan
@@ -1,7 +1,7 @@
 data {
   int<lower=0> N;     // number of samples
   int<lower=0> p;     // number of covariates
-  real depth[N];      // sequencing depths of microbe
+  real depth[N];      // log sequencing depths of microbe
   matrix[N, p] x;     // covariate matrix
   int y[N];           // observed microbe abundances
   real<lower=0> B_p;  // stdev for Beta Normal prior
@@ -35,11 +35,11 @@ model {
 }
 
 generated quantities {
-  vector[N] log_lik;
+  vector[N] log_lhood;
   vector[N] y_predict;
 
   for (n in 1:N){
     y_predict[n] = neg_binomial_2_log_rng(depth[n] + lam[n], phi);
-    log_lik[n] = neg_binomial_2_log_lpmf(y[n] | depth[n] + lam[n], phi);
+    log_lhood[n] = neg_binomial_2_log_lpmf(y[n] | depth[n] + lam[n], phi);
   }
 }
18 changes: 1 addition & 17 deletions docs/default_model_example.rst
@@ -65,25 +65,9 @@
 Now we have our parameter estimates which we can use in downstream analyses. Many of BIRDMAn's included analysis & visualization functions take an ``arviz.InferenceData`` object. We provide a simple method to convert your BIRDMAn fit into this data structure.
 
 The ``coords`` & ``dims`` arguments follow the ``xarray`` data structure (as that is what ``arviz`` wraps). In this example we want to keep the fitted ``beta`` and ``phi`` parameters and we annotate the dimensions as ``covariate × feature`` and ``feature`` respectively. We also provide the covariate names and feature names. Next, we specify that we want to include the ``log likelihood`` and ``posterior predictive`` values we calculated while model fitting. These will be useful for diagnostics. We include the observed data (the original counts in the feature table) so we can see how well our model did.
 
 .. code-block:: python
 
-    inference = nb.to_inference_object(
-        params=["beta", "phi"],
-        coords={
-            "feature": nb.feature_names,
-            "covariate": nb.colnames,
-        },
-        dims={
-            "beta": ["covariate", "feature"],
-            "phi": ["feature"],
-        },
-        alr_params=["beta"],
-        posterior_predictive="y_predict",
-        log_likelihood="log_lik",
-        include_observed_data=True
-    )
+    inference = nb.to_inference_object()
 
 Finally, we'll plot the feature differentials and their standard deviations. We specify that we are interested in the ``diet[T.DIO]`` differentials but you can easily plot whichever parameter you like through the combination of the ``parameter`` and ``coord`` arguments.
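
For reference, the plotting step alluded to here might look like the sketch below; ``plot_parameter_estimates`` and its exact signature are assumptions based on BIRDMAn's visualization module, so check the current API before copying:

.. code-block:: python

    import birdman.visualization as viz

    # posterior mean and standard deviation of each feature's differential
    # for the diet[T.DIO] covariate
    ax = viz.plot_parameter_estimates(
        inference,
        parameter="beta",
        coord={"covariate": "diet[T.DIO]"},
    )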
