Implement compute_log_likelihood method (#769)

bambinos · Apr 1, 2024 · ed1e936 · ed1e936
1 parent 714ccb7
commit ed1e936
Show file tree

Hide file tree

Showing 7 changed files with 285 additions and 66 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 * Interpet submodule now outputs informative messages when computing default values (#745) 
 * Bambi supports weighted responses (#761)
 * Bambi supports constrained responses (#764)
+* Implement `compute_log_likelihood()` method to compute the log likelihood on a model (#769)
 
 ### Maintenance and fixes
 

diff --git a/bambi/families/family.py b/bambi/families/family.py
@@ -5,7 +5,7 @@
 import xarray as xr
 
 from bambi.families.link import Link
-from bambi.utils import get_auxiliary_parameters, get_aliased_name
+from bambi.utils import get_auxiliary_parameters, get_aliased_name, response_evaluate_new_data
 
 
 class Family:
@@ -133,68 +133,146 @@ def posterior_predictive(self, model, posterior, **kwargs):
         """
         response_dist = get_response_dist(model.family)
         response_term = model.response_component.response_term
-        params = model.family.likelihood.params
+        kwargs, coords = self._make_dist_kwargs_and_coords(model, posterior, **kwargs)
+
+        # Handle constrained responses
+        if response_term.is_constrained:
+            # Bounds are scalars, we can safely pick them from the first row
+            lower, upper = response_term.data[0, 1:]
+            lower = lower if lower != -np.inf else None
+            upper = upper if upper != np.inf else None
+            output_array = pm.draw(
+                pm.Truncated.dist(response_dist.dist(**kwargs), lower=lower, upper=upper)
+            )
+        else:
+            output_array = pm.draw(response_dist.dist(**kwargs))
+
+        return xr.DataArray(output_array, coords=coords)
+
+    def log_likelihood(self, model, posterior, data, **kwargs):
+        """Evaluate the model log-likelihood
+
+        This method uses `pm.logp()`.
+
+        Parameters
+        ----------
+        model : bambi.Model
+            The model
+        posterior : xr.Dataset
+            The xarray dataset that contains the draws for all the parameters in the posterior.
+            It must contain the parameters that are needed in the distribution of the response, or
+            the parameters that allow to derive them.
+        kwargs :
+            Parameters that are used to get draws but do not appear in the posterior object or
+            other configuration parameters.
+            For instance, the 'n' in binomial models and multinomial models.
+
+        Returns
+        -------
+        xr.DataArray
+            A data array with the value of the log-likelihood for each chain, draw, and value
+            of the response variable.
+        """
+        # Child classes pass "y_values" through the "y" kwarg
+        y_values = kwargs.pop("y", None)
+
+        # Get the values of the outcome variable
+        if y_values is None:  # when it's not handled by the specific family
+            if data is None:
+                y_values = np.squeeze(model.response_component.response_term.data)
+            else:
+                y_values = response_evaluate_new_data(model, data)
+
+        response_dist = get_response_dist(model.family)
+        response_term = model.response_component.response_term
+        kwargs, coords = self._make_dist_kwargs_and_coords(model, posterior, **kwargs)
+
+        # If it's multivariate, it's going to have a fourth coord, but we actually don't need it
+        # We just need "chain", "draw", "obs"
+        coords = dict(list(coords.items())[:3])
+
+        # Handle constrained responses
+        if response_term.is_constrained:
+            # Bounds are scalars, we can safely pick them from the first row
+            lower, upper = response_term.data[0, 1:]
+            lower = lower if lower != -np.inf else None
+            upper = upper if upper != np.inf else None
+            output_array = pm.logp(
+                pm.Truncated.dist(response_dist.dist(**kwargs), lower=lower, upper=upper), y_values
+            ).eval()
+        else:
+            output_array = pm.logp(response_dist.dist(**kwargs), y_values).eval()
+
+        return xr.DataArray(output_array, coords=coords)
+
+    def _make_dist_kwargs_and_coords(self, model, posterior, **kwargs):
+        """Get kwargs and coordinates
+
+        This utility method generates two things:
+
+        * A dictionary that maps the names of the likelihood parameters to draws from the
+        posterior distribtuion.
+        * An `xr.Coordinates` object with the coordinates required for the response. For example:
+        `(chain, draw, y_obs)` or `(chain, draw, y_obs, y_dim)`.
+
+        It was created to abstract repetitive logic used in both `.posterior_predictive()` and
+        `log_likelihood()`.
+        """
+        response_term = model.response_component.response_term
         response_aliased_name = get_aliased_name(response_term)
+        params = model.family.likelihood.params
+
+        # Remove the 'data' kwarg
+        kwargs.pop("data", None)
 
-        kwargs.pop("data", None)  # Remove the 'data' kwarg
+        # Get list of variables to ignore when reshaping
         dont_reshape = kwargs.pop("dont_reshape", [])
-        output_dataset_list = []
 
-        # In the posterior xr.Dataset we need to consider aliases,
-        # but we don't use aliases when passing kwargs to the PyMC distribution.
+        # Collect coordinates from all the likelihood parameters
+        params_coords = xr.Coordinates()
+
         for param in params:
-            # Extract posterior draws for the parent parameter
+            # In the posterior xr.Dataset we need to consider aliases,
+            # but we can't use aliases when passing kwargs to the PyMC distribution.
             if param == model.family.likelihood.parent:
                 component = model.components[model.response_name]
                 var_name = response_aliased_name + "_mean"
-                kwargs[param] = posterior[var_name].to_numpy()
-                output_dataset_list.append(posterior[var_name])
             else:
-                # Extract posterior draws for non-parent parameters
                 component = model.components[param]
                 if component.alias:
                     var_name = component.alias
                 else:
                     var_name = f"{response_aliased_name}_{param}"
 
-                if var_name in posterior:
-                    kwargs[param] = posterior[var_name].to_numpy()
-                    output_dataset_list.append(posterior[var_name])
-                elif hasattr(component, "prior") and isinstance(component.prior, (int, float)):
-                    kwargs[param] = np.asarray(component.prior)
-                else:
-                    raise ValueError(
-                        "Non-parent parameter not found in posterior."
-                        "This error shouldn't have happened!"
-                    )
+            # Get posterior draws or a constant array if it was set to a constant in the prior
+            if var_name in posterior:
+                kwargs[param] = posterior[var_name].to_numpy()
+                params_coords = params_coords.merge(posterior[var_name].coords)
+            elif hasattr(component, "prior") and isinstance(component.prior, (int, float)):
+                kwargs[param] = np.asarray(component.prior)
+            else:
+                raise ValueError(
+                    "Non-parent parameter not found in posterior."
+                    "This error shouldn't have happened!"
+                )
 
         # Determine the array with largest number of dimensions
         ndims_max = max(x.ndim for x in kwargs.values())
 
-        # Append a dimension when needed. Required to make `pm.draw()` work.
+        # Append a dimension when needed. Required to make `pm.logp()` and `pm.draw()` work.
         # However, some distributions like Multinomial, require some parameters to be of a smaller
         # dimension than others (n.ndim <= p.ndim - 1) so we don't reshape those.
         for key, values in kwargs.items():
             if key in dont_reshape:
                 continue
             kwargs[key] = expand_array(values, ndims_max)
 
+        # Sometimes the model uses a parametrization but we evaluate logp using another one
         if hasattr(model.family, "transform_kwargs"):
             kwargs = model.family.transform_kwargs(kwargs)
 
-        # Handle constrained responses
-        if response_term.is_constrained:
-            # Bounds are scalars, we can safely pick them from the first row
-            lower, upper = response_term.data[0, 1:]
-            lower = lower if lower != -np.inf else None
-            upper = upper if upper != np.inf else None
-            output_array = pm.draw(
-                pm.Truncated.dist(response_dist.dist(**kwargs), lower=lower, upper=upper)
-            )
-        else:
-            output_array = pm.draw(response_dist.dist(**kwargs))
-
-        output_coords_all = xr.merge(output_dataset_list).coords
+        # Get the actual coords as 'params_coords' is an object of type Dataset
+        params_coords = params_coords.coords
 
         coord_names = ["chain", "draw", response_aliased_name + "_obs"]
         is_multivariate = hasattr(model.family, "KIND") and model.family.KIND == "Multivariate"
@@ -204,12 +282,13 @@ def posterior_predictive(self, model, posterior, **kwargs):
         elif hasattr(model.family, "create_extra_pps_coord"):
             new_coords = model.family.create_extra_pps_coord()
             coord_names.append(response_aliased_name + "_dim")
-            output_coords_all[response_aliased_name + "_dim"] = new_coords
+            params_coords[response_aliased_name + "_dim"] = new_coords
 
-        output_coords = {}
+        coords = {}
         for coord_name in coord_names:
-            output_coords[coord_name] = output_coords_all[coord_name]
-        return xr.DataArray(output_array, coords=output_coords)
+            coords[coord_name] = params_coords[coord_name]
+
+        return kwargs, coords
 
     def __str__(self):
         msg_list = [f"Family: {self.name}", f"Likelihood: {self.likelihood}", f"Link: {self.link}"]

diff --git a/bambi/families/multivariate.py b/bambi/families/multivariate.py
@@ -5,7 +5,7 @@
 
 from bambi.families.family import Family
 from bambi.transformations import transformations_namespace
-from bambi.utils import extract_argument_names, get_aliased_name
+from bambi.utils import extract_argument_names, get_aliased_name, response_evaluate_new_data
 
 
 class MultivariateFamily(Family):
@@ -40,6 +40,23 @@ def posterior_predictive(self, model, posterior, **kwargs):
         dont_reshape = ["n"]
         return super().posterior_predictive(model, posterior, n=n, dont_reshape=dont_reshape)
 
+    def log_likelihood(self, model, posterior, data, **kwargs):
+        if data is None:
+            y = model.response_component.response_term.data
+            trials = model.response_component.response_term.data.sum(1).astype(int)
+        else:
+            y = response_evaluate_new_data(model, data).astype(int)
+            trials = y.sum(1).astype(int)
+
+        # Prepend 'draw' and 'chain' dimensions
+        y = y[np.newaxis, np.newaxis, :]
+        trials = trials[np.newaxis, np.newaxis, :]
+
+        dont_reshape = ["n"]
+        return super().log_likelihood(
+            model, posterior, data=None, y=y, n=trials, dont_reshape=dont_reshape, **kwargs
+        )
+
     def get_coords(self, response):
         # For the moment, it always uses the first column as reference.
         name = get_aliased_name(response) + "_reduced_dim"

diff --git a/bambi/families/univariate.py b/bambi/families/univariate.py
@@ -4,7 +4,7 @@
 import xarray as xr
 
 from bambi.families.family import Family
-from bambi.utils import get_aliased_name
+from bambi.utils import get_aliased_name, response_evaluate_new_data
 
 
 class UnivariateFamily(Family):
@@ -22,6 +22,20 @@ def posterior_predictive(self, model, posterior, **kwargs):
         trials = trials[np.newaxis, np.newaxis, :]
         return super().posterior_predictive(model, posterior, n=trials)
 
+    def log_likelihood(self, model, posterior, data, **kwargs):
+        if data is None:
+            y = model.response_component.response_term.data[:, 0]
+            trials = model.response_component.response_term.data[:, 1]
+        else:
+            output = response_evaluate_new_data(model, data).astype(int)
+            y = output[:, 0]
+            trials = output[:, 1]
+
+        # Prepend 'draw' and 'chain' dimensions
+        y = y[np.newaxis, np.newaxis, :]
+        trials = trials[np.newaxis, np.newaxis, :]
+        return super().log_likelihood(model, posterior, data=None, y=y, n=trials, **kwargs)
+
     @staticmethod
     def transform_backend_kwargs(kwargs):
         observed = kwargs.pop("observed")