diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 704ff95e..7ca002e7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -61,4 +61,4 @@ jobs:
           python --version
           python -m pytest --version
           firedrake-status
-          python -m pytest -v -n 6 --durations=40 --timeout=600 --cov=asQ --cov-report=term tests/
+          python -m pytest -v -n 6 --durations=40 --timeout=420 --cov=asQ --cov-report=term tests/
diff --git a/asQ/allatonce/form.py b/asQ/allatonce/form.py
index 6aa2083f..e6a1e755 100644
--- a/asQ/allatonce/form.py
+++ b/asQ/allatonce/form.py
@@ -1,8 +1,7 @@
 import firedrake as fd
-from firedrake.petsc import PETSc
 
 from asQ.profiling import profiler
-from asQ.allatonce import AllAtOnceFunction, AllAtOnceCofunction
+from asQ.allatonce import AllAtOnceCofunction
 from asQ.allatonce.mixin import TimePartitionMixin
 
 from functools import partial
@@ -59,8 +58,9 @@ def __init__(self,
         for bc in self.bcs:
             bc.apply(aaofunc.function)
 
-        # function to assemble the nonlinear residual into
-        self.F = aaofunc.copy(copy_values=False).zero()
+        # cofunction to assemble the nonlinear residual into
+        self.F = AllAtOnceCofunction(self.ensemble, self.time_partition,
+                                     aaofunc.field_function_space.dual())
 
         self.form = self._construct_form()
 
@@ -137,46 +137,39 @@ def assemble(self, func=None, tensor=None):
 
         :arg func: may optionally be an AllAtOnceFunction or a global PETSc Vec.
             if not None, the form will be evaluated at the state in `func`.
-        :arg tensor: may optionally be an AllAtOnceFunction, AllAtOnceCofunction,
-            a global PETSc Vec, or a Function in AllAtOnceFunction.function_space.
-            if not None, the result will be placed into `tensor`.
+        :arg tensor: may optionally be an AllAtOnceCofunction, in which case
+            the result will be placed into `tensor`.
         """
-        # set current state
+        if tensor is not None and not isinstance(tensor, AllAtOnceCofunction):
+            raise TypeError(f"tensor must be an AllAtOnceCofunction, not {type(tensor)}")
+
+        # Set the current state
         if func is not None:
             self.aaofunc.assign(func, update_halos=False)
-        self.aaofunc.update_time_halos()
-
-        # assembly stage
-        fd.assemble(self.form, tensor=self.F.function)
 
-        # apply boundary conditions
+        # Assembly stage
+        # The residual on the DirichletBC nodes is set to zero,
+        # so we need to make sure that the function conforms
+        # with the boundary conditions.
         for bc in self.bcs:
-            bc.apply(self.F.function, u=self.aaofunc.function)
-
-        # copy into return buffer
-
-        if isinstance(tensor, AllAtOnceFunction):
-            tensor.assign(self.F)
+            bc.apply(self.aaofunc.function)
 
-        elif isinstance(tensor, fd.Function):
-            tensor.assign(self.F.function)
-
-        elif isinstance(tensor, AllAtOnceCofunction):
-            with self.F.global_vec_ro() as fvec:
-                with tensor.global_vec_wo() as tvec:
-                    fvec.copy(tvec)
-
-        elif isinstance(tensor, fd.Cofunction):
-            with self.F.function.dat.vec_ro as fvec:
-                with tensor.dat.vec_wo as tvec:
-                    fvec.copy(tvec)
+        # Update the halos after enforcing the bcs so we
+        # know they are correct. This doesn't make a
+        # difference now because we only support constant
+        # bcs, but it will be important once we support
+        # time-dependent bcs.
+        self.aaofunc.update_time_halos()
 
-        elif isinstance(tensor, PETSc.Vec):
-            with self.F.global_vec_ro() as fvec:
-                fvec.copy(tensor)
+        fd.assemble(self.form, bcs=self.bcs,
+                    tensor=self.F.cofunction)
 
-        elif tensor is not None:
-            raise TypeError(f"tensor must be AllAtOnceFunction, AllAtOnceCofunction, Function, Cofunction, or PETSc.Vec, not {type(tensor)}")
+        if tensor:
+            tensor.assign(self.F)
+            result = tensor
+        else:
+            result = self.F
+        return result
 
     def _construct_form(self):
         """
diff --git a/asQ/allatonce/jacobian.py b/asQ/allatonce/jacobian.py
index 6c20a788..55a3a02c 100644
--- a/asQ/allatonce/jacobian.py
+++ b/asQ/allatonce/jacobian.py
@@ -55,8 +55,11 @@ def __init__(self, aaoform,
         self.x = aaofunc.copy()
 
         # output residual, and contribution from timestep at end of previous slice
-        self.F = aaofunc.copy()
-        self.Fprev = fd.Function(aaofunc.function_space)
+        self.F = aaoform.F.copy(copy_values=False)
+        self.Fprev = fd.Cofunction(self.F.function_space)
+
+        self.bcs = aaoform.bcs
+        self.field_bcs = aaoform.field_bcs
 
         # working buffers for calculating time average when needed
         self.ureduce = fd.Function(aaofunc.field_function_space)
@@ -138,23 +141,46 @@ def mult(self, mat, X, Y):
         :arg X: a PETSc Vec to apply the action on.
         :arg Y: a PETSc Vec for the result.
         """
-
         # we could use nonblocking here and overlap comms with assembling form
         self.x.assign(X, update_halos=True, blocking=True)
 
+        # We use the same strategy as the implicit matrix context in firedrake
+        # for dealing with the boundary nodes. From the comments in that file:
+
+        # The matrix has an identity block corresponding to the Dirichlet
+        # boundary conditions.
+        # Our algorithm in this case is to save the BC values, zero them
+        # out before computing the action so that they don't pollute
+        # anything, and then set the values into the result.
+        # This has the effect of applying [ A_ii 0 ; 0 A_bb ] where A_ii
+        # is the block corresponding only to (non-fixed) internal dofs
+        # and A_bb=I is the identity block on the (fixed) boundary dofs.
+
+        # Zero the boundary nodes on the input so that A_ib = A_01 = 0
+        for bc in self.bcs:
+            bc.zero(self.x.function)
+
         # assembly stage
-        fd.assemble(self.action, tensor=self.F.function)
+        fd.assemble(self.action, bcs=self.bcs,
+                    tensor=self.F.cofunction)
+
         if self._useprev:
-            fd.assemble(self.action_prev, tensor=self.Fprev)
-            self.F.function += self.Fprev
-
-        # Apply boundary conditions
-        # For Jacobian action we should just return the values in X
-        # at boundary nodes
-        for bc in self.aaoform.bcs:
-            bc.homogenize()
-            bc.apply(self.F.function, u=self.x.function)
-            bc.restore()
+            # repeat for the halo part of the matrix action
+            for bc in self.field_bcs:
+                bc.zero(self.x.uprev)
+            fd.assemble(self.action_prev, bcs=self.bcs,
+                        tensor=self.Fprev)
+            self.F.cofunction += self.Fprev
+
+        if len(self.bcs) > 0:
+            Fbuf = self.Fprev  # just using Fprev as a working buffer
+            # Get the original values again
+            with Fbuf.dat.vec_wo as fvec:
+                X.copy(fvec)
+            # Set the output boundary nodes to the input boundary nodes.
+            # This is equivalent to setting [A_bi, A_bb] = [0 I]
+            for bc in self.bcs:
+                bc.set(self.F.cofunction, Fbuf)
 
         with self.F.global_vec_ro() as v:
             v.copy(Y)
diff --git a/asQ/allatonce/solver.py b/asQ/allatonce/solver.py
index 7b5a8344..0df08826 100644
--- a/asQ/allatonce/solver.py
+++ b/asQ/allatonce/solver.py
@@ -90,15 +90,15 @@ def passthrough(*args, **kwargs):
 
         self.snes.setOptionsPrefix(options_prefix)
 
-        # residual vector
-        self.F = aaofunc._vec.duplicate()
-
         def assemble_function(snes, X, F):
             self.pre_function_callback(self, X)
-            self.aaoform.assemble(X, tensor=F)
+            self.aaoform.assemble(X)
+            with aaoform.F.global_vec_ro() as fvec:
+                fvec.copy(F)
             self.post_function_callback(self, X, F)
 
-        self.snes.setFunction(assemble_function, self.F)
+        self._F = aaoform.F._vec.duplicate()
+        self.snes.setFunction(assemble_function, self._F)
 
         # Jacobian
         with self.options.inserted_options():
diff --git a/asQ/complex_proxy/vector_impl.py b/asQ/complex_proxy/vector_impl.py
index 550cf108..1f0bf39c 100644
--- a/asQ/complex_proxy/vector_impl.py
+++ b/asQ/complex_proxy/vector_impl.py
@@ -75,15 +75,11 @@ def DirichletBC(W, V, bc, function_arg=None):
     if function_arg is None:
         function_arg = bc.function_arg
 
-    sub_domain = bc.sub_domain
-
     if type(V.ufl_element()) is fd.MixedElement:
-        idx = bc.function_space().index
-        Ws = (W.sub(idx).sub(0), W.sub(idx).sub(1))
-    else:
-        Ws = (W.sub(0), W.sub(1))
+        W = W.sub(bc.function_space().index)
 
-    return tuple(fd.DirichletBC(Wsub, function_arg, sub_domain) for Wsub in Ws)
+    return tuple(fd.DirichletBC(W.sub(part), function_arg, bc.sub_domain)
+                 for part in (re, im))
 
 
 def split(u, i):
diff --git a/tests/allatonce/test_allatonceform.py b/tests/allatonce/test_allatonceform.py
index 44477948..cd48c3ce 100644
--- a/tests/allatonce/test_allatonceform.py
+++ b/tests/allatonce/test_allatonceform.py
@@ -6,11 +6,7 @@
 from operator import mul
 
 
-def assemble(form):
-    return fd.assemble(form).riesz_representation(riesz_map='l2')
-
-
-bc_opts = ["no_bcs", "homogeneous_bcs", "inhomogeneous_bcs"]
+bc_options = ["no_bcs", "homogeneous_bcs", "inhomogeneous_bcs"]
 
 alphas = [pytest.param(None, id="alpha_None"),
           pytest.param(0, id="alpha_0"),
@@ -18,9 +14,9 @@ def assemble(form):
 
 
 @pytest.mark.parallel(nprocs=4)
-@pytest.mark.parametrize("bc_opt", bc_opts)
+@pytest.mark.parametrize("bc_option", bc_options)
 @pytest.mark.parametrize("alpha", alphas)
-def test_heat_form(bc_opt, alpha):
+def test_heat_form(bc_option, alpha):
     """
     Test that assembling the AllAtOnceForm is the same as assembling the
     slice-local part of an all-at-once form for the whole timeseries.
@@ -28,7 +24,8 @@ def test_heat_form(bc_opt, alpha):
     """
 
     # build the all-at-once function
-    nslices = fd.COMM_WORLD.size//2
+    nspace_ranks = 1
+    nslices = fd.COMM_WORLD.size//nspace_ranks
     slice_length = 2
 
     time_partition = tuple((slice_length for _ in range(nslices)))
@@ -61,11 +58,11 @@ def form_function(u, v, t):
     def form_mass(u, v):
         return u*v*fd.dx
 
-    if bc_opt == "inhomogeneous_bcs":
+    if bc_option == "inhomogeneous_bcs":
         bc_val = fd.sin(2*fd.pi*x)
         bc_domain = "on_boundary"
         bcs = [fd.DirichletBC(V, bc_val, bc_domain)]
-    elif bc_opt == "homogeneous_bcs":
+    elif bc_option == "homogeneous_bcs":
         bc_val = 0.
         bc_domain = 1
         bcs = [fd.DirichletBC(V, bc_val, bc_domain)]
@@ -83,12 +80,12 @@ def form_mass(u, v):
     full_function_space = reduce(mul, (V for _ in range(sum(time_partition))))
     ufull = fd.Function(full_function_space)
 
-    if bc_opt == "no_bcs":
+    if bc_option == "no_bcs":
         bcs_full = []
     else:
-        bcs_full = []
-        for i in range(sum(time_partition)):
-            bcs_full.append(fd.DirichletBC(full_function_space.sub(i), bc_val, bc_domain))
+        bcs_full = [fd.DirichletBC(full_function_space.sub(i),
+                                   bc_val, bc_domain)
+                    for i in range(sum(time_partition))]
 
     vfull = fd.TestFunction(full_function_space)
     ufulls = fd.split(ufull)
@@ -119,21 +116,25 @@ def form_mass(u, v):
 
     # assemble and compare
     aaoform.assemble()
-    Ffull = assemble(fullform)
+
+    # aaoform.assemble enforces boundary
+    # conditions, but firedrake.assemble doesn't
+    # so we need to manually enforce them here.
     for bc in bcs_full:
-        bc.apply(Ffull, u=ufull)
+        bc.apply(ufull)
+    Ffull = fd.assemble(fullform, bcs=bcs_full)
 
     for step in range(aaofunc.nlocal_timesteps):
         windx = aaofunc.transform_index(step, from_range='slice', to_range='window')
         userial = Ffull.subfunctions[windx]
         uparallel = aaoform.F[step].subfunctions[0]
-        err = fd.errornorm(userial, uparallel)
-        assert (err < 1e-12), "Each component of AllAtOnceForm residual should match component of monolithic residual calculated locally"
+        for pdat, sdat in zip(uparallel.dat, userial.dat):
+            assert np.allclose(pdat.data, sdat.data), "Each component of AllAtOnceForm residual should match component of monolithic residual calculated locally"
 
 
-@pytest.mark.parametrize("bc_opt", bc_opts)
+@pytest.mark.parametrize("bc_option", bc_options)
 @pytest.mark.parallel(nprocs=4)
-def test_mixed_heat_form(bc_opt):
+def test_mixed_heat_form(bc_option):
     """
     Test that assembling the AllAtOnceForm is the same as assembling the
     slice-local part of an all-at-once form for the whole timeseries.
@@ -172,11 +173,11 @@ def form_function(u, p, v, q, t):
     def form_mass(u, p, v, q):
         return (fd.inner(u, v) + p*q)*fd.dx
 
-    if bc_opt == "inhomogeneous_bcs":
+    if bc_option == "inhomogeneous_bcs":
         bc_val = fd.as_vector([fd.sin(2*fd.pi*x), -fd.cos(fd.pi*y)])
         bc_domain = "on_boundary"
         bcs = [fd.DirichletBC(V.sub(0), bc_val, bc_domain)]
-    elif bc_opt == "homogeneous_bcs":
+    elif bc_option == "homogeneous_bcs":
         bc_val = fd.as_vector([0., 0.])
         bc_domain = "on_boundary"
         bcs = [fd.DirichletBC(V.sub(0), bc_val, bc_domain)]
@@ -191,14 +192,12 @@ def form_mass(u, p, v, q):
     full_function_space = reduce(mul, (V for _ in range(sum(time_partition))))
     ufull = fd.Function(full_function_space)
 
-    if bc_opt == "no_bcs":
+    if bc_option == "no_bcs":
         bcs_full = []
     else:
-        bcs_full = []
-        for i in range(sum(time_partition)):
-            bcs_full.append(fd.DirichletBC(full_function_space.sub(2*i),
-                                           bc_val,
-                                           bc_domain))
+        bcs_full = [fd.DirichletBC(full_function_space.sub(2*i),
+                                   bc_val, bc_domain)
+                    for i in range(sum(time_partition))]
 
     vfull = fd.TestFunction(full_function_space)
     ufulls = fd.split(ufull)
@@ -233,9 +232,13 @@ def form_mass(u, p, v, q):
 
     # assemble and compare
     aaoform.assemble()
-    Ffull = assemble(fullform)
+
+    # aaoform.assemble enforces boundary
+    # conditions, but firedrake.assemble doesn't
+    # so we need to manually enforce them here.
     for bc in bcs_full:
-        bc.apply(Ffull, u=ufull)
+        bc.apply(ufull)
+    Ffull = fd.assemble(fullform, bcs=bcs_full)
 
     for step in range(aaofunc.nlocal_timesteps):
         windices = aaofunc._component_indices(step, to_range='window')
@@ -243,8 +246,8 @@ def form_mass(u, p, v, q):
             windx = windices[cpt]
             userial = Ffull.subfunctions[windx]
             uparallel = aaoform.F[step].subfunctions[cpt]
-            err = fd.errornorm(userial, uparallel)
-            assert (err < 1e-12), "Each component of AllAtOnceForm residual should match component of monolithic residual calculated locally"
+            for pdat, sdat in zip(uparallel.dat, userial.dat):
+                assert np.allclose(pdat.data, sdat.data), "Each component of AllAtOnceForm residual should match component of monolithic residual calculated locally"
 
 
 @pytest.mark.parallel(nprocs=4)
diff --git a/tests/allatonce/test_allatoncejacobian.py b/tests/allatonce/test_allatoncejacobian.py
index bc9717ac..41ea79ec 100644
--- a/tests/allatonce/test_allatoncejacobian.py
+++ b/tests/allatonce/test_allatoncejacobian.py
@@ -6,12 +6,16 @@
 from operator import mul
 
 
-def assemble(form):
-    return fd.assemble(form).riesz_representation(riesz_map='l2')
+def assemble(form, *args, **kwargs):
+    return fd.assemble(form, *args, **kwargs).riesz_representation(riesz_map='l2')
+
+
+bc_options = ["no_bcs", "homogeneous_bcs", "inhomogeneous_bcs"]
 
 
 @pytest.mark.parallel(nprocs=4)
-def test_heat_jacobian():
+@pytest.mark.parametrize("bc_option", bc_options)
+def test_heat_jacobian(bc_option):
     """
     Test that the AllAtOnceJacobian is the same as the action of the
     slice-local part of the derivative of an all-at-once form for
@@ -19,13 +23,14 @@ def test_heat_jacobian():
     """
 
     # build the all-at-once function
-    nslices = fd.COMM_WORLD.size//2
+    nspace_ranks = 2
+    nslices = fd.COMM_WORLD.size//nspace_ranks
     slice_length = 2
 
     time_partition = tuple((slice_length for _ in range(nslices)))
     ensemble = asQ.create_ensemble(time_partition, comm=fd.COMM_WORLD)
 
-    mesh = fd.UnitSquareMesh(4, 4, comm=ensemble.comm)
+    mesh = fd.UnitSquareMesh(4, 4, quadrilateral=True, comm=ensemble.comm)
     x, y = fd.SpatialCoordinate(mesh)
     V = fd.FunctionSpace(mesh, "CG", 1)
 
@@ -50,8 +55,20 @@ def form_function(u, v, t):
     def form_mass(u, v):
         return u*v*fd.dx
 
+    if bc_option == "inhomogeneous_bcs":
+        bc_val = fd.sin(2*fd.pi*x)
+        bc_domain = "on_boundary"
+        bcs = [fd.DirichletBC(V, bc_val, bc_domain)]
+    elif bc_option == "homogeneous_bcs":
+        bc_val = 0.
+        bc_domain = 1
+        bcs = [fd.DirichletBC(V, bc_val, bc_domain)]
+    else:
+        bcs = []
+
     aaoform = asQ.AllAtOnceForm(aaofunc, dt, theta,
-                                form_mass, form_function)
+                                form_mass, form_function,
+                                bcs=bcs)
 
     # build the all-at-once jacobian
     aaojac = asQ.AllAtOnceJacobian(aaoform)
@@ -60,6 +77,13 @@ def form_mass(u, v):
     full_function_space = reduce(mul, (V for _ in range(sum(time_partition))))
     ufull = fd.Function(full_function_space)
 
+    if bc_option == "no_bcs":
+        bcs_full = []
+    else:
+        bcs_full = [fd.DirichletBC(full_function_space.sub(i),
+                                   bc_val, bc_domain)
+                    for i in range(sum(time_partition))]
+
     vfull = fd.TestFunction(full_function_space)
     ufulls = fd.split(ufull)
     vfulls = fd.split(vfull)
@@ -96,8 +120,8 @@ def form_mass(u, v):
         dat.data[:] = np.random.randn(*(dat.data.shape))
 
     # input/output vectors for aaojac
-    x = aaofunc.copy()
-    y = aaofunc.copy()
+    x = aaofunc.copy(copy_values=False)
+    y = aaoform.F.copy(copy_values=False)
     for step in range(aaofunc.nlocal_timesteps):
         windx = aaofunc.transform_index(step, from_range='slice', to_range='window')
         x[step].assign(xfull.subfunctions[windx])
@@ -107,20 +131,26 @@ def form_mass(u, v):
         aaojac.mult(None, xvec, yvec)
 
     # assemble the full jacobian
-    yfull = assemble(fd.action(full_jacobian, xfull))
+    jac_full = fd.assemble(full_jacobian, bcs=bcs_full,
+                           mat_type='matfree')
+    mat_full = jac_full.petscmat.getPythonContext()
 
-    # check they match
+    yfull = fd.Cofunction(full_function_space.dual())
+    with xfull.dat.vec_ro as xvec, yfull.dat.vec_wo as yvec:
+        mat_full.mult(None, xvec, yvec)
 
+    # check they match
     for step in range(aaofunc.nlocal_timesteps):
         windx = aaofunc.transform_index(step, from_range='slice', to_range='window')
         yserial = yfull.subfunctions[windx]
         yparallel = y[step].subfunctions[0]
-        err = fd.errornorm(yserial, yparallel)
-        assert (err < 1e-12), "Each component of AllAtOnceJacobian action should match component of monolithic action calculated locally"
+        for pdat, sdat in zip(yparallel.dat, yserial.dat):
+            assert np.allclose(pdat.data, sdat.data), f"Timestep {step} of AllAtOnceJacobian action doesn't match component of monolithic residual calculated locally"
 
 
 @pytest.mark.parallel(nprocs=4)
-def test_mixed_heat_jacobian():
+@pytest.mark.parametrize("bc_option", bc_options)
+def test_mixed_heat_jacobian(bc_option):
     """
     Test that the AllAtOnceJacobian is the same as the action of the
     slice-local part of the derivative of an all-at-once form for
@@ -128,7 +158,8 @@ def test_mixed_heat_jacobian():
     """
 
     # build the all-at-once function
-    nslices = fd.COMM_WORLD.size//2
+    nspace_ranks = 2
+    nslices = fd.COMM_WORLD.size//nspace_ranks
     slice_length = 2
 
     time_partition = tuple((slice_length for _ in range(nslices)))
@@ -160,8 +191,20 @@ def form_function(u, p, v, q, t):
     def form_mass(u, p, v, q):
         return (fd.inner(u, v) + p*q)*fd.dx
 
+    if bc_option == "inhomogeneous_bcs":
+        bc_val = fd.as_vector([fd.sin(2*fd.pi*x), -fd.cos(fd.pi*y)])
+        bc_domain = "on_boundary"
+        bcs = [fd.DirichletBC(V.sub(0), bc_val, bc_domain)]
+    elif bc_option == "homogeneous_bcs":
+        bc_val = fd.as_vector([0., 0.])
+        bc_domain = "on_boundary"
+        bcs = [fd.DirichletBC(V.sub(0), bc_val, bc_domain)]
+    else:
+        bcs = []
+
     aaoform = asQ.AllAtOnceForm(aaofunc, dt, theta,
-                                form_mass, form_function)
+                                form_mass, form_function,
+                                bcs=bcs)
 
     # build the all-at-once jacobian
     aaojac = asQ.AllAtOnceJacobian(aaoform)
@@ -170,6 +213,13 @@ def form_mass(u, p, v, q):
     full_function_space = reduce(mul, (V for _ in range(sum(time_partition))))
     ufull = fd.Function(full_function_space)
 
+    if bc_option == "no_bcs":
+        bcs_full = []
+    else:
+        bcs_full = [fd.DirichletBC(full_function_space.sub(2*i),
+                                   bc_val, bc_domain)
+                    for i in range(sum(time_partition))]
+
     vfull = fd.TestFunction(full_function_space)
     ufulls = fd.split(ufull)
     vfulls = fd.split(vfull)
@@ -210,8 +260,8 @@ def form_mass(u, p, v, q):
         dat.data[:] = np.random.randn(*(dat.data.shape))
 
     # input/output vectors for aaojac
-    x = aaofunc.copy()
-    y = aaofunc.copy()
+    x = aaofunc.copy(copy_values=False)
+    y = aaoform.F.copy(copy_values=False)
     for step in range(aaofunc.nlocal_timesteps):
         windices = aaofunc._component_indices(step, to_range='window')
         for cpt in range(2):
@@ -223,7 +273,13 @@ def form_mass(u, p, v, q):
         aaojac.mult(None, xvec, yvec)
 
     # assemble the full jacobian
-    yfull = assemble(fd.action(full_jacobian, xfull))
+    jac_full = fd.assemble(full_jacobian, bcs=bcs_full,
+                           mat_type='matfree')
+    mat_full = jac_full.petscmat.getPythonContext()
+
+    yfull = fd.Cofunction(full_function_space.dual())
+    with xfull.dat.vec_ro as xvec, yfull.dat.vec_wo as yvec:
+        mat_full.mult(None, xvec, yvec)
 
     for step in range(aaofunc.nlocal_timesteps):
         windices = aaofunc._component_indices(step, to_range='window')
@@ -231,5 +287,9 @@ def form_mass(u, p, v, q):
             windx = windices[cpt]
             yserial = yfull.subfunctions[windx]
             yparallel = y[step].subfunctions[cpt]
-            err = fd.errornorm(yserial, yparallel)
-            assert (err < 1e-12), "Each component of AllAtOnceJacobian action should match component of monolithic action calculated locally"
+            for pdat, sdat in zip(yparallel.dat, yserial.dat):
+                assert np.allclose(pdat.data, sdat.data), f"Timestep {step}, component {cpt}, of AllAtOnceJacobian action doesn't match component of monolithic residual calculated locally"
+
+
+if __name__ == '__main__':
+    pass
diff --git a/tests/allatonce/test_allatoncesolver.py b/tests/allatonce/test_allatoncesolver.py
index 41335de8..a5f84ad1 100644
--- a/tests/allatonce/test_allatoncesolver.py
+++ b/tests/allatonce/test_allatoncesolver.py
@@ -13,7 +13,8 @@ def test_solve_heat_equation_nopc():
 
     # set up space-time parallelism
 
-    nslices = fd.COMM_WORLD.size//2
+    nspace_ranks = 2
+    nslices = fd.COMM_WORLD.size//nspace_ranks
     slice_length = 2
 
     time_partition = tuple((slice_length for _ in range(nslices)))
@@ -46,22 +47,17 @@ def form_mass(u, v):
 
     # solver and options
 
-    atol = 1.0e-10
+    atol = 1e-10
     solver_parameters = {
         'snes_type': 'ksponly',
-        'snes': {
-            'monitor': None,
-            'converged_reason': None,
-        },
         'pc_type': 'none',
         'ksp_type': 'gmres',
         'mat_type': 'matfree',
         'ksp': {
-            'monitor': None,
             'converged_rate': None,
             'atol': atol,
-            'rtol': 1.0e-100,
-            'stol': 1.0e-100,
+            'rtol': 0,
+            'stol': 0,
         }
     }
 
@@ -71,11 +67,12 @@ def form_mass(u, v):
     aaosolver.solve()
 
     # check residual
-
     aaoform.assemble(func=aaofunc)
-    residual = fd.norm(aaoform.F.function)
 
-    assert residual < atol, "GMRES should converge to prescribed tolerance even without preconditioning"
+    residual = aaoform.F.cofunction.riesz_representation(
+        'l2', solver_options={'function_space': aaofunc.function.function_space()})
+
+    assert fd.norm(residual) < atol, "GMRES should converge to prescribed tolerance even without preconditioning"
 
 
 def test_solve_heat_equation_circulantpc():
@@ -123,21 +120,14 @@ def form_mass(u, v):
     atol = 1.0e-8
     solver_parameters = {
         'snes_type': 'ksponly',
-        'snes': {
-            'monitor': None,
-            'converged_reason': None,
-            'atol': atol,
-            'rtol': 1.0e-100,
-            'stol': 1.0e-100,
-        },
         'ksp_type': 'gmres',
         'mat_type': 'matfree',
         'ksp': {
             'monitor': None,
             'converged_rate': None,
             'atol': atol,
-            'rtol': 1.0e-100,
-            'stol': 1.0e-100,
+            'rtol': 0,
+            'stol': 0,
         },
         'pc_type': 'python',
         'pc_python_type': 'asQ.CirculantPC',
@@ -149,11 +139,12 @@ def form_mass(u, v):
     aaosolver.solve()
 
     # check residual
-
     aaoform.assemble(func=aaofunc)
-    residual = fd.norm(aaoform.F.function)
 
-    assert residual < atol, "GMRES should converge to prescribed tolerance with CirculantPC"
+    residual = aaoform.F.cofunction.riesz_representation(
+        'l2', solver_options={'function_space': aaofunc.function.function_space()})
+
+    assert fd.norm(residual) < atol, "GMRES should converge to prescribed tolerance with CirculantPC"
 
 
 extruded = [pytest.param(False, id="standard_mesh"),
@@ -174,7 +165,8 @@ def test_solve_mixed_wave_equation(extrude, cpx_type):
     """
 
     # space-time parallelism
-    nslices = fd.COMM_WORLD.size//2
+    nspace_ranks = 2
+    nslices = fd.COMM_WORLD.size//nspace_ranks
     slice_length = 2
 
     time_partition = tuple((slice_length for _ in range(nslices)))
@@ -186,8 +178,8 @@ def test_solve_mixed_wave_equation(extrude, cpx_type):
         mesh1D = fd.UnitIntervalMesh(nx, comm=ensemble.comm)
         mesh = fd.ExtrudedMesh(mesh1D, nx, layer_height=1./nx)
 
-        horizontal_degree = 1
-        vertical_degree = 1
+        horizontal_degree = 0
+        vertical_degree = 0
         S1 = fd.FiniteElement("CG", fd.interval, horizontal_degree+1)
         S2 = fd.FiniteElement("DG", fd.interval, horizontal_degree)
 
@@ -228,7 +220,7 @@ def test_solve_mixed_wave_equation(extrude, cpx_type):
 
     def form_function(uu, up, vu, vp, t):
         return (fd.div(vu) * up + c * fd.sqrt(fd.inner(uu, uu) + eps) * fd.inner(uu, vu)
-                - fd.div(uu) * vp) * fd.dx
+                - fd.div(uu) * vp) * fd.dx(degree=4)
 
     def form_mass(uu, up, vu, vp):
         return (fd.inner(uu, vu) + up * vp) * fd.dx
@@ -245,14 +237,14 @@ def form_mass(uu, up, vu, vp):
             'monitor': None,
             'converged_reason': None,
             'atol': atol,
-            'rtol': 1e-100,
-            'stol': 1e-100,
+            'rtol': 0,
+            'stol': 0,
         },
         'mat_type': 'matfree',
         'ksp_type': 'preonly',
         'ksp': {
             'monitor': None,
-            'converged_reason': None,
+            'converged_rate': None,
         },
         'pc_type': 'python',
         'pc_python_type': 'asQ.CirculantPC',
@@ -271,7 +263,7 @@ def form_mass(uu, up, vu, vp):
     aaosolver.solve()
 
     # check residual
-    aaoform.assemble(func=aaofunc)
-    residual = fd.norm(aaoform.F.function)
+    residual = aaoform.F.cofunction.riesz_representation(
+        'l2', solver_options={'function_space': aaofunc.function.function_space()})
 
-    assert (residual < atol), "GMRES should converge to prescribed tolerance with CirculantPC"
+    assert fd.norm(residual) < atol, "GMRES should converge to prescribed tolerance with CirculantPC"
diff --git a/tests/preconditioners/test_allatonce_pcs.py b/tests/preconditioners/test_allatonce_pcs.py
index 57499122..edac3f6c 100644
--- a/tests/preconditioners/test_allatonce_pcs.py
+++ b/tests/preconditioners/test_allatonce_pcs.py
@@ -259,7 +259,8 @@ def test_slicejacobipc_slice(nsteps):
         'slice_jacobi_nsteps': nsteps,
         'slice_jacobi_state': 'linear',
         'slice_jacobi_slice': {
-            'ksp_type': 'fgmres',
+            'ksp_converged_rate': None,
+            'ksp_type': 'richardson',
             'ksp_rtol': 1e-15,
             'pc_type': 'python',
             'pc_python_type': 'asQ.CirculantPC',
diff --git a/tests/test_paradiag.py b/tests/test_paradiag.py
index 1ac24b02..7e4e9c77 100644
--- a/tests/test_paradiag.py
+++ b/tests/test_paradiag.py
@@ -465,14 +465,20 @@ def test_steady_swe_miniapp():
                   pytest.param(True, id="extruded_mesh",
                                marks=pytest.mark.xfail(reason="fd.split for TensorProductElements in unmixed spaces broken by ufl PR#122."))]
 
+cpx_types = [pytest.param('vector', id="vector_cpx"),
+             pytest.param('mixed', id="mixed_cpx")]
+
 
 @pytest.mark.parallel(nprocs=6)
 @pytest.mark.parametrize("bc_opt", bc_opts)
 @pytest.mark.parametrize("extruded", extruded_mixed)
-def test_solve_para_form(bc_opt, extruded):
-    # checks that the all-at-once system is the same as solving
-    # timesteps sequentially using the NONLINEAR heat equation as an example by
-    # solving the all-at-once system and comparing with the sequential
+@pytest.mark.parametrize("cpx_type", cpx_types)
+def test_solve_para_form(bc_opt, extruded, cpx_type):
+    """
+    Checks that solving the all-at-once system is the same as solving
+    timesteps sequentially by solving the all-at-once system for a
+    nonlinear heat equation and comparing with the sequential solution.
+    """
 
     # set up the ensemble communicator for space-time parallelism
     nslices = fd.COMM_WORLD.size//2
@@ -514,7 +520,8 @@ def test_solve_para_form(bc_opt, extruded):
             'ksp_type': 'preonly',
             'pc_type': 'lu',
             'pc_factor_mat_solver_type': 'mumps'
-        }
+        },
+        'circulant_complex_proxy': cpx_type
     }
 
     def form_function(u, v, t):
@@ -587,7 +594,7 @@ def test_diagnostics():
 
     diag_sparameters = {
         'snes_converged_reason': None,
-        'ksp_converged_reason': None,
+        'ksp_converged_rate': None,
         'ksp_type': 'preonly',
         'pc_type': 'python',
         'pc_python_type': 'asQ.CirculantPC',
diff --git a/tests/utils/test_hybridisation.py b/tests/utils/test_hybridisation.py
index a9b88c2e..b8d1bfe5 100644
--- a/tests/utils/test_hybridisation.py
+++ b/tests/utils/test_hybridisation.py
@@ -5,8 +5,10 @@
 from utils import units
 from utils.hybridisation import HybridisedSCPC  # noqa: F401
 import numpy as np
+import pytest
 
 
+@pytest.mark.xfail(reason="HybridisationSCPC not yet fixed after assemble(zero_bc_nodes) change (firedrake PR#3947)")
 def test_real_hybridisation():
     theta = 0.5
     dt = units.hour
@@ -87,6 +89,7 @@ def form_function(u, h, v, q, t=None):
     assert converged_reason == 2, "rtol should be almost machine precision for direct solve"
 
 
+@pytest.mark.xfail(reason="HybridisationSCPC not yet fixed after assemble(zero_bc_nodes) change (firedrake PR#3947)")
 def test_complex_hybridisation():
     from scipy.fft import fft
     from asQ.complex_proxy import vector as cpx
diff --git a/utils/diagnostics.py b/utils/diagnostics.py
index 3d397d55..65bf2490 100644
--- a/utils/diagnostics.py
+++ b/utils/diagnostics.py
@@ -14,13 +14,15 @@ def convective_cfl_calculator(mesh,
     DG0 = fd.FunctionSpace(mesh, "DG", 0)
     v = fd.TestFunction(DG0)
 
-    cell_volume = fd.Function(DG0, name="Cell volume")
-    cell_flux = fd.Function(DG0, name="Cell surface flux")
+    cell_volume = fd.Cofunction(DG0.dual(), name="Cell volume")
+    cell_flux = fd.Cofunction(DG0.dual(), name="Cell surface flux")
     cfl = fd.Function(DG0, name=name)
 
     # mesh volume
     One = fd.Function(DG0, name="One").assign(1)
     fd.assemble(One*v*fd.dx, tensor=cell_volume)
+    cell_volume = cell_volume.riesz_representation(
+        'l2', solver_options={'function_space': DG0})
 
     # choose correct facet integral for mesh type
     if mesh.extruded:
@@ -43,9 +45,11 @@ def cfl_calc(u, dt):
             + un*v*ds
         )
         fd.assemble(cell_flux_form, tensor=cell_flux)
+        cflx = cell_flux.riesz_representation(
+            'l2', solver_options={'function_space': DG0})
 
         dT = fd.Constant(dt)
-        cfl.interpolate(dT*cell_flux/cell_volume)
+        cfl.interpolate(dT*cflx/cell_volume)
         return cfl
 
     return cfl_calc
diff --git a/utils/mg.py b/utils/mg.py
index 39ee0dc2..2a5daa06 100644
--- a/utils/mg.py
+++ b/utils/mg.py
@@ -42,7 +42,7 @@ def _register_mesh(self, mesh):
         if not hasattr(mesh, "transfer_coordinates"):
             msg = "ManifoldTransferManager requires mesh to have " \
                   + "`transfer_coordinates` stashed on each member" \
-                  + " of heirarchy."
+                  + " of hierarchy."
             raise AttributeError(msg)
         if not hasattr(mesh, "original_coordinates"):
             mesh.original_coordinates = fd.Function(mesh.coordinates)