diff --git a/core/include/engine/Method_MC.hpp b/core/include/engine/Method_MC.hpp index a3af89761..9c174d19e 100644 --- a/core/include/engine/Method_MC.hpp +++ b/core/include/engine/Method_MC.hpp @@ -28,8 +28,15 @@ namespace Engine // Solver_Iteration represents one iteration of a certain Solver void Iteration() override; + // Performs a trial move of a single spin + bool Metropolis_Spin_Trial(int ispin, const vectorfield & spins_old, vectorfield & spins_new, const scalar & rng1, const scalar & rng2, const scalar & rng3, const scalar & cos_cone_angle); + + void Block_Decomposition(); + // Metropolis iteration with adaptive cone radius void Metropolis(const vectorfield & spins_old, vectorfield & spins_new); + // Parallel MC + void Parallel_Metropolis(const vectorfield & spins_old, vectorfield & spins_new); // Save the current Step's Data: spins and energy void Save_Current(std::string starttime, int iteration, bool initial=false, bool final=false) override; @@ -59,6 +66,12 @@ namespace Engine // Random vector array vectorfield xi; + + // Values for block decompositon + std::array block_size_min; + std::array n_blocks; + std::array rest; + std::vector prng_vec = {}; }; } diff --git a/core/src/engine/Method_MC.cpp b/core/src/engine/Method_MC.cpp index 1d873b793..6c84bffaa 100644 --- a/core/src/engine/Method_MC.cpp +++ b/core/src/engine/Method_MC.cpp @@ -45,6 +45,7 @@ namespace Engine this->cone_angle = Constants::Pi * this->parameters_mc->metropolis_cone_angle / 180.0; this->n_rejected = 0; this->acceptance_ratio_current = this->parameters_mc->acceptance_ratio_target; + Block_Decomposition(); } // This implementation is mostly serial as parallelization is nontrivial @@ -60,10 +61,74 @@ namespace Engine // TODO: add switch between Metropolis and heat bath // One Metropolis step - Metropolis(spins_old, spins_new); + Parallel_Metropolis(spins_old, spins_new); + // Metropolis(spins_old, spins_new); Vectormath::set_c_a(1, spins_new, spins_old); } + bool 
Method_MC::Metropolis_Spin_Trial(int ispin, const vectorfield & spins_old, vectorfield & spins_new, const scalar & rng1, const scalar & rng2, const scalar & rng3, const scalar & cos_cone_angle) + { + Matrix3 local_basis; + const Vector3 e_z{0,0,1}; + const scalar kB_T = Constants::k_B * this->parameters_mc->temperature; + + // Calculate local basis for the spin + if( std::abs(spins_old[ispin].z()) < 1-1e-10 ) + { + local_basis.col(2) = spins_old[ispin]; + local_basis.col(0) = (local_basis.col(2).cross(e_z)).normalized(); + local_basis.col(1) = local_basis.col(2).cross(local_basis.col(0)); + } else + { + local_basis = Matrix3::Identity(); + } + + // Rotation angle between 0 and cone_angle degrees + scalar costheta = 1 - (1 - cos_cone_angle) * rng1; + + scalar sintheta = std::sqrt(1 - costheta*costheta); + + // Random distribution of phi between 0 and 360 degrees + scalar phi = 2*Constants::Pi * rng2; + + Vector3 local_spin_new{ sintheta * std::cos(phi), + sintheta * std::sin(phi), + costheta }; + + // New spin orientation in regular basis + spins_new[ispin] = local_basis * local_spin_new; + + // Energy difference of configurations with and without displacement + scalar Eold = this->systems[0]->hamiltonian->Energy_Single_Spin(ispin, spins_old); + scalar Enew = this->systems[0]->hamiltonian->Energy_Single_Spin(ispin, spins_new); + scalar Ediff = Enew-Eold; + + // Metropolis criterion: reject the step if energy rose + if( Ediff > 1e-14 ) + { + if( this->parameters_mc->temperature < 1e-12 ) + { + // Restore the spin + spins_new[ispin] = spins_old[ispin]; + return false; + } + else + { + // Exponential factor + scalar exp_ediff = std::exp( -Ediff/kB_T ); + // Only reject if random number is larger than exponential + if( exp_ediff < rng3 ) + { + // Restore the spin + spins_new[ispin] = spins_old[ispin]; + // Counter for the number of rejections + return false; + } + } + } + return true; + } + // Simple metropolis step void Method_MC::Metropolis(const vectorfield & 
spins_old, vectorfield & spins_new)
{
@@ -111,7 +176,7 @@ namespace Engine
            if( this->parameters_mc->metropolis_step_cone )
            {
                // Calculate local basis for the spin
-               if( spins_old[ispin].z() < 1-1e-10 )
+               if( std::abs(spins_old[ispin].z()) < 1-1e-10 )
                {
                    local_basis.col(2) = spins_old[ispin];
                    local_basis.col(0) = (local_basis.col(2).cross(e_z)).normalized();
@@ -191,6 +256,203 @@ namespace Engine
    }
}

void Method_MC::Block_Decomposition()
{
    /*
    Decompose the system into blocks of spins.
    Each block may only interact with its nearest-neighbour blocks.
    Therefore this decomposition only works for short-ranged interactions (all except DDI).
    It is further important that the number of blocks in each dimension is either 1 or an even number.
    If this is not fulfilled the block can interact with its nearest periodic neighbour; for open boundary conditions it does not matter.
    For example, a valid decomposition of a 9x4x1 system, with up to *next-nearest* neighbour interactions, could look like this:
        ┌───┬───┬───┬─────┐
        │1 1│2 2│1 1│2 2 2│
        │   │   │   │     │
        │1 1│2 2│1 1│2 2 2│
        ├───┼───┼───┼─────┤
        │3 3│4 4│3 3│4 4 4│
        │   │   │   │     │
        │3 3│4 4│3 3│4 4 4│
        └───┴───┴───┴─────┘
    The decomposition enables us to run the MC algorithm in four serial phases (1 to 4), while each phase is parallelized.
    All boxes belonging to one specific phase do not interact directly between each other, therefore we can perform their MC trial moves in parallel.
    The number of phases is 2^dim and the number of usable threads is the number of boxes per phase.
    In this example the usable number of threads would be 2.
    If the size of the boxes does not perfectly divide n_cells, the rest is added or subtracted at the edges.
    */

    // Initialize
    n_blocks       = {0,0,0};
    block_size_min = {1,1,1}; // Require at least one as block size
    rest           = {0,0,0};

    // Only the Heisenberg hamiltonian exposes the pairwise/quadruplet interactions
    // needed to determine the interaction range; otherwise keep the trivial defaults
    if (this->systems[0]->hamiltonian->Name() != "Heisenberg")
    {
        return;
    }

    Log.Send(Utility::Log_Level::Info, Utility::Log_Sender::MC, fmt::format("Performing block decomposition for parallel Metropolis algorithm"));
    const auto & geom = this->systems[0]->geometry;
    const auto * ham  = static_cast<const Hamiltonian_Heisenberg *>(this->systems[0]->hamiltonian.get());

    // The minimal block size per direction is the largest interaction reach
    // (in units of cell translations) over all interaction terms
    for(const auto & p : ham->dmi_pairs)
    {
        for(int i=0; i<3; i++)
            block_size_min[i] = std::max(block_size_min[i], std::abs(p.translations[i]));
    }
    for(const auto & p : ham->exchange_pairs)
    {
        for(int i=0; i<3; i++)
            block_size_min[i] = std::max(block_size_min[i], std::abs(p.translations[i]));
    }
    for(const auto & q : ham->quadruplets)
    {
        for(int i=0; i<3; i++)
        {
            block_size_min[i] = std::max(block_size_min[i], std::abs(q.d_j[i]));
            block_size_min[i] = std::max(block_size_min[i], std::abs(q.d_k[i]));
            block_size_min[i] = std::max(block_size_min[i], std::abs(q.d_l[i]));
        }
    }
    Log.Send(Utility::Log_Level::Info, Utility::Log_Sender::MC, fmt::format("    Block size from interactions = ({}, {}, {})", block_size_min[0], block_size_min[1], block_size_min[2]));

    // If the number of blocks is uneven for a certain direction, we increase the block size until it is even
    // (n_blocks may reach 0 here, which is even; that case is mapped to a single block below)
    for(int i=0; i<3; i++)
    {
        n_blocks[i] = geom->n_cells[i] / block_size_min[i];
        while( n_blocks[i] % 2 != 0 )
        {
            n_blocks[i] = geom->n_cells[i] / ++block_size_min[i];
        }
    }

    for(int i=0; i<3; i++)
    {
        if( block_size_min[i] <= geom->n_cells[i] )
        {
            rest[i] = geom->n_cells[i] % block_size_min[i];
        } else { // If the block size is larger than n_cells, we use n_blocks=1
            n_blocks[i]       = 1;
            block_size_min[i] = geom->n_cells[i];
        }
    }

    Log.Send(Utility::Log_Level::Info, Utility::Log_Sender::MC, fmt::format("    Actual block size = ({}, {}, {})", block_size_min[0], block_size_min[1], block_size_min[2]));
+ Log.Send(Utility::Log_Level::Info, Utility::Log_Sender::MC, fmt::format(" N_blocks = ({}, {}, {})", n_blocks[0], n_blocks[1], n_blocks[2])); + Log.Send(Utility::Log_Level::Info, Utility::Log_Sender::MC, fmt::format(" Rest = ({}, {}, {})", rest[0], rest[1], rest[2])); + int Nthreads = std::max(1,n_blocks[0]/2) * std::max(1,n_blocks[1]/2) * std::max(1,n_blocks[2]/2); + Log.Send(Utility::Log_Level::Info, Utility::Log_Sender::MC, fmt::format(" Supports up to max. {} threads.", Nthreads)); + } + + // Simple metropolis step + void Method_MC::Parallel_Metropolis(const vectorfield & spins_old, vectorfield & spins_new) + { + const auto & geom = this->systems[0]->geometry; + + scalar diff = 0.01; + // Cone angle feedback algorithm + if( this->parameters_mc->metropolis_step_cone && this->parameters_mc->metropolis_cone_adaptive ) + { + this->acceptance_ratio_current = 1 - (scalar)this->n_rejected / (scalar)this->nos_nonvacant; + + if( (this->acceptance_ratio_current < this->parameters_mc->acceptance_ratio_target) && (this->cone_angle > diff) ) + this->cone_angle -= diff; + + if( (this->acceptance_ratio_current > this->parameters_mc->acceptance_ratio_target) && (this->cone_angle < Constants::Pi-diff) ) + this->cone_angle += diff; + + this->parameters_mc->metropolis_cone_angle = this->cone_angle * 180.0 / Constants::Pi; + } + + this->n_rejected = 0; + int n_rejected = 0; + scalar cos_cone_angle = std::cos(this->cone_angle); + + auto distribution = std::uniform_real_distribution(0, 1); + + // Number of threads + int nt = 1; + #ifdef SPIRIT_USE_OPENMP + nt = omp_get_max_threads(); + #endif + + // We make nt copies of our number generator, so that each thread has its own + if(nt != prng_vec.size()) + { + prng_vec.resize(nt); + for(int i=0; iparameters_mc->rng_seed * (i+1)); + } + } + + // There are up to (2^dim) phases in the algorithm, which have to be executed serially + for(int phase_c = 0; phase_c<2; phase_c++) + { + for(int phase_b = 0; phase_b<2; phase_b++) + { + for(int 
phase_a = 0; phase_a<2; phase_a++) + { + // In each phase we iterate over blocks of spins, which are **next-nearest** neighbour blocks + // Thus there is no dependence of the spin moves and we can parallelize over the blocks in a phase + #pragma omp parallel for collapse(3) private(distribution) reduction(+:n_rejected) + for(int block_c = phase_c; block_c < n_blocks[2]; block_c+=2) + { + for(int block_b = phase_b; block_b < n_blocks[1]; block_b+=2) + { + for(int block_a = phase_a; block_a < n_blocks[0]; block_a+=2) + { + int block_size_c = (block_c == n_blocks[2] - 1) ? block_size_min[2] + rest[2] : block_size_min[2]; // Account for the remainder of division (n_cells[i] / block_size_min[i]) by increasing the block size at the edges + int block_size_b = (block_b == n_blocks[1] - 1) ? block_size_min[1] + rest[1] : block_size_min[1]; + int block_size_a = (block_a == n_blocks[0] - 1) ? block_size_min[0] + rest[0] : block_size_min[0]; + + // Iterate over the current block + for(int cc=0; cc < block_size_c; cc++) + { + for(int bb=0; bb < block_size_b; bb++) + { + for(int aa=0; aa < block_size_a; aa++) + { + for(int ibasis=0; ibasis < geom->n_cell_atoms; ibasis++) + { + int a = block_a * block_size_min[0] + aa; // We do not have to worry about the remainder of the division here, it is contained in the 'aa'/'bb'/'cc' offset + int b = block_b * block_size_min[1] + bb; + int c = block_c * block_size_min[2] + cc; + + // Compute the current spin idx + int ispin = ibasis + geom->n_cell_atoms * ( a + geom->n_cells[0] * (b + geom->n_cells[1] * c)); + + // Perform the Metropolis trial step + if( Vectormath::check_atom_type(this->systems[0]->geometry->atom_types[ispin]) ) + { + int tid = 0; + #ifdef SPIRIT_USE_OPENMP + tid = omp_get_thread_num(); + #endif + + scalar rng1 = distribution(prng_vec[tid]); + scalar rng2 = distribution(prng_vec[tid]); + scalar rng3 = distribution(prng_vec[tid]); + if(!Metropolis_Spin_Trial(ispin, spins_old, spins_new, rng1, rng2, rng3, cos_cone_angle)) + 
{ + n_rejected++; + } + } + } + } + } + } + } + } + } + } + } + } + this->n_rejected = n_rejected; + // std::cout << n_rejected << "\n"; + // Log.Send(Utility::Log_Level::Error, Utility::Log_Sender::MC, fmt::format( "n_rejected = {}",n_rejected )); + } + // TODO: // Implement heat bath algorithm, see Y. Miyatake et al, J Phys C: Solid State Phys 19, 2539 (1986) // void Method_MC::HeatBath(const vectorfield & spins_old, vectorfield & spins_new)